In [16]:
%matplotlib inline
# notebook
import matplotlib.pylab as pylab
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.cluster import KMeans
from collections import Counter
import numpy as np

#make the graphs bigger
pylab.rcParams['figure.figsize'] = (32.0, 24.0)
pylab.rcParams['font.size'] = 24

In [17]:
cohort = []
ETOH = []
hw = []
labOrders = []
medOrders = []
prescriptions = []
radiology = []
tobacco = []
vitals = []

In [18]:
def fileToList(fname):
    dataArray = []
    with open(fname) as f:
        for line in f:
            new = line.strip()
            new = new[1:len(new) - 1]
            new = new.split('","')
            if (len(new) > 1):
                dataArray.append(new)
    return dataArray

In [19]:
def fileToDict(fname):
    dataDict = dict()
    with open(fname) as f:
        for line in f:
            new = line.strip()
            new = new[1:len(new) - 1]
            new = new.split('","')
            if (len(new) > 1):
                if not(new[0] in dataDict):
                    dataDict[new[0]] = [new]
                else:
                    dataDict[new[0]].append(new)
    return dataDict

In [20]:
fname = "2018-4169_Cohort.txt"
cohort = fileToList(fname)

fname = "2018-4169_ETOH_Use.txt"
ETOH = fileToDict(fname)

fname = "2018-4169_Height_Weight.txt"
hw = fileToDict(fname)

fname = "2018-4169_Lab_Orders_Performed.txt"
labOrders = fileToDict(fname)


In [21]:
fname = "2018-4169_Med_Orders.txt"
medOrders = fileToDict(fname)

fname = "2018-4169_Prescriptions.txt"
prescriptions = fileToList(fname)

In [22]:
fname = "2018-4169_Tobacco_Use.txt"
tobacco = fileToDict(fname)

In [23]:
fname = "2018-4169_ETOH_Use.txt"
ETOHList = fileToList(fname)

In [24]:
tobList = fileToList(fname)

In [25]:
#organises data into a dict of dict of lists
#dict of years
    #dict of patients
        #entry of admissions
def organiseCohortDataIndex(myCohort):
    organised = dict()
    for admission in myCohort:
        year = admission[8][0:4]
        if year in organised:
            entry = organised[year]
            if admission[0] in entry:
                organised[year][admission[0]].append(admission[1:])
            else:
                organised[year][admission[0]] = [admission[1:]]
        else:
            patientData = dict()
            patientData[admission[0]] = [admission[1:]]
            organised[year] = patientData
    return organised

#gets non frequent flyers from the organised cohort dict above
def getNonFrequentFlyersBase(cohortDict, frequentFlyerDict):
    regular = dict()
    for year in cohortDict:
        regular[year] = dict()
        patList = cohortDict[year]
        for pKey in patList:
            if not(pKey in frequentFlyerDict[year]):
                regular[year][pKey] = cohortDict[year][pKey]
    return regular

#gets frequent flyers from the organised cohort dict above (defined by greater than or equal to a specified visit num)
def getFrequentFlyersExtended(dataDict, year=None, visits=10):
    if (year == None):
        freqDict = dict()
        for key in dataDict:
            freqDict[key] = []
            patientDict = dataDict[key]
            for pKey in patientDict:
                if len(patientDict[pKey]) >= visits:
                    freqDict[key].append(pKey)
        return freqDict
    else:
        freqDict= dict()
        if (year in dataDict):
            patientDict = dataDict[year]
            for pKey in patientDict:
                if len(patientDict[pKey]) >= visits:
                    freqDict[key].append(pKey)
            return freqDict
        else:
            return dict()

In [26]:
import collections
from datetime import datetime

#returns a dict of lists
    #dict of patients
        #list of admissions
def organiseCohortData(myCohort):
    organised = dict()
    for admission in myCohort:
        if admission[0] in organised:
            organised[admission[0]].append(admission[1:])
        else:
            organised[admission[0]] = [admission[1:]]
            
    
    for p in organised.keys():
        organised[p] = sorted(organised[p], key = lambda x: datetime.strptime(x[8][0:10], '%Y-%m-%d'))
    return organised
    

In [27]:
organised = organiseCohortData(cohort[1:])

In [28]:
from datetime import datetime, timedelta

#method for finding frequent flyers where you specify a gap/time frame in days(e.g. 12 months, one month, one week etc)
#and a number of visits, and this method will get frequent flyers for that definition
#uses the organisation of the cohort data specified by the organiseCohortData method (aka dict of lists)
def abstractFreqFlyers(organised, gap, visits):
    frequentFlyers = []
    for p in organised.keys():
        dates = [datetime.strptime(item[8][0:10], '%Y-%m-%d') for item in organised[p]]
        try:
            for i in range(len(dates)):
                modified_date = dates[i] + timedelta(days=gap)
                if dates[i+visits-1] <= modified_date:
                    frequentFlyers.append(p)
                    break
        except:
            pass
    return frequentFlyers

In [29]:
frequents = abstractFreqFlyers(organised, 365,20) #as a test, this calculates frequent flyers that, in the space of a year, have made 10 visits

In [30]:
print(len(frequents))

126


In [31]:
print(frequents[0:5])

['0.54629537974305198', '0.18454035455437792', '0.49137990165585754', '0.31499293513474674', '7.0830591556572897E-2']


In [32]:
#def difference(l1,l2):
    #precisionError = len([b for a,b in zip(l1,l2) if b != a and b == 0 and a == 1])
    #recallError = len([b for a,b in zip(l1,l2) if b != a and b == 1 and a == 0])
    #print("Precision Error is " + str(precisionError))
    #print("Recall Error is " + str(recallError))
    #return (precisionError) + (recallError)
    
def difference(l1,l2):
    return len([b for a,b in zip(l1,l2) if b != a])  

In [70]:
#given a set of data training points (Xtr) and classes (patientBase), run a number of models 
#use a K-fold algorithm to get error across 4 runs overall
def runModels(Xtr, patientBase):
    kf = KFold(n_splits=5, shuffle=True)
    Ytr = patientBase
    differencesLR = []
    differencesSVM = []
    differencesKM = []

    for train_index, test_index in kf.split(Xtr):
        X_train, X_test = [Xtr[i] for i in train_index], [Xtr[i] for i in test_index]
        y_train, y_test = [Ytr[i] for i in train_index], [Ytr[i] for i in test_index]
    
        if (1 in y_train and 0 in y_train):
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            PredLR = lr.predict(X_test)
            differencesLR.append(difference(PredLR, y_test)/len(y_test))
    
            km = KMeans(n_clusters=2)
            km.fit(X_train, y_train)
            PredKM = km.predict(X_test)
            differencesKM.append(difference(PredKM, y_test)/len(y_test))

            clf = svm.SVC()
            clf.fit(X_train, y_train)
            PredSVM = clf.predict(X_test)
            differencesSVM.append(difference(PredSVM, y_test)/len(y_test))
    print((differencesLR))
    print((differencesSVM))
    print((differencesKM))

In [34]:
#given a sentence, is the patient a serious smoker
def isBadSmoking(res):
    if (res == "current every day smoker" or res == "current some day smoker" or res == "current every day" or res=="heavy tobacco smoker"):
        return 1
    else:
        return 0
    
#given a sentence, is the patient a big drinker
def isBad(cat,res):
    if (cat == "Number of Standard Drinks on a Typical Day"):
        if (res == "10 or more" or res == "7 to 9"):
            return 1
        else:
            return 0
    elif (cat == "Alcohol Treatment Offered"):
        if (res == "yes"):
            return 1
        else:
            return 0
    elif (cat == "Excessive Alcohol and Drug Use"):
        if (res == "yes"):
            return 1
        else:
            return 0
    elif (cat == "10.Has a Relative, Friend, Doctor, or Other Health Professional Expressed Concern About Your Drinking or Suggested You Cut Down"):
        if (res == "(2) Yes, during the last year"):
            return 1
        else:
            return 0
    
    return 0

#given a list of frequent flyer ids, return 3 lists: 
#first list = list of 1s the size of frequents (= list of classes for our model)
#list of 1s and 0s corresponding to whether the patient drinks a lot of alcohol (ETOH results)
#list of 1s and 0s corresponding to whether the patient smokes a lot (tobacco results)

def frequentsArray(frequents, value):
    smokesHeavyFreqs = []
    ETOHHeavyFreqs = []
    patientBase = []
    
    for p in frequents:
        patientBase.append(value)
        if p in ETOH:
            entry = ETOH[p]
            drinkFreq = isBad(entry[0][4], entry[0][5])
            ETOHHeavyFreqs.append(drinkFreq)
        else:
            ETOHHeavyFreqs.append((value-1)*-1)
        if p in tobacco:
            entry = tobacco[p]
            smokingFreq = isBadSmoking(entry[0][2])
            smokesHeavyFreqs.append(smokingFreq)
        else:
            smokesHeavyFreqs.append(0)
                
    return (patientBase, ETOHHeavyFreqs, smokesHeavyFreqs)

In [35]:
#given a list of ids, returns a list where it is 1 if patient is diabetic, and 0 if not
def getDiabetes(deIds):
    diabetes = []
    for patient in deIds:
        if patient in organised:
            entry = organised[patient][0]
            if (entry[-1] == "Diabetes"):
                diabetes.append(1)
            else:
                diabetes.append(0)
    return diabetes

In [36]:
#calculates Xtr and Ytr and runs models 
def runCode(frequents, regs):
    patBase1, ETOHHeavyFreqs, smokesHeavyFreqs = frequentsArray(frequents, 1)
    patBase2, ETOHHeavyRegs, smokesHeavyRegs = frequentsArray(regs, 0)

    patientBase = patBase1 + patBase2
    totalSmokes = smokesHeavyFreqs + smokesHeavyRegs
    ETOHTotal = ETOHHeavyFreqs + ETOHHeavyRegs
    DiabetesTotal = getDiabetes(frequents) + getDiabetes(regs)
    PrescriptionsTotal = prescIndicators(frequents) + prescIndicators(regs)

    Xtr = list(map(list,zip(totalSmokes, ETOHTotal)))
    print("Models on ETOH and Tobacco")
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco and Diabetes")
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal, DiabetesTotal)))
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco, Diabetes and Prescriptions")
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal, DiabetesTotal, PrescriptionsTotal)))
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on Prescriptions")
    Xtr = list(map(list,zip(PrescriptionsTotal)))
    runModels(Xtr, patientBase)

In [37]:
#same as above but this time, the frequents and regs parameters are dicts (extra dimension of year), not lists
def runCodeDicts(frequents, regs):
    patBase1, ETOHHeavyFreqs, smokesHeavyFreqs = frequentsArrayOrg(frequentsBase)
    patBase2, ETOHHeavyRegs, smokesHeavyRegs = regsArrayOrg(regularsBase)

    patientBase = patBase1 + patBase2
    totalSmokes = smokesHeavyFreqs + smokesHeavyRegs
    ETOHTotal = ETOHHeavyFreqs + ETOHHeavyRegs
    
    listFreqs = []
    for f in frequents.keys():
        listFreqs = listFreqs + frequents[f]

    listFreqs = list(set(listFreqs))
    DiabetesTotal = getDiabetes(listFreqs) + getDiabetes(regs)
    PrescriptionsTotal = prescIndicators(listFreqs) + prescIndicators(regs)

    Xtr = list(map(list,zip(totalSmokes, ETOHTotal)))
    print("Models on ETOH and Tobacco")
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco and Diabetes")
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal, DiabetesTotal)))
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco, Diabetes and Prescriptions")
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal, DiabetesTotal, PrescriptionsTotal)))
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco, Diabetes and Prescriptions")
    Xtr = list(map(list,zip(PrescriptionsTotal)))
    runModels(Xtr, patientBase)

In [38]:
from collections import Counter

names = [item[3] for item in prescriptions]
organisedPrescriptions = fileToDict("2018-4169_Prescriptions.txt")
ordered = Counter(names[1:])
orderedPrescList = list(ordered.keys())

In [64]:
def retrieveRegulars(org, frequents):
    regularsBase = []
    newOrder = Counter(frequents)
    for k in org.keys():
        if not(k in newOrder):
            regularsBase.append(k)
    
    return list(np.random.choice(regularsBase, len(frequents)))

In [47]:
def runPrescriptions(frequents, regs, top=orderedPrescList[0:20]):
    
    medList = []
    for med in top:
        counter = 0
        for f in frequents:
            if (f in organisedPrescriptions):
                if (med in [item[3] for item in organisedPrescriptions[f]]):
                    counter = counter + 1
        medList.append(counter)
    
    medList2 = []
    for med in top:
        counter = 0
        for f in regs:
            if (f in organisedPrescriptions):
                if (med in [item[3] for item in organisedPrescriptions[f]]):
                    counter = counter + 1
        medList2.append(counter)
    
    x = ([(item / len(frequents)) for item in medList])
    y = ([(item / len(regs)) for item in medList2])

    x1 = np.array(x)
    y1 = np.array(y)

    return(x1 > y1, x, y)

In [41]:
indicatorIndices = [2, 8, 16, 19]
indicators = [orderedPrescList[k] for k in indicatorIndices]
print(indicators)

['citalopram', 'chlordiazePOXIDE', 'QUEtiapine', 'mirtazapine']


In [42]:
def prescIndicators(deIds):
    prescs = []
    for patient in deIds:
        if patient in organisedPrescriptions:
            entry = organisedPrescriptions[patient]
            data = [item[3] for item in entry]
            if (not set(data).isdisjoint(indicators)):
                prescs.append(1)
            else:
                prescs.append(0)
    return prescs

# N or more during indexed year

Follwing code blocks deal with the case where the definion of a frequent flyer is N visits per index year (2018, 1017, 2016 etc)

In [43]:
#specialised routines that get the ETOH,tobacco, and patientBase lists for dict base data 
#(aka is same as frequentsAray above, but works on dicts)
def frequentsArrayOrg(frequents):
    patListFreqs = []
    smokesHeavyFreqs = []
    ETOHHeavyFreqs = []
    patientBase = []
    
    for k in frequents.keys():
        patListFreqs = patListFreqs + frequents[k]
        for p in patListFreqs:
            patientBase.append(1)
            if p in ETOH:
                entry = ETOH[p]
                drinkFreq = isBad(entry[0][4], entry[0][5])
                ETOHHeavyFreqs.append(drinkFreq)
            else:
                ETOHHeavyFreqs.append(0)
            
            if p in tobacco:
                entry = tobacco[p]
                smokingFreq = isBadSmoking(entry[0][2])
                smokesHeavyFreqs.append(smokingFreq)
            else:
                smokesHeavyFreqs.append(0)
    return (patientBase, ETOHHeavyFreqs, smokesHeavyFreqs)

def regsArrayOrg(regulars):
    patListRegs = []
    smokesHeavyRegs = []
    ETOHHeavyRegs = []
    patientBase = []

    for k in regulars.keys():
        patListRegs = patListRegs + list(regulars[k].keys())

    for p in patListRegs:
        patientBase.append(0)

        if p in ETOH:
            entry = ETOH[p]
            drinkFreq = isBad(entry[0][4], entry[0][5])
            ETOHHeavyRegs.append(drinkFreq)
        else:
            ETOHHeavyRegs.append(0)
        
        if p in tobacco:
            entry = tobacco[p]
            smokingFreq = isBadSmoking(entry[0][2])
            smokesHeavyRegs.append(smokingFreq)
        else:
            smokesHeavyRegs.append(0)
    
    return (patientBase, ETOHHeavyRegs, smokesHeavyRegs)

In [44]:
organisedIndex = organiseCohortDataIndex(cohort[1:])

In [45]:
frequentsBase = getFrequentFlyersExtended(organisedIndex, visits=4)
regularsBase = getNonFrequentFlyersBase(organisedIndex, frequentsBase)
runCodeDicts(frequentsBase, regularsBase)

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.1998753807809471, 0.1637626641434539, 0.19459509358196211, 0.15889317117075535]
----------------------
Models on ETOH, Tobacco and Diabetes
[]
[]
[]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[]
[]
[]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[]
[]
[]


In [48]:
listFreqs = []
for f in frequentsBase.keys():
    listFreqs = listFreqs + frequentsBase[f]
print(runPrescriptions(frequentsBase, regularsBase))
print(runPrescriptions(frequentsBase, regularsBase, indicators))

(array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False], dtype=bool), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
(array([False, False, False, False], dtype=bool), [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0])


# N visits 2 years after initial visit

Deals with the definiton of frequent flyers that defines a frequent flyer as returning N times within a space of 2 years after a given visit

In [49]:
organised = organiseCohortData(cohort[1:])

In [50]:
freqs = abstractFreqFlyers(organised, 730, 10)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.021015761821366025, 0.008756567425569177, 0.0456140350877193, 0.08771929824561403, 0.09298245614035087]
[0.021015761821366025, 0.008756567425569177, 0.0456140350877193, 0.08771929824561403, 0.09298245614035087]
[0.978984238178634, 0.9912434325744308, 0.9543859649122807, 0.08771929824561403, 0.9070175438596492]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.021015761821366025, 0.008756567425569177, 0.0456140350877193, 0.08771929824561403, 0.09298245614035087]
[0.021015761821366025, 0.008756567425569177, 0.0456140350877193, 0.08771929824561403, 0.09298245614035087]
[0.021015761821366025, 0.008756567425569177, 0.9543859649122807, 0.9122807017543859, 0.9070175438596492]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.02586206896551724, 0.0021598272138228943, 0.017278617710583154, 0.06479481641468683, 0.09719222462203024]
[0.02586206896551724, 0.0021598272138228943, 0.017278617710583154, 0.06263498920086392, 0.09719

# N visits in 12 months

Deals with the definition that specifies a frequent flyer as visiting N times in the space of 12 months

In [51]:
organised = organiseCohortData(cohort[1:])

In [52]:
# N=3
freqs = abstractFreqFlyers(organised, 365, 3)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.010130718954248366, 0.00784441902271613, 0.045595685569537504, 0.08089557117176009, 0.08972054257231574]
[0.010130718954248366, 0.00784441902271613, 0.045595685569537504, 0.08089557117176009, 0.08972054257231574]
[0.010130718954248366, 0.9921555809772838, 0.9544043144304625, 0.08089557117176009, 0.9102794574276842]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.010130718954248366, 0.00784441902271613, 0.045595685569537504, 0.08089557117176009, 0.08972054257231574]
[0.010130718954248366, 0.00784441902271613, 0.045595685569537504, 0.08089557117176009, 0.08972054257231574]
[0.010130718954248366, 0.9921555809772838, 0.045595685569537504, 0.91910442882824, 0.9102794574276842]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.01188707280832095, 0.006155805561451921, 0.007853958819783485, 0.06368074718743366, 0.08131634819532908]
[0.01188707280832095, 0.006155805561451921, 0.007853958819783485, 0.06368074718743366, 0.08

In [53]:
# N=4
freqs = abstractFreqFlyers(organised, 365, 4)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.011211460604173155, 0.00747430706944877, 0.049205854873871066, 0.09280597944565556, 0.0838006230529595]
[0.011211460604173155, 0.00747430706944877, 0.049205854873871066, 0.09280597944565556, 0.0838006230529595]
[0.9887885393958269, 0.9925256929305513, 0.9507941451261289, 0.09280597944565556, 0.9161993769470405]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.011211460604173155, 0.00747430706944877, 0.049205854873871066, 0.09280597944565556, 0.0838006230529595]
[0.011211460604173155, 0.00747430706944877, 0.049205854873871066, 0.09280597944565556, 0.0838006230529595]
[0.9887885393958269, 0.9925256929305513, 0.9507941451261289, 0.9071940205543444, 0.9161993769470405]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.012519561815336464, 0.006259780907668232, 0.008215962441314555, 0.08101761252446184, 0.09158512720156556]
[0.012519561815336464, 0.006259780907668232, 0.008215962441314555, 0.08101761252446184, 0.09158512

In [54]:
# N=5
freqs = abstractFreqFlyers(organised, 365, 5)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.01481103166496425, 0.008175779253960144, 0.045988758303525806, 0.10730710270822688, 0.0914665304036791]
[0.01481103166496425, 0.008175779253960144, 0.045988758303525806, 0.10730710270822688, 0.0914665304036791]
[0.9851889683350358, 0.9918242207460398, 0.9540112416964742, 0.8926928972917731, 0.0914665304036791]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.01481103166496425, 0.008175779253960144, 0.045988758303525806, 0.10730710270822688, 0.0914665304036791]
[0.01481103166496425, 0.008175779253960144, 0.045988758303525806, 0.10730710270822688, 0.0914665304036791]
[0.9851889683350358, 0.9918242207460398, 0.9540112416964742, 0.10730710270822688, 0.9085334695963209]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.016476552598225603, 0.0076045627376425855, 0.008871989860583017, 0.08365019011406843, 0.10519645120405577]
[0.016476552598225603, 0.0076045627376425855, 0.008871989860583017, 0.08365019011406843, 0.105196

In [55]:
# N=6
freqs = abstractFreqFlyers(organised, 365, 6)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.0204241948153967, 0.007069913589945012, 0.0589622641509434, 0.08883647798742138, 0.09276729559748427]
[0.0204241948153967, 0.007069913589945012, 0.0589622641509434, 0.08883647798742138, 0.09276729559748427]
[0.0204241948153967, 0.992930086410055, 0.9410377358490566, 0.9111635220125787, 0.9072327044025157]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.0204241948153967, 0.007069913589945012, 0.0589622641509434, 0.08883647798742138, 0.09276729559748427]
[0.0204241948153967, 0.007069913589945012, 0.0589622641509434, 0.08883647798742138, 0.09276729559748427]
[0.9795758051846033, 0.992930086410055, 0.9410377358490566, 0.9111635220125787, 0.09276729559748427]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.023391812865497075, 0.004873294346978557, 0.009746588693957114, 0.0897560975609756, 0.09170731707317073]
[0.023391812865497075, 0.004873294346978557, 0.009746588693957114, 0.0897560975609756, 0.09170731707317073]
[

# N visits every year consitently

Deals with the definition of frequent flyer that specifies a frequent flyer to be someone who consistently comes in more than N times every year

In [56]:
organisedIndex = organiseCohortDataIndex(cohort[1:])
standard = organiseCohortData(cohort[1:])

In [57]:
def getChronicFrequentFlyers(frequentFlyers):
    exists = True
    shortest = 300000
    lowestYear = None
    for key in frequentFlyers:
        if len(frequentFlyers[key]) < shortest:
            lowestYear = key
    
    patientDict = frequentFlyers[lowestYear]
    chronic = []
    for pKey in patientDict:
        inAll = True
        for year in frequentFlyers.keys():
            inAll = inAll and (pKey in frequentFlyers[year])
        if (inAll):
            chronic.append(pKey)
                
    return chronic

In [65]:
frequentsBase = getChronicFrequentFlyers(getFrequentFlyersExtended(organisedIndex, visits=3))
regularsBase = retrieveRegulars(standard, frequentsBase)

runCode(frequentsBase, regularsBase)
print(runPrescriptions(frequentsBase, regularsBase))
print(runPrescriptions(frequentsBase, regularsBase, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 1.0, 1.0]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.5, 0.6666666666666666, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 1.0, 1.0, 1.0]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.0, 0.0, 0.0, 0.3333333333333333, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 1.0, 0.0, 0.0]
----------------------
Models on Prescriptions
[0.6666666666666666, 1.0, 0.3333333333333333, 1.0, 1.0]
[0.6666666666666666, 1.0, 0.3333333333333333, 1.0, 1.0]
[0.6666666666666666, 1.0, 0.3333333333333333, 0.0, 0.0]
(array([ True,  True,  True,  True,  True, False,  True, False, False,
        True, False,  True, False,  True, False, False,  True,  True,
       False,  True], dtype=bool), [0.875, 0.125, 0.125, 0.75, 0.125, 0.125, 0.25, 0.0, 0.0, 0.25, 0.0, 0.25, 0.0, 0.5, 0.0, 0.0, 0.125, 0.375, 0.125, 0.125], [0.375, 0.0, 0.0, 0.0, 0.0, 0.125, 0.125, 0.0, 0.0, 0.0, 0.

In [66]:
frequentsBase = getChronicFrequentFlyers(getFrequentFlyersExtended(organisedIndex, visits=4))
regularsBase = retrieveRegulars(standard, frequentsBase)

runCode(frequentsBase, regularsBase)
print(runPrescriptions(frequentsBase, regularsBase))
print(runPrescriptions(frequentsBase, regularsBase, indicators))

Models on ETOH and Tobacco
[1.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 1.0, 0.0, 0.0, 0.0]
----------------------
Models on ETOH, Tobacco and Diabetes
[1.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 1.0, 0.0]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.5, 0.0, 0.0, 0.0, 0.0]
[0.5, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
----------------------
Models on Prescriptions
[0.5, 1.0, 0.0, 1.0, 1.0]
[0.5, 1.0, 0.0, 1.0, 1.0]
[0.5, 1.0, 0.0, 1.0, 1.0]
(array([ True,  True,  True,  True,  True, False, False, False, False,
       False, False,  True, False,  True, False, False,  True,  True,
        True,  True], dtype=bool), [0.8, 0.2, 0.2, 0.8, 0.2, 0.0, 0.4, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.4, 0.0, 0.0, 0.2, 0.2, 0.2, 0.2], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
(array([ True, False,  True,  True], dtype=bool), [0.2, 0.0, 0.2, 0.2], [0.0, 0.0, 0.0, 0.0])


In [68]:
frequentsBase = getChronicFrequentFlyers(getFrequentFlyersExtended(organisedIndex, visits=5))
frequentsBase = frequentsBase+frequentsBase

regularsBase = retrieveRegulars(standard, frequentsBase)

runCode(frequentsBase, regularsBase)
print(runPrescriptions(frequentsBase, regularsBase))
print(runPrescriptions(frequentsBase, regularsBase, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 1.0, 0.0]
[1.0, 1.0, 0.0, 0.0, 0.0]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 1.0, 0.0, 1.0]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 1.0, 1.0]
[1.0, 1.0, 0.0, 0.0, 1.0]
----------------------
Models on Prescriptions
[1.0, 0.0, 0.0, 1.0, 1.0]
[0.0, 0.0, 0.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 0.0, 0.0]
(array([ True, False, False,  True,  True, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
        True, False], dtype=bool), [1.0, 0.0, 0.0, 1.0, 0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0], [0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
(array([False, False, False, False], dtype=bool), [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0])


# Returned within 28 days ever

Defines a frequent flyer to be anyone who has returned 28 days within their discharge period

In [69]:
organised = organiseCohortData(cohort[1:])

freqs28days = abstractFreqFlyers(organised, 28, 2)
regs28days = retrieveRegulars(organised, freqs28days)

runCode(freqs28days, regs28days)
print(runPrescriptions(freqs28days, regs28days))
print(runPrescriptions(freqs28days, regs28days, indicators))

Models on ETOH and Tobacco
[0.008195703464547373, 0.006581398236681982, 0.04272230501738698, 0.0853204172876304, 0.08817685047193244]
[0.008195703464547373, 0.006581398236681982, 0.04272230501738698, 0.0853204172876304, 0.08817685047193244]
[0.9918042965354527, 0.006581398236681982, 0.9572776949826131, 0.9146795827123696, 0.08817685047193244]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.008195703464547373, 0.006581398236681982, 0.04272230501738698, 0.0853204172876304, 0.08817685047193244]
[0.008195703464547373, 0.006581398236681982, 0.04272230501738698, 0.0853204172876304, 0.08817685047193244]
[0.9918042965354527, 0.993418601763318, 0.9572776949826131, 0.9146795827123696, 0.9118231495280675]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.009430784776018861, 0.005053057099545225, 0.0067374094660602995, 0.05086744146875526, 0.08741788782213239]
[0.009430784776018861, 0.005053057099545225, 0.0067374094660602995, 0.05086744146875526, 0.08741