In [1]:
%matplotlib inline
# notebook
import matplotlib.pylab as pylab
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.cluster import KMeans
from collections import Counter
import numpy as np

#make the graphs bigger
pylab.rcParams['figure.figsize'] = (32.0, 24.0)
pylab.rcParams['font.size'] = 24

In [2]:
cohort = []
ETOH = []
hw = []
labOrders = []
medOrders = []
prescriptions = []
radiology = []
tobacco = []
vitals = []

In [3]:
def fileToList(fname):
    dataArray = []
    with open(fname) as f:
        for line in f:
            new = line.strip()
            new = new[1:len(new) - 1]
            new = new.split('","')
            if (len(new) > 1):
                dataArray.append(new)
    return dataArray

In [4]:
def fileToDict(fname):
    dataDict = dict()
    with open(fname) as f:
        for line in f:
            new = line.strip()
            new = new[1:len(new) - 1]
            new = new.split('","')
            if (len(new) > 1):
                if not(new[0] in dataDict):
                    dataDict[new[0]] = [new]
                else:
                    dataDict[new[0]].append(new)
    return dataDict

In [5]:
fname = "2018-4169_Cohort.txt"
cohort = fileToList(fname)

fname = "2018-4169_ETOH_Use.txt"
ETOH = fileToDict(fname)

fname = "2018-4169_Height_Weight.txt"
hw = fileToDict(fname)

fname = "2018-4169_Lab_Orders_Performed.txt"
labOrders = fileToDict(fname)


In [6]:
fname = "2018-4169_Med_Orders.txt"
medOrders = fileToDict(fname)

fname = "2018-4169_Prescriptions.txt"
prescriptions = fileToList(fname)

In [7]:
fname = "2018-4169_Tobacco_Use.txt"
tobacco = fileToDict(fname)

In [8]:
fname = "2018-4169_ETOH_Use.txt"
ETOHList = fileToList(fname)

In [9]:
tobList = fileToList(fname)

In [10]:
#organises data into a dict of dict of lists
#dict of years
    #dict of patients
        #entry of admissions
def organiseCohortDataIndex(myCohort):
    organised = dict()
    for admission in myCohort:
        year = admission[8][0:4]
        if year in organised:
            entry = organised[year]
            if admission[0] in entry:
                organised[year][admission[0]].append(admission[1:])
            else:
                organised[year][admission[0]] = [admission[1:]]
        else:
            patientData = dict()
            patientData[admission[0]] = [admission[1:]]
            organised[year] = patientData
    return organised

#gets non frequent flyers from the organised cohort dict above
def getNonFrequentFlyersBase(cohortDict, frequentFlyerDict):
    regular = dict()
    for year in cohortDict:
        regular[year] = dict()
        patList = cohortDict[year]
        for pKey in patList:
            if not(pKey in frequentFlyerDict[year]):
                regular[year][pKey] = cohortDict[year][pKey]
    return regular

#gets frequent flyers from the organised cohort dict above (defined by greater than or equal to a specified visit num)
def getFrequentFlyersExtended(dataDict, year=None, visits=10):
    if (year == None):
        freqDict = dict()
        for key in dataDict:
            freqDict[key] = []
            patientDict = dataDict[key]
            for pKey in patientDict:
                if len(patientDict[pKey]) >= visits:
                    freqDict[key].append(pKey)
        return freqDict
    else:
        freqDict= dict()
        if (year in dataDict):
            patientDict = dataDict[year]
            for pKey in patientDict:
                if len(patientDict[pKey]) >= visits:
                    freqDict[key].append(pKey)
            return freqDict
        else:
            return dict()

In [11]:
import collections
from datetime import datetime

#returns a dict of lists
    #dict of patients
        #list of admissions
def organiseCohortData(myCohort):
    organised = dict()
    for admission in myCohort:
        if admission[0] in organised:
            organised[admission[0]].append(admission[1:])
        else:
            organised[admission[0]] = [admission[1:]]
            
    
    for p in organised.keys():
        organised[p] = sorted(organised[p], key = lambda x: datetime.strptime(x[8][0:10], '%Y-%m-%d'))
    return organised
    

In [12]:
organised = organiseCohortData(cohort[1:])

In [13]:
from datetime import datetime, timedelta

#method for finding frequent flyers where you specify a gap/time frame in days(e.g. 12 months, one month, one week etc)
#and a number of visits, and this method will get frequent flyers for that definition
#uses the organisation of the cohort data specified by the organiseCohortData method (aka dict of lists)
def abstractFreqFlyers(organised, gap, visits):
    frequentFlyers = []
    for p in organised.keys():
        dates = [datetime.strptime(item[8][0:10], '%Y-%m-%d') for item in organised[p]]
        try:
            for i in range(len(dates)):
                modified_date = dates[i] + timedelta(days=gap)
                if dates[i+visits-1] <= modified_date:
                    frequentFlyers.append(p)
                    break
        except:
            pass
    return frequentFlyers

In [14]:
frequents = abstractFreqFlyers(organised, 365,20) #as a test, this calculates frequent flyers that, in the space of a year, have made 10 visits

In [15]:
print(len(frequents))

126


In [16]:
print(frequents[0:5])

['0.54629537974305198', '0.18454035455437792', '0.49137990165585754', '0.31499293513474674', '7.0830591556572897E-2']


In [17]:
#def difference(l1,l2):
    #precisionError = len([b for a,b in zip(l1,l2) if b != a and b == 0 and a == 1])
    #recallError = len([b for a,b in zip(l1,l2) if b != a and b == 1 and a == 0])
    #print("Precision Error is " + str(precisionError))
    #print("Recall Error is " + str(recallError))
    #return (precisionError) + (recallError)
    
def difference(l1,l2):
    return len([b for a,b in zip(l1,l2) if b != a])  

In [18]:
#given a set of data training points (Xtr) and classes (patientBase), run a number of models 
#use a K-fold algorithm to get error across 4 runs overall
def runModels(Xtr, patientBase):
    kf = KFold(n_splits=5)
    Ytr = patientBase
    differencesLR = []
    differencesSVM = []
    differencesKM = []

    for train_index, test_index in kf.split(Xtr):
        X_train, X_test = [Xtr[i] for i in train_index], [Xtr[i] for i in test_index]
        y_train, y_test = [Ytr[i] for i in train_index], [Ytr[i] for i in test_index]
    
        if (1 in y_train and 0 in y_train):
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            PredLR = lr.predict(X_test)
            differencesLR.append(difference(PredLR, y_test)/len(y_test))
    
            km = KMeans(n_clusters=2)
            km.fit(X_train, y_train)
            PredKM = km.predict(X_test)
            differencesKM.append(difference(PredKM, y_test)/len(y_test))

            clf = svm.SVC()
            clf.fit(X_train, y_train)
            PredSVM = clf.predict(X_test)
            differencesSVM.append(difference(PredSVM, y_test)/len(y_test))
    print((differencesLR))
    print((differencesSVM))
    print((differencesKM))

In [19]:
#given a sentence, is the patient a serious smoker
def isBadSmoking(res):
    if (res == "current every day smoker" or res == "current some day smoker" or res == "current every day" or res=="heavy tobacco smoker"):
        return 1
    else:
        return 0
    
#given a sentence, is the patient a big drinker
def isBad(cat,res):
    if (cat == "Number of Standard Drinks on a Typical Day"):
        if (res == "10 or more" or res == "7 to 9"):
            return 1
        else:
            return 0
    elif (cat == "Alcohol Treatment Offered"):
        if (res == "yes"):
            return 1
        else:
            return 0
    elif (cat == "Excessive Alcohol and Drug Use"):
        if (res == "yes"):
            return 1
        else:
            return 0
    elif (cat == "10.Has a Relative, Friend, Doctor, or Other Health Professional Expressed Concern About Your Drinking or Suggested You Cut Down"):
        if (res == "(2) Yes, during the last year"):
            return 1
        else:
            return 0
    
    return 0

#given a list of frequent flyer ids, return 3 lists: 
#first list = list of 1s the size of frequents (= list of classes for our model)
#list of 1s and 0s corresponding to whether the patient drinks a lot of alcohol (ETOH results)
#list of 1s and 0s corresponding to whether the patient smokes a lot (tobacco results)

def frequentsArray(frequents, value):
    smokesHeavyFreqs = []
    ETOHHeavyFreqs = []
    patientBase = []
    
    for p in frequents:
        patientBase.append(value)
        if p in ETOH:
            entry = ETOH[p]
            drinkFreq = isBad(entry[0][4], entry[0][5])
            ETOHHeavyFreqs.append(drinkFreq)
        else:
            ETOHHeavyFreqs.append((value-1)*-1)
        if p in tobacco:
            entry = tobacco[p]
            smokingFreq = isBadSmoking(entry[0][2])
            smokesHeavyFreqs.append(smokingFreq)
        else:
            smokesHeavyFreqs.append(0)
                
    return (patientBase, ETOHHeavyFreqs, smokesHeavyFreqs)

In [20]:
#given a list of ids, returns a list where it is 1 if patient is diabetic, and 0 if not
def getDiabetes(deIds):
    diabetes = []
    for patient in deIds:
        if patient in organised:
            entry = organised[patient][0]
            if (entry[-1] == "Diabetes"):
                diabetes.append(1)
            else:
                diabetes.append(0)
    return diabetes

In [21]:
#calculates Xtr and Ytr and runs models 
def runCode(frequents, regs):
    patBase1, ETOHHeavyFreqs, smokesHeavyFreqs = frequentsArray(frequents, 1)
    patBase2, ETOHHeavyRegs, smokesHeavyRegs = frequentsArray(regs, 0)

    patientBase = patBase1 + patBase2
    totalSmokes = smokesHeavyFreqs + smokesHeavyRegs
    ETOHTotal = ETOHHeavyFreqs + ETOHHeavyRegs
    DiabetesTotal = getDiabetes(frequents) + getDiabetes(regs)
    PrescriptionsTotal = prescIndicators(frequents) + prescIndicators(regs)

    Xtr = list(map(list,zip(totalSmokes, ETOHTotal)))
    print("Models on ETOH and Tobacco")
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco and Diabetes")
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal, DiabetesTotal)))
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco, Diabetes and Prescriptions")
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal, DiabetesTotal, PrescriptionsTotal)))
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on Prescriptions")
    Xtr = list(map(list,zip(PrescriptionsTotal)))
    runModels(Xtr, patientBase)

In [22]:
#same as above but this time, the frequents and regs parameters are dicts (extra dimension of year), not lists
def runCodeDicts(frequents, regs):
    patBase1, ETOHHeavyFreqs, smokesHeavyFreqs = frequentsArrayOrg(frequentsBase)
    patBase2, ETOHHeavyRegs, smokesHeavyRegs = regsArrayOrg(regularsBase)

    patientBase = patBase1 + patBase2
    totalSmokes = smokesHeavyFreqs + smokesHeavyRegs
    ETOHTotal = ETOHHeavyFreqs + ETOHHeavyRegs
    
    listFreqs = []
    for f in frequents.keys():
        listFreqs = listFreqs + frequents[f]

    listFreqs = list(set(listFreqs))
    DiabetesTotal = getDiabetes(listFreqs) + getDiabetes(regs)
    PrescriptionsTotal = prescIndicators(listFreqs) + prescIndicators(regs)

    Xtr = list(map(list,zip(totalSmokes, ETOHTotal)))
    print("Models on ETOH and Tobacco")
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco and Diabetes")
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal, DiabetesTotal)))
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco, Diabetes and Prescriptions")
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal, DiabetesTotal, PrescriptionsTotal)))
    runModels(Xtr, patientBase)
    print("----------------------")
    print("Models on ETOH, Tobacco, Diabetes and Prescriptions")
    Xtr = list(map(list,zip(PrescriptionsTotal)))
    runModels(Xtr, patientBase)

In [23]:
from collections import Counter

names = [item[3] for item in prescriptions]
organisedPrescriptions = fileToDict("2018-4169_Prescriptions.txt")
ordered = Counter(names[1:])
orderedPrescList = list(ordered.keys())

In [24]:
def retrieveRegulars(org, frequents):
    regularsBase = []
    newOrder = Counter(frequents)
    for k in org.keys():
        if not(k in newOrder):
            regularsBase.append(k)
    return regularsBase

In [25]:
def runPrescriptions(frequents, regs, top=orderedPrescList[0:20]):
    
    medList = []
    for med in top:
        counter = 0
        for f in frequents:
            if (f in organisedPrescriptions):
                if (med in [item[3] for item in organisedPrescriptions[f]]):
                    counter = counter + 1
        medList.append(counter)
    
    medList2 = []
    for med in top:
        counter = 0
        for f in regs28days:
            if (f in organisedPrescriptions):
                if (med in [item[3] for item in organisedPrescriptions[f]]):
                    counter = counter + 1
        medList2.append(counter)
    
    x = ([(item / len(freqs28days)) for item in medList])
    y = ([(item / len(regs28days)) for item in medList2])

    x1 = np.array(x)
    y1 = np.array(y)

    return(x1 > y1, x, y)

In [26]:
indicatorIndices = [2, 8, 16, 19]
indicators = [orderedPrescList[k] for k in indicatorIndices]
print(indicators)

['citalopram', 'chlordiazePOXIDE', 'QUEtiapine', 'mirtazapine']


In [27]:
def prescIndicators(deIds):
    prescs = []
    for patient in deIds:
        if patient in organisedPrescriptions:
            entry = organisedPrescriptions[patient]
            data = [item[3] for item in entry]
            if (not set(data).isdisjoint(indicators)):
                prescs.append(1)
            else:
                prescs.append(0)
    return prescs

# N or more during indexed year

Follwing code blocks deal with the case where the definion of a frequent flyer is N visits per index year (2018, 1017, 2016 etc)

In [46]:
#specialised routines that get the ETOH,tobacco, and patientBase lists for dict base data 
#(aka is same as frequentsAray above, but works on dicts)
def frequentsArrayOrg(frequents):
    patListFreqs = []
    smokesHeavyFreqs = []
    ETOHHeavyFreqs = []
    patientBase = []
    
    for k in frequents.keys():
        patListFreqs = patListFreqs + frequents[k]
        for p in patListFreqs:
            patientBase.append(1)
            if p in ETOH:
                entry = ETOH[p]
                drinkFreq = isBad(entry[0][4], entry[0][5])
                ETOHHeavyFreqs.append(drinkFreq)
            else:
                ETOHHeavyFreqs.append(0)
            
            if p in tobacco:
                entry = tobacco[p]
                smokingFreq = isBadSmoking(entry[0][2])
                smokesHeavyFreqs.append(smokingFreq)
            else:
                smokesHeavyFreqs.append(0)
    return (patientBase, ETOHHeavyFreqs, smokesHeavyFreqs)

def regsArrayOrg(regulars):
    patListRegs = []
    smokesHeavyRegs = []
    ETOHHeavyRegs = []
    patientBase = []

    for k in regulars.keys():
        patListRegs = patListRegs + list(regulars[k].keys())

    for p in patListRegs:
        patientBase.append(0)

        if p in ETOH:
            entry = ETOH[p]
            drinkFreq = isBad(entry[0][4], entry[0][5])
            ETOHHeavyRegs.append(drinkFreq)
        else:
            ETOHHeavyRegs.append(0)
        
        if p in tobacco:
            entry = tobacco[p]
            smokingFreq = isBadSmoking(entry[0][2])
            smokesHeavyRegs.append(smokingFreq)
        else:
            smokesHeavyRegs.append(0)
    
    return (patientBase, ETOHHeavyRegs, smokesHeavyRegs)

In [47]:
organisedIndex = organiseCohortDataIndex(cohort[1:])

In [48]:
frequentsBase = getFrequentFlyersExtended(organisedIndex, visits=4)
regularsBase = getNonFrequentFlyersBase(organisedIndex, frequentsBase)
runCodeDicts(frequentsBase, regularsBase)

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.8001246192190529, 0.1637626641434539, 0.8054049064180379, 0.15889317117075535]
----------------------
Models on ETOH, Tobacco and Diabetes
[]
[]
[]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[]
[]
[]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[]
[]
[]


In [49]:
listFreqs = []
for f in frequentsBase.keys():
    listFreqs = listFreqs + frequentsBase[f]
print(runPrescriptions(frequentsBase, regularsBase))
print(runPrescriptions(frequentsBase, regularsBase, indicators))

(array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False], dtype=bool), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.176199660546212, 0.0016355500694337293, 0.006557629995371085, 0.02901558401481253, 0.0014889677518901403, 0.01679524764696806, 0.0665869464588798, 0.00022373090572442523, 0.0017204135164326493, 0.025034716864681377, 0.0031476623977781206, 0.011657151674124363, 0.0053772565962042895, 0.004297176361672581, 0.00031630921154142877, 0.0010106465051689553, 0.011340842462582934, 0.005361826878568122, 0.03372164789384354, 0.0036491282209535564])
(array([False, False, False, False], dtype=bool), [0.0, 0.0, 0.0, 0.0], [0.006557629995371085, 0.0017204135164326493, 0.011340842462582934, 0.0036491282209535564])


# N visits 2 years after initial visit

Deals with the definiton of frequent flyers that defines a frequent flyer as returning N times within a space of 2 years after a given visit

In [40]:
organised = organiseCohortData(cohort[1:])

In [41]:
freqs = abstractFreqFlyers(organised, 730, 10)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15398998330550917, 0.14998330550918196, 0.1419365609348915, 0.14647746243739565]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.8460100166944908, 0.14998330550918196, 0.1419365609348915, 0.14647746243739565]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.00010430791697089809, 0.0005215395848544905, 0.0003129237509126943, 0.00020861583394179617]
[0.0013038489621362262, 0.002138312297903411, 0.0018775425054761655, 0.00020861583394179617]
[0.1564097214978617, 0.15374986961510378, 0.8497444456034213, 0.14509231250651924]
----------------------
Models on Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.03937623865651403, 0.04208824449775738, 0.03911546886408678, 0.039741316365912174]
(array([False,  True, False, False,  True, False, False, False,  True,
       False, False, False, False,  True, False, False, False,  True,


# N visits in 12 months

Deals with the definition that specifies a frequent flyer as visiting N times in the space of 12 months

In [35]:
organised = organiseCohortData(cohort[1:])

In [36]:
# N=3
freqs = abstractFreqFlyers(organised, 365, 3)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.08637729549248747, 0.08470784641068448, 0.08641068447412353, 0.08297161936560934]
[0.08637729549248747, 0.08470784641068448, 0.08641068447412353, 0.08297161936560934]
[0.08637729549248747, 0.08470784641068448, 0.9135893155258764, 0.08297161936560934]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.08637729549248747, 0.08470784641068448, 0.08641068447412353, 0.08297161936560934]
[0.08637729549248747, 0.08470784641068448, 0.08641068447412353, 0.08297161936560934]
[0.08637729549248747, 0.08470784641068448, 0.08641068447412353, 0.08297161936560934]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.09080004172316679, 0.08542818399916553, 0.08787942004798165, 0.08224679253155315]
[0.09080004172316679, 0.08542818399916553, 0.08787942004798165, 0.08224679253155315]
[0.09080004172316679, 0.08542818399916553, 0.08787942004798165, 0.08224679253155315]
----------------------
Models on Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 

In [37]:
# N=4
freqs = abstractFreqFlyers(organised, 365, 4)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.7933889816360601, 0.14447412353923206, 0.8642404006677796, 0.13996661101836394]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.007779632721202003, 0.008380634390651086, 0.008180300500834724, 0.007212020033388981]
[0.007779632721202003, 0.008380634390651086, 0.008180300500834724, 0.007212020033388981]
[0.7933889816360601, 0.7966611018363939, 0.13575959933222037, 0.13996661101836394]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.09664128507353709, 0.09121727339105037, 0.09492020444351726, 0.08871388338374883]
[0.09664128507353709, 0.09121727339105037, 0.09492020444351726, 0.06216751851465526]
[0.8520392197767811, 0.9087827266089497, 0.09492020444351726, 0.08871388338374883]
----------------------
Models on Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.03708146448315427, 0.039480546573484925, 0.03655992489829978, 0.03765515802649421]
(array([ True,  True,  T

In [38]:
# N=5
freqs = abstractFreqFlyers(organised, 365, 5)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15041736227045074, 0.14664440734557596, 0.1383973288814691, 0.14263772954924875]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15041736227045074, 0.8533555926544241, 0.1383973288814691, 0.8573622704507513]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.01199541045165328, 0.011786794617711484, 0.013455721289245853, 0.010535099614060708]
[0.01199541045165328, 0.011786794617711484, 0.013455721289245853, 0.010535099614060708]
[0.15135078752477313, 0.14848231980807344, 0.1472827787629081, 0.14415354125378116]
----------------------
Models on Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.03744654219255242, 0.04031500990925211, 0.03749869615103786, 0.03775946594346511]
(array([False,  True, False,  True,  True,  True, False, False,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
       False,

In [39]:
# N=6
freqs = abstractFreqFlyers(organised, 365, 6)
regs = retrieveRegulars(organised, freqs)

runCode(freqs, regs)
print(runPrescriptions(freqs, regs))
print(runPrescriptions(freqs, regs, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15198664440734558, 0.14797996661101837, 0.13913188647746244, 0.1447078464106845]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.8480133555926544, 0.14797996661101837, 0.8608681135225376, 0.1447078464106845]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.0038593929279232293, 0.004328778554292271, 0.004746010222175863, 0.003494315218525086]
[0.0038593929279232293, 0.004328778554292271, 0.004746010222175863, 0.003494315218525086]
[0.8477104412224888, 0.15119432564931679, 0.14853447376655887, 0.1447272347971211]
----------------------
Models on Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.038489621362261396, 0.041097319286533845, 0.03812454365286325, 0.9613539167622822]
(array([False,  True, False, False,  True, False, False, False,  True,
       False,  True,  True, False,  True, False, False, False,  True,
       Fa

# N visits every year consitently

Deals with the definition of frequent flyer that specifies a frequent flyer to be someone who consistently comes in more than N times every year

In [30]:
organisedIndex = organiseCohortDataIndex(cohort[1:])
standard = organiseCohortData(cohort[1:])

In [31]:
def getChronicFrequentFlyers(frequentFlyers):
    exists = True
    shortest = 300000
    lowestYear = None
    for key in frequentFlyers:
        if len(frequentFlyers[key]) < shortest:
            lowestYear = key
    
    patientDict = frequentFlyers[lowestYear]
    chronic = []
    for pKey in patientDict:
        inAll = True
        for year in frequentFlyers.keys():
            inAll = inAll and (pKey in frequentFlyers[year])
        if (inAll):
            chronic.append(pKey)
                
    return chronic

In [32]:
frequentsBase = getChronicFrequentFlyers(getFrequentFlyersExtended(organisedIndex, visits=3))
regularsBase = retrieveRegulars(standard, frequentsBase)

runCode(frequentsBase, regularsBase)
print(runPrescriptions(frequentsBase, regularsBase))
print(runPrescriptions(frequentsBase, regularsBase, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15736227045075124, 0.1530550918196995, 0.14360601001669449, 0.1485475792988314]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15736227045075124, 0.1530550918196995, 0.14360601001669449, 0.8514524207011687]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15922603525607593, 0.8433816626681965, 0.8469281318452071, 0.1477000104307917]
----------------------
Models on Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.041410243037446545, 0.04427871075414624, 0.04031500990925211, 0.041097319286533845]
(array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False], dtype=bool), [0.00034772241816104514, 4.967463116586359e-05, 4.967463116586359e-05, 0.00029804778699518155, 4.967463116586359

In [33]:
frequentsBase = getChronicFrequentFlyers(getFrequentFlyersExtended(organisedIndex, visits=4))
regularsBase = retrieveRegulars(standard, frequentsBase)

runCode(frequentsBase, regularsBase)
print(runPrescriptions(frequentsBase, regularsBase))
print(runPrescriptions(frequentsBase, regularsBase, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.1573288814691152, 0.1530550918196995, 0.14360601001669449, 0.1485475792988314]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.1573288814691152, 0.8469449081803005, 0.14360601001669449, 0.1485475792988314]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.1591738812975905, 0.1566183373318035, 0.15307186815479296, 0.14775216438927716]
----------------------
Models on Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.041410243037446545, 0.04427871075414624, 0.04031500990925211, 0.041097319286533845]
(array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False], dtype=bool), [0.00019869852466345437, 4.967463116586359e-05, 4.967463116586359e-05, 0.00019869852466345437, 4.967463116586359e

In [34]:
frequentsBase = getChronicFrequentFlyers(getFrequentFlyersExtended(organisedIndex, visits=5))

regularsBase = retrieveRegulars(standard, frequentsBase)

runCode(frequentsBase, regularsBase)
print(runPrescriptions(frequentsBase, regularsBase))
print(runPrescriptions(frequentsBase, regularsBase, indicators))

Models on ETOH and Tobacco
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15736227045075124, 0.15302170283806343, 0.14360601001669449, 0.14858096828046743]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.15736227045075124, 0.15302170283806343, 0.8563939899833055, 0.14858096828046743]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.1591738812975905, 0.1566183373318035, 0.15307186815479296, 0.8522478356107228]
----------------------
Models on Prescriptions
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.041410243037446545, 0.04427871075414624, 0.04031500990925211, 0.041149473245019295]
(array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False], dtype=bool), [9.934926233172718e-05, 0.0, 0.0, 9.934926233172718e-05, 4.967463116586359e-05, 0.0, 4.967463116586359e-05, 0

# Returned within 28 days ever

Defines a frequent flyer to be anyone who has returned 28 days within their discharge period

In [29]:
organised = organiseCohortData(cohort[1:])

freqs28days = abstractFreqFlyers(organised, 28, 2)
regs28days = retrieveRegulars(organised, freqs28days)

runCode(freqs28days, regs28days)
print(runPrescriptions(freqs28days, regs28days))
print(runPrescriptions(freqs28days, regs28days, indicators))

Models on ETOH and Tobacco
[0.08814691151919866, 0.08457429048414024, 0.08517529215358932, 0.08330550918196995]
[0.08814691151919866, 0.08457429048414024, 0.08517529215358932, 0.08330550918196995]
[0.08814691151919866, 0.08457429048414024, 0.08517529215358932, 0.91669449081803]
----------------------
Models on ETOH, Tobacco and Diabetes
[0.08814691151919866, 0.08457429048414024, 0.08517529215358932, 0.08330550918196995]
[0.08814691151919866, 0.08457429048414024, 0.08517529215358932, 0.08330550918196995]
[0.08814691151919866, 0.08457429048414024, 0.08517529215358932, 0.08330550918196995]
----------------------
Models on ETOH, Tobacco, Diabetes and Prescriptions
[1.0, 0.08855742150829247, 0.08772295817252529, 0.08579326170856368, 0.08318556378429123]
[1.0, 0.08855742150829247, 0.08772295817252529, 0.08579326170856368, 0.08318556378429123]
[0.7331803483884427, 0.08855742150829247, 0.08772295817252529, 0.9142067382914363, 0.08318556378429123]
----------------------
Models on Prescriptions
