In [21]:
%matplotlib inline
# notebook
import matplotlib.pylab as pylab
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.cluster import KMeans


#make the graphs bigger
pylab.rcParams['figure.figsize'] = (32.0, 24.0)
pylab.rcParams['font.size'] = 24

In [2]:
cohort = []
ETOH = []
hw = []
labOrders = []
medOrders = []
prescriptions = []
radiology = []
tobacco = []
vitals = []

In [3]:
def fileToList(fname):
    dataArray = []
    with open(fname) as f:
        for line in f:
            new = line.strip()
            new = new[1:len(new) - 1]
            new = new.split('","')
            if (len(new) > 1):
                dataArray.append(new)
    return dataArray

In [4]:
def fileToDict(fname):
    dataDict = dict()
    with open(fname) as f:
        for line in f:
            new = line.strip()
            new = new[1:len(new) - 1]
            new = new.split('","')
            if (len(new) > 1):
                if not(new[0] in dataDict):
                    dataDict[new[0]] = [new]
                else:
                    dataDict[new[0]].append(new)
    return dataDict

In [5]:
fname = "2018-4169_Cohort.txt"
cohort = fileToList(fname)

fname = "2018-4169_ETOH_Use.txt"
ETOH = fileToDict(fname)

fname = "2018-4169_Height_Weight.txt"
hw = fileToDict(fname)

fname = "2018-4169_Lab_Orders_Performed.txt"
labOrders = fileToDict(fname)


In [6]:
fname = "2018-4169_Med_Orders.txt"
medOrders = fileToDict(fname)

fname = "2018-4169_Prescriptions.txt"
prescriptions = fileToList(fname)

In [7]:
fname = "2018-4169_Tobacco_Use.txt"
tobacco = fileToDict(fname)

In [8]:
fname = "2018-4169_ETOH_Use.txt"
ETOHList = fileToList(fname)

In [9]:
tobList = fileToList(fname)

In [10]:
import collections

#returns a dict of dicts of lists
#dict of years
 #dict of patients
  #list of admissions
def organiseCohortData(myCohort):
    organised = dict()
    for admission in myCohort:
        year = admission[8][0:4]
        if year in organised:
            entry = organised[year]
            if admission[0] in entry:
                organised[year][admission[0]].append(admission[1:])
            else:
                organised[year][admission[0]] = [admission[1:]]
        else:
            patientData = dict()
            patientData[admission[0]] = [admission[1:]]
            organised[year] = patientData
    return organised
    

In [11]:
def getFrequentFlyersBase(dataDict, year=None):
    if (year == None):
        freqDict = dict()
        for key in dataDict:
            freqDict[key] = []
            patientDict = dataDict[key]
            for pKey in patientDict:
                if len(patientDict[pKey]) > 4:
                    freqDict[key].append(pKey)
        return freqDict
    else:
        freqDict= dict()
        if (year in dataDict):
            patientDict = dataDict[year]
            for pKey in patientDict:
                if len(patientDict[pKey]) > 4:
                    freqDict[key].append(pKey)
            return freqDict
        else:
            return dict()

In [12]:
from copy import copy,deepcopy

def getNonFrequentFlyersBase(cohortDict, frequentFlyerDict):
    regular = dict()
    for year in cohortDict:
        regular[year] = dict()
        patList = cohortDict[year]
        for pKey in patList:
            if not(pKey in frequentFlyerDict[year]):
                regular[year][pKey] = cohortDict[year][pKey]
    return regular

In [13]:
organised = organiseCohortData(cohort[1:])
frequentsBase = getFrequentFlyersBase(organised)
regularsBase = getNonFrequentFlyersBase(organised, frequentsBase)

In [14]:
def isBadSmoking(res):
    if (res == "current every day smoker" or res == "current some day smoker" or res == "current every day" or res=="heavy tobacco smoker"):
        return 1
    else:
        return 0
    
def isBad(cat,res):
    if (cat == "Number of Standard Drinks on a Typical Day"):
        if (res == "10 or more" or res == "7 to 9"):
            return 1
        else:
            return 0
    elif (cat == "Alcohol Treatment Offered"):
        if (res == "yes"):
            return 1
        else:
            return 0
    elif (cat == "Excessive Alcohol and Drug Use"):
        if (res == "yes"):
            return 1
        else:
            return 0
    elif (cat == "10.Has a Relative, Friend, Doctor, or Other Health Professional Expressed Concern About Your Drinking or Suggested You Cut Down"):
        if (res == "(2) Yes, during the last year"):
            return 1
        else:
            return 0
    
    return 0

def frequentsArray(frequents):
    patListFreqs = []
    smokesHeavyFreqs = []
    ETOHHeavyFreqs = []
    patientBase = []
    
    for k in frequents.keys():
        patListFreqs = patListFreqs + frequents[k]
        for p in patListFreqs:
            patientBase.append(1)
            if p in ETOH:
                entry = ETOH[p]
                drinkFreq = isBad(entry[0][4], entry[0][5])
                ETOHHeavyFreqs.append(drinkFreq)
            else:
                ETOHHeavyFreqs.append(0)
            
            if p in tobacco:
                entry = tobacco[p]
                smokingFreq = isBadSmoking(entry[0][2])
                smokesHeavyFreqs.append(smokingFreq)
            else:
                smokesHeavyFreqs.append(0)
    return (patientBase, ETOHHeavyFreqs, smokesHeavyFreqs)

In [15]:
def regsArray(regulars):
    patListRegs = []
    smokesHeavyRegs = []
    ETOHHeavyRegs = []
    patientBase = []

    for k in regulars.keys():
        patListRegs = patListRegs + list(regulars[k].keys())

    for p in patListRegs:
        patientBase.append(0)

        if p in ETOH:
            entry = ETOH[p]
            drinkFreq = isBad(entry[0][4], entry[0][5])
            ETOHHeavyRegs.append(drinkFreq)
        else:
            ETOHHeavyRegs.append(0)
        
        if p in tobacco:
            entry = tobacco[p]
            smokingFreq = isBadSmoking(entry[0][2])
            smokesHeavyRegs.append(smokingFreq)
        else:
            smokesHeavyRegs.append(0)
    
    return (patientBase, ETOHHeavyRegs, smokesHeavyRegs)

In [22]:
patBase1, ETOHHeavyFreqs, smokesHeavyFreqs = frequentsArray(frequentsBase)
patBase2, ETOHHeavyRegs, smokesHeavyRegs = regsArray(regularsBase)

patientBase = patBase1 + patBase2
totalSmokes = smokesHeavyFreqs + smokesHeavyRegs
ETOHTotal = ETOHHeavyFreqs + ETOHHeavyRegs

print(len(patientBase))
print(len(totalSmokes))
print(len(ETOHTotal))

198728
198728
198728


In [17]:
from collections import Counter
justCol3 = [item[2] for item in tobList]

print(Counter(justCol3))

Counter({'ED Primary Assessment': 18591, 'Emergency Physician Treatment Record': 16961, 'Patient Profile': 4410, 'Consultation, Initial-Psychiatry': 1330, 'CSW Psychosocial Assessment Note-Initial': 852, 'H&P-Primary-Med: General': 693, 'CSW Psychosocial Assessment Note-Initial, Verbal/Paged Consu': 691, 'H&P Note-Primary: Med: General': 631, 'Consultation, Initial-Orthopaedics': 328, 'Consultation, Initial-Neurology': 277, 'H&P Note-Primary: Psychiatry': 236, 'H&P Note-Consulting: Psychiatry': 209, 'Consultation, Initial-Neurosurgery': 199, 'Consultation, Initial-Surg: General/Acute': 182, 'H&P-Primary-Family Medicine': 169, 'Consultation, Initial-Ophthalmology': 157, 'Consultation, Initial-Med: Palliative Care': 128, 'CSW Psychosocial Assessment Note-Progress, Verbal/Paged Cons': 118, 'Consultation, Initial-Med: Gastroenterology': 114, 'H&P Note-Consulting: Neurology': 108, 'Consultation, Initial-Med: Cardiology': 104, 'H&P Note-Consulting: Orthopaedics': 98, 'Consultation, Initial-S

In [19]:
from collections import Counter

justCol4 = [item[4] for item in ETOHList]
print(Counter(justCol4))

Counter({'Number of Standard Drinks on a Typical Day': 27411, 'Frequency of 6 or More Drinks on One Occasion': 17654, 'Alcohol Last Used': 1798, 'Past Alcohol Use': 1206, 'Alcohol Treatment Offered': 794, 'Excessive Alcohol and Drug Use': 339, '10.Has a Relative, Friend, Doctor, or Other Health Professional Expressed Concern About Your Drinking or Suggested You Cut Down': 8, '9.Have You or Someone Else Been Injured as a Result of Your Drinking': 8, '1.How Often Do You Have a Drink': 8, '6.How Often During The Last Year Have You Been Unable to Remember What Happened The Night Before Because You Had Been Drinking': 4, '8.How Often During The last Year Have You Had a Feeling of Guilt or Remorse After Drinking': 4, '2.How Many Drinks Containing Alcohol Do You Have On a Typical Day When You Are Drinking': 4, '3.How Often Do You Have Six or More Drinks On One Occasion': 4, '4.How Often During The Last year Have You Found That You Were Not Able to Stop Drinking Once You Had Started': 4, '5.Ho

In [20]:
x = [item for item in ETOHList if item[4]=="10.Has a Relative, Friend, Doctor, or Other Health Professional Expressed Concern About Your Drinking or Suggested You Cut Down"]
use = [item[5] for item in x]
print(Counter(use))

Counter({'(0) No': 6, '(2) Yes, during the last year': 2})


In [38]:
def difference(l1,l2):
    precisionError = len([b for a,b in zip(l1,l2) if b != a and b == 0 and a == 1])
    recallError = len([b for a,b in zip(l1,l2) if b != a and b == 1 and a == 0])
    print("Precision Error is " + str(precisionError))
    print("Recall Error is " + str(recallError))

    return (precisionError) + (recallError)

In [27]:
runModels(ETOHTotal, totalSmokes, patientBase)

198728
198728
198728
[]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.8279323705530116, 0.18190509736828864, 0.2035727764498679, 0.16120266700213864]


In [28]:
def getFrequentFlyersExtended(dataDict, year=None, visits=10):
    if (year == None):
        freqDict = dict()
        for key in dataDict:
            freqDict[key] = []
            patientDict = dataDict[key]
            for pKey in patientDict:
                if len(patientDict[pKey]) > visits:
                    freqDict[key].append(pKey)
        return freqDict
    else:
        freqDict= dict()
        if (year in dataDict):
            patientDict = dataDict[year]
            for pKey in patientDict:
                if len(patientDict[pKey]) > visits:
                    freqDict[key].append(pKey)
            return freqDict
        else:
            return dict()

In [29]:
frequentsExtended = getFrequentFlyersExtended(organised)
regularsExtended = getNonFrequentFlyersBase(organised, frequentsBase)

In [34]:
patBase1, ETOHHeavyFreqs, smokesHeavyFreqs = frequentsArray(frequentsExtended)
patBase2, ETOHHeavyRegs, smokesHeavyRegs = regsArray(regularsExtended)

patientBase = patBase1 + patBase2
totalSmokes = smokesHeavyFreqs + smokesHeavyRegs
ETOHTotal = ETOHHeavyFreqs + ETOHHeavyRegs

print(len(patientBase))
print(len(totalSmokes))
print(len(ETOHTotal))

193243
193243
193243


In [31]:
def runModels(ETOHTotal, totalSmokes, patientBase):
    kf = KFold(n_splits=5)
    print(len(patientBase))
    print(len(totalSmokes))
    print(len(ETOHTotal))
    Xtr = list(map(list,zip(totalSmokes, ETOHTotal)))
    Ytr = patientBase
    differencesKNN = []
    differencesLR = []
    differencesSVM = []
    differencesKM = []

    for train_index, test_index in kf.split(Xtr):
        X_train, X_test = [Xtr[i] for i in train_index], [Xtr[i] for i in test_index]
        y_train, y_test = [Ytr[i] for i in train_index], [Ytr[i] for i in test_index]
    
        if (1 in y_train and 0 in y_train):
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            PredLR = lr.predict(X_test)
            differencesLR.append(difference(PredLR, y_test)/len(y_test))
    
            km = KMeans(n_clusters=2)
            km.fit(X_train, y_train)
            PredKM = km.predict(X_test)
            differencesKM.append(difference(PredKM, y_test)/len(y_test))
    
            clf = svm.SVC()
            clf.fit(X_train, y_train)
            PredSVM = clf.predict(X_test)
            differencesSVM.append(difference(PredSVM, y_test)/len(y_test))
    print((differencesKNN))
    print((differencesLR))
    print((differencesSVM))
    print((differencesKM))

In [35]:
runModels(ETOHTotal, totalSmokes, patientBase)

193243
193243
193243
[]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.16670547750265208, 0.8153380423814329, 0.7961084661560753, 0.1596460360173877]


In [32]:
def getFrequentFlyersMultiYears(organised):
    keys = list(organised.keys())
    keys = sorted(keys)
    orgYears = dict()
    for l in range(len(keys)-2):
        newKey = (keys[l]+keys[l+1]+keys[l+2])
        orgYears[newKey] = dict()
        patKeys = list(organised[keys[l]].keys())+list(organised[keys[l+1]].keys())+list(organised[keys[l+2]].keys())
        for p in patKeys:
            try:
                key1 = organised[keys[l]][p]
            except:
                key1 = []
                
            try:
                key2 = organised[keys[l+1]][p]
            except:
                key2 = []
                
            try:
                key3 = organised[keys[l+2]][p]
            except:
                key3 = []
            orgYears[newKey][p] = key1 + key2 + key3
    frequentFs = getFrequentFlyersExtended(orgYears)
    regs = getNonFrequentFlyersBase(orgYears, frequentFs)
    return (frequentFs, regs)

In [33]:
frequentsYears, regsYears = getFrequentFlyersMultiYears(organised)

In [37]:
patBase1, ETOHHeavyFreqs, smokesHeavyFreqs = frequentsArray(frequentsYears)
patBase2, ETOHHeavyRegs, smokesHeavyRegs = regsArray(regsYears)

patientBase = patBase1 + patBase2
totalSmokes = smokesHeavyFreqs + smokesHeavyRegs
ETOHTotal = ETOHHeavyFreqs + ETOHHeavyRegs

print(len(patientBase))
print(len(totalSmokes))
print(len(ETOHTotal))

runModels(ETOHTotal, totalSmokes, patientBase)

399318
399318
399318
399318
399318
399318
[]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.19087448662726636, 0.19004808173895624, 0.16957790215744462, 0.14790328437449132]


In [120]:
getFrequentFlyersMultiYears(frequentsYears)

201220132014
201320142015
201420152016
201520162017
201620172018


In [None]:
def abstractFreqFlyers(cohortDict, gap, visits):
    