In [53]:
import dask as da
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
#import shap

import dask.dataframe as dd

# Directory that holds the input CSV tables.
path = '../test'
# Lazily yield only files that really end in '.csv'; a bare substring test
# would also pick up names such as 'csv_notes.txt'.
g = (file for file in os.listdir(path) if file.endswith('.csv'))

# Read every table into a dictionary keyed by the file name without extension.
dataset = dict()

for file in g:
    # splitext removes the extension without assuming it is 4 characters long.
    file_name = os.path.splitext(file)[0]
    # Build the location from `path` so the directory is defined in one place.
    file_path = os.path.join(path, file)
    # The date columns to parse are chosen from fragments of the file name.
    if ('condit' in file) or ( 'device' in file ) or ('medi' in file) or ('encount' in file) or ('aller' in file):
        dataset[file_name] = dd.read_csv(file_path,parse_dates=['START','STOP'])
    elif ('patien' in file):
        dataset[file_name] = dd.read_csv(file_path,parse_dates=['BIRTHDATE','DEATHDATE'])
    elif ('proce' in file) or ('imaging' in file):
        dataset[file_name] = dd.read_csv(file_path,parse_dates=['DATE'])
    elif('days' in file):
        # header=0 together with names= replaces the file's first row by these names.
        dataset[file_name] = dd.read_csv(file_path,names = ['Id','PRED_NUM_DAYS_ICU'],header = 0)
    else:
        dataset[file_name] = dd.read_csv(file_path)
    
    
class Dataset(object):
    """
    Expose a dictionary of tables as attributes of a single object.

    Every key of the dictionary becomes an attribute holding the matching
    value, so ``Dataset({'patients': df}).patients`` returns ``df``.
    """

    def __init__(self, data_dict):
        """
        Store the dictionary and mirror each of its entries as an attribute.

        Parameters
        ----------
        data_dict : dict
            Maps attribute names to the objects to expose. The dictionary is
            kept by reference (not copied), so ``add`` also modifies it.
        """
        self.data_dict = data_dict
        for key in data_dict:
            setattr(self, key, data_dict[key])

    def add(self, data_key, data_to_add):
        """
        Register an additional entry after construction.

        Parameters
        ----------
        data_key : str
            Name under which the data is stored; also becomes the attribute name.
        data_to_add : object
            The data to store.
        """
        self.data_dict.update({data_key: data_to_add})
        # Keep attribute access in step with the dictionary, exactly as the
        # constructor does for the initial entries.
        setattr(self, data_key, data_to_add)
        
            
# Wrap the loaded tables so each one is reachable as an attribute (e.g. test.medications).
test = Dataset(dataset)

In [55]:
# Keep medications with no STOP date or with a STOP date in 2019/2020.
medications = test.medications[(test.medications.STOP.dt.year == 2020)
                               |(test.medications.STOP.isna())
                               |(test.medications.STOP.dt.year == 2019)]

# Keep patients with no DEATHDATE or with a DEATHDATE in 2020.
patients = test.patients[(test.patients.DEATHDATE.dt.year == 2020)
                         | (test.patients.DEATHDATE.isna())]

# Materialise only the columns used downstream as an in-memory pandas frame.
patients = patients[['Id','BIRTHDATE','DEATHDATE','MARITAL','RACE','ETHNICITY','GENDER','HEALTHCARE_EXPENSES','HEALTHCARE_COVERAGE']].compute()

# Compute the age of each patient in whole years, as a float.
# The earlier .astype('M8[D]') / .astype('timedelta64[Y]') casts are rejected by
# pandas >= 2.0 (only s/ms/us/ns units are supported). Dividing by the same
# 365.2425-day year that numpy uses for the 'Y' unit and flooring gives the
# same whole-year result on every pandas version.
# NOTE(review): the reference date is the moment the cell runs, so AGE changes
# between runs — confirm this matches how the model was trained.
reference_time = pd.Timestamp(np.datetime64('NOW'))
birth_day = pd.to_datetime(patients['BIRTHDATE']).dt.normalize()
patients['AGE'] = np.floor((reference_time - birth_day) / pd.Timedelta(days=365.2425)).astype(float)

# Patients = dd.from_pandas(patients,npartitions=1)
patients = patients.drop(columns = ['DEATHDATE','ETHNICITY','BIRTHDATE'])
covid_stat = pd.read_csv('../test/covid_status_pred_ke.csv')

In [56]:
# Split the predicted COVID probabilities around a single cutoff.
covid_cutoff = 0.7
covid_proba = covid_stat['predicted_proba']
pos_covid_id = covid_stat.loc[covid_proba > covid_cutoff, 'Id']
neg_covid_id = covid_stat.loc[covid_proba < covid_cutoff, 'Id']

# COVID_STAT is 1 above the cutoff, -1 below it, and stays 0 otherwise
# (patient absent from covid_stat, or probability exactly equal to the cutoff).
# The negative assignment runs last, so it wins if an Id appears in both sets.
is_positive = patients['Id'].isin(pos_covid_id)
is_negative = patients['Id'].isin(neg_covid_id)
patients['COVID_STAT'] = 0
patients.loc[is_positive, 'COVID_STAT'] = 1
patients.loc[is_negative, 'COVID_STAT'] = -1


In [57]:
# Hypertension flag, derived from the reason recorded on each medication.

# Medications with no STOP date or a STOP date in 2019/2020.
medication_stop = test.medications.STOP
medications = test.medications[(medication_stop.dt.year == 2020)
                               | medication_stop.isna()
                               | (medication_stop.dt.year == 2019)]

# Patients that have at least one such medication prescribed for 'Hypertension'.
medication_reasons = medications[['PATIENT', 'REASONDESCRIPTION']]
is_hypertension = medication_reasons['REASONDESCRIPTION'] == 'Hypertension'
hypertension_id = medication_reasons[is_hypertension].compute()
hypertension_id = hypertension_id['PATIENT'].unique()

# 1 for patients in that set, 0 for everyone else.
has_hypertension = patients['Id'].isin(hypertension_id)
patients['HYPERTENSION'] = 0
patients.loc[has_hypertension, 'HYPERTENSION'] = 1

In [58]:
# Obesity flag (and prediabetes patient ids) from conditions, plus MARITAL gap filling.

# Conditions with no STOP date or a STOP date in 2020.
condition_stop = test.conditions['STOP']
conditions = test.conditions[(condition_stop.dt.year == 2020) | condition_stop.isna()]
temp = conditions[['PATIENT', 'DESCRIPTION']].compute()

# Patient ids per condition description.
condition_name = temp['DESCRIPTION']
obesity_id = temp.loc[condition_name == 'Body mass index 30+ - obesity (finding)', 'PATIENT']
prediabetes_id = temp.loc[condition_name == 'Prediabetes', 'PATIENT']

# Missing marital status is filled with 'S'.
patients['MARITAL'] = patients['MARITAL'].fillna('S')

# 1 for patients with the obesity finding, 0 for everyone else.
is_obese = patients['Id'].isin(obesity_id)
patients['OBESITY'] = 0
patients.loc[is_obese, 'OBESITY'] = 1

In [67]:
# Attach the predicted number of ICU days to every patient (NaN when no row matches).
# NOTE(review): with names= and no header=, pandas treats the first line of the
# file as data; the dask loader reads the same file with header=0 — confirm
# whether days_in_icu.csv has a header row.
icu_column = 'PRED_NUM_DAYS_ICU'
days_in_icu = pd.read_csv('../test/days_in_icu.csv',names = ['Id',icu_column])
# Drop any copy of the column left by a previous run of this cell; otherwise a
# re-run would produce PRED_NUM_DAYS_ICU_x / _y and break the feature selection.
patients = pd.merge(patients.drop(columns = [icu_column], errors = 'ignore'),
                    days_in_icu,
                    how = 'left',
                    on = 'Id')

In [60]:
import pickle

# Load the saved model bundle from the training directory.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files that come from a trusted source.
filename='../train/rfc.model'
# The context manager closes the file handle, which the bare open() call leaked.
with open(filename, 'rb') as model_file:
    models = pickle.load(model_file)


In [61]:
# Unpack the loaded bundle: three encoders (applied later to MARITAL, GENDER
# and RACE respectively) and the classifier used for the prediction.
le_m, le_s, le_r = models['Encoders']
rf = models['Classifier']

In [71]:
# Encode each categorical column with its encoder from the loaded model bundle;
# the encoded copy is stored next to the original with an '_L' suffix.
for source_column, encoder in (('MARITAL', le_m), ('RACE', le_r), ('GENDER', le_s)):
    patients[source_column + '_L'] = encoder.transform(patients[source_column])

# Feature columns, in the order passed to the classifier.
feature_columns = [
    'HEALTHCARE_EXPENSES',
    'HEALTHCARE_COVERAGE',
    'AGE',
    'RACE_L',
    'COVID_STAT',
    'PRED_NUM_DAYS_ICU',
    'HYPERTENSION',
    'GENDER_L',
    'MARITAL_L',
    'OBESITY',
]
X = patients[feature_columns]

# Keep the second probability column — presumably the positive (ventilation)
# class; verify against rf.classes_.
class_probabilities = rf.predict_proba(X)
patients['VENT'] = class_probabilities[:, 1]

In [82]:
# Write the ventilation probability for every patient id.

# `patient_id` is not defined in any visible cell, so a fresh kernel would fail
# with NameError here. Fall back to the Ids of the unfiltered patients table.
# NOTE(review): assumes patient_id was meant to be a frame holding the 'Id' of
# all patients — confirm against the cell that originally created it.
if 'patient_id' not in globals():
    patient_id = test.patients[['Id']].compute()

pred_out = patients[['Id','VENT']]
# The right join keeps every Id in patient_id; Ids without a prediction get 0.
pred_out = pd.merge(pred_out,patient_id,
                   how = 'right',
                   on = 'Id').fillna(0)
out = pred_out.set_index('Id')
# The file is written with no header row and no index column.
pred_out.to_csv('../ventilation_status.csv',header=False,index = False)