In [304]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from itertools import combinations
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import RandomizedSearchCV
import pickle

In [524]:
ddf_patients = dd.read_csv('../data/raw/train/patients.csv')
ddf_procedures = dd.read_csv('../data/raw/train/procedures.csv')
ddf_conditions = dd.read_csv('../data/raw/train/conditions.csv')
ddf_encounters = dd.read_csv('../data/raw/train/encounters.csv')
ddf_careplans = dd.read_csv('../data/raw/train/careplans.csv')
ddf_observations = dd.read_csv('../data/raw/train/observations.csv', dtype=str)
ddf_alergies = dd.read_csv('../data/raw/train/observations.csv')

In [525]:
covid_patients_ids = ddf_conditions[ddf_conditions.CODE == 840539006].PATIENT.unique().compute()

icu_ids = ddf_encounters[
    (ddf_encounters.CODE == 305351004) & 
    (ddf_encounters.PATIENT.isin(covid_patients_ids))].PATIENT.compute()

ddf_icu_encounters = ddf_encounters[
    (ddf_encounters.CODE == 305351004) & 
    (ddf_encounters.PATIENT.isin(covid_patients_ids))]

days_in_icu = ((ddf_icu_encounters['STOP'].astype('M8[s]')\
                - ddf_icu_encounters['START'].astype('M8[s]')
               ).astype('timedelta64[s]').astype(float)/86400).compute()

patient_list = ddf_icu_encounters.PATIENT.values.compute()

icu_target = pd.DataFrame(days_in_icu.values, columns=['DAYS_IN_ICU'], index=patient_list)

icu_target

Unnamed: 0,DAYS_IN_ICU
c58ae02e-235e-4db2-a9c3-6926e9c0c5fa,7.128472
52e1f151-e98b-4163-a250-5e7d5c6ff813,9.399306
534e1ef3-b3c7-42da-a022-fea54d6c828f,7.151389
5158d043-8c93-4987-8fcb-5f94e7e32fa1,5.811806
b5499d4a-6660-4b09-bfc0-e3d33e3dcb5e,7.941667
...,...
9c92df1f-e325-43c2-9bc0-1380eeaf3958,6.694444
6cda9ea0-5264-4465-9a9c-8315994a488f,5.938889
9529dc13-c7f1-4f76-b903-4e310b7805c8,4.180556
65d3785a-6150-4b86-8842-c40ca654e3c4,9.905556


In [526]:
def numeric_like(x):
    try:
        float(x)
        return True
    except:
        return False

# Create Condition Features

In [527]:
conditions_subset = [65710008, 69896004, 195967001]
covid_patient_conditions = ddf_conditions[ddf_conditions.PATIENT.isin(icu_ids)]
condition_dictionary = {code: covid_patient_conditions[covid_patient_conditions['CODE']==code]['PATIENT'].unique().compute() for code in conditions_subset}
condition_description = pd.DataFrame(ddf_conditions['DESCRIPTION'].values.compute(), index=ddf_conditions['CODE'].values.compute()).to_dict()[0]

In [528]:
def create_feature_dictionary(ddf, patient_subset, column, value_subset):
    patient_feature = ddf[ddf.PATIENT.isin(patient_subset)]
    patient_feature_dictionary = {code: patient_feature[patient_feature[column]==value]['PATIENT'].unique().compute() for value in value_subset}
    return patient_feature_dictionary

def create_binary_feature(ddf, patient_feature_dictionary, value):
    temp1 = patient_feature_dictionary[value]
    temp2 = ddf[~ddf.PATIENT.isin(patient_feature_dictionary[value].values)].PATIENT.unique().compute()
    df1 = pd.DataFrame(np.ones(len(temp1)).astype(int), index=temp1)
    df2 = pd.DataFrame(np.zeros(len(temp2)).astype(int), index=temp2)
    df = pd.concat([df1, df2])
    df.columns = [condition_description[condition]]
    return df

In [529]:
df_features = create_condition_feature_table(conditions_subset)
for x in combinations(range(8), 2):
    df_features[f'comorbidity_{x}'] = df_features.iloc[:, x[0]]*df_features.iloc[:, x[1]]

# Create Procedure Features

In [530]:
df = ddf_procedures[['PATIENT', 'CODE']].compute()

In [531]:
codes = df['CODE'].value_counts()[df['CODE'].value_counts()>1000].index

In [532]:
df['Value'] = 1

In [533]:
df = df[df['CODE'].isin(codes)].pivot_table(index='PATIENT', columns='CODE', values='Value', aggfunc='mean').fillna(0)

In [534]:
df = df[df.index.isin(icu_ids)]

In [535]:
df_features = pd.merge(df, df_features, left_index=True, right_index=True)

# Create Observation Features

In [536]:
df = ddf_observations[['PATIENT', 'CODE', 'VALUE']].compute()
df = df[df['PATIENT'].isin(icu_ids)]
df = df[df['VALUE'].map(numeric_like)]
feature_codes = ['14804-9', '1960-4', '1988-5', '19994-3', '2019-8', '2157-6',
       '2160-0', '2276-4', '2703-7', '2708-6', '2744-1', '2885-2',
       '33959-8', '4548-4', '48065-7', '5902-2', '6301-6', '704-7',
       '711-2', '731-0', '8310-5', '89579-7']
df = df[df['CODE'].isin(feature_codes)]
df['VALUE'] = df['VALUE'].astype(float)
df = df.pivot_table(index='PATIENT', columns='CODE', values='VALUE', aggfunc='mean').fillna(0)

In [537]:
df_features = pd.merge(df, df_features, left_index=True, right_index=True)

# Create Demographic Features

In [538]:
temp = (np.datetime64('NOW') - ddf_patients['BIRTHDATE'].astype('M8[D]').compute()).astype('timedelta64[Y]').astype(float)
df_age = pd.DataFrame(temp.values, index = ddf_patients.compute().Id)
df_age.columns = ['AGE']

In [539]:
df = ddf_patients[ddf_patients['Id'].isin(icu_ids)][['Id', 'GENDER', 'HEALTHCARE_COVERAGE']].compute()

In [540]:
df['GENDER'] = df['GENDER'].map(lambda x: int(x=='M'))
df.index = df['Id']

In [541]:
df

Unnamed: 0_level_0,Id,GENDER,HEALTHCARE_COVERAGE
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c58ae02e-235e-4db2-a9c3-6926e9c0c5fa,c58ae02e-235e-4db2-a9c3-6926e9c0c5fa,1,8303.87
52e1f151-e98b-4163-a250-5e7d5c6ff813,52e1f151-e98b-4163-a250-5e7d5c6ff813,0,14480.93
534e1ef3-b3c7-42da-a022-fea54d6c828f,534e1ef3-b3c7-42da-a022-fea54d6c828f,1,15017.50
5158d043-8c93-4987-8fcb-5f94e7e32fa1,5158d043-8c93-4987-8fcb-5f94e7e32fa1,1,0.00
b5499d4a-6660-4b09-bfc0-e3d33e3dcb5e,b5499d4a-6660-4b09-bfc0-e3d33e3dcb5e,0,161162.15
...,...,...,...
9c92df1f-e325-43c2-9bc0-1380eeaf3958,9c92df1f-e325-43c2-9bc0-1380eeaf3958,1,150692.07
6cda9ea0-5264-4465-9a9c-8315994a488f,6cda9ea0-5264-4465-9a9c-8315994a488f,0,120989.97
9529dc13-c7f1-4f76-b903-4e310b7805c8,9529dc13-c7f1-4f76-b903-4e310b7805c8,0,20740.26
65d3785a-6150-4b86-8842-c40ca654e3c4,65d3785a-6150-4b86-8842-c40ca654e3c4,0,66791.97


In [542]:
df_features = pd.merge(df.iloc[:, 1:], df_features, left_index=True, right_index=True)

In [544]:
df_features = pd.merge(df_age, df_features, left_index=True, right_index=True)[['AGE', 'HEALTHCARE_COVERAGE', '2160-0', '2708-6', '2885-2', '4548-4', '8310-5', 'Acute respiratory failure (disorder)']]

# Modelling

In [545]:
df_training_table = pd.merge(df_features, icu_target, how='outer', left_index=True, right_index=True).fillna(0)

In [546]:
X = df_training_table.iloc[:, :-1].values
y = df_training_table.iloc[:, -1].values

In [549]:
fs = SelectKBest(score_func=mutual_info_regression, k='all')

In [550]:
fs.fit_transform(X, y)

array([[7.90000000e+01, 1.43759500e+04, 2.74545455e+00, ...,
        0.00000000e+00, 4.02083333e+01, 1.00000000e+00],
       [6.30000000e+01, 6.42928000e+03, 2.69285714e+00, ...,
        0.00000000e+00, 3.98600000e+01, 1.00000000e+00],
       [6.80000000e+01, 1.23318200e+04, 2.82000000e+00, ...,
        6.30000000e+00, 4.08562500e+01, 1.00000000e+00],
       ...,
       [6.00000000e+01, 4.99112000e+03, 2.74285714e+00, ...,
        5.90000000e+00, 3.97133333e+01, 1.00000000e+00],
       [8.30000000e+01, 1.13035150e+05, 2.72222222e+00, ...,
        4.60000000e+00, 3.95583333e+01, 1.00000000e+00],
       [6.20000000e+01, 5.76112000e+03, 2.73750000e+00, ...,
        0.00000000e+00, 3.93823529e+01, 1.00000000e+00]])

In [551]:
features = np.where(fs.scores_>0.05)[0]

In [552]:
features

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [553]:
def param_Search_CV(X_train, y_train):
    estimator = DecisionTreeRegressor()
    param_grid = { 
            "max_features" : [10, 20, 30, 40],
            "max_depth"     : [3, 4, 6, None],
           'min_samples_leaf':[5, 8, 12, 16, 20],
            'criterion': ['mse', 'friedman_mse', 'mae'],
           'min_samples_split': [8, 15, 25, 50, 75, 100, 150],
            }

    grid = RandomizedSearchCV(estimator, param_grid, n_jobs=-1, cv=5)
    grid.fit(X_train, y_train)
    return grid.best_score_ , grid.best_params_

In [554]:
param_Search_CV(X_train, y_train)

(0.5539656154720897,
 {'min_samples_split': 75,
  'min_samples_leaf': 12,
  'max_features': 10,
  'max_depth': 6,
  'criterion': 'mse'})

(0.5614889885193556,

 {'min_samples_split': 100,
 
  'min_samples_leaf': 12,
  
  'max_features': 40,
  
  'max_depth': 6,
  
  'criterion': 'mse'})

In [557]:
X = df_training_table.iloc[:, :-1].values
y = df_training_table.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

reg = LinearRegression().fit(X_train, y_train)
print(reg.score(X_test, y_test))
reg1 = Lasso(alpha=0.01).fit(X_train, y_train)
print(reg1.score(X_test, y_test))
reg2 = Ridge(alpha=10).fit(X_train, y_train)
print(reg2.score(X_test, y_test))
reg3 = RandomForestRegressor(n_estimators=100, max_depth=10, max_samples=100).fit(X_train, y_train)
print(reg3.score(X_test, y_test))
reg4 = DecisionTreeRegressor(min_samples_split=75, min_samples_leaf=12,
                             max_depth=6, criterion='mse').fit(X_train, y_train)
print(reg4.score(X_test, y_test))
from sklearn.metrics import mean_squared_error
print(mean_squared_error((reg1.predict(X_test) + reg2.predict(X_test) + reg3.predict(X_test))/3, y_test))
print(mean_squared_error(reg1.predict(X_test), y_test))
print(mean_squared_error(reg2.predict(X_test), y_test))
print(mean_squared_error(reg3.predict(X_test), y_test))
print(mean_squared_error(reg4.predict(X_test), y_test))

0.13504715236998543
0.1349797959661767
0.1351584818666045
0.17853469620850346
0.21415091504912254
6.170587366836353
6.345257263349153
6.343946533261856
6.02576524937272
5.764506529359477


# Export GraphVis representation

In [485]:
obs_feature_codes = df_training_table.columns[features][2:24].values

In [486]:
obs_features = ddf_observations[ddf_observations['CODE'].isin(obs_feature_codes)][['CODE', 'DESCRIPTION']].drop_duplicates().compute()

In [487]:
obs_features = df_training_table.columns[features][2:24].map(obs_features.groupby('CODE').agg('first').to_dict()['DESCRIPTION'])

In [488]:
condition_features = df_training_table.columns[features][24:].values

In [489]:
demographic_features = df_training_table.columns[features][:2].values

In [490]:
feature_labels = np.concatenate([demographic_features, obs_features, condition_features])

In [491]:
from sklearn.tree import export_graphviz
export_graphviz(
    reg4,
    out_file =  "tree.dot",
    feature_names = feature_labels,
    filled = True,
    rounded = True)

# Create Predictions for Test Set

In [590]:
loaded_icu_classification_model = pickle.load(open('finalized_icu_classification_model.sav', 'rb'))

In [559]:
ddf_patients = dd.read_csv('../data/raw/test/patients.csv')
ddf_procedures = dd.read_csv('../data/raw/test/procedures.csv')
ddf_conditions = dd.read_csv('../data/raw/test/conditions.csv')
ddf_encounters = dd.read_csv('../data/raw/test/encounters.csv')
ddf_careplans = dd.read_csv('../data/raw/test/careplans.csv')
ddf_observations = dd.read_csv('../data/raw/test/observations.csv', dtype=str)
ddf_alergies = dd.read_csv('../data/raw/test/observations.csv')

In [649]:
covid_predictions = pd.read_csv('../data/processed/COVID_19 Status submission.csv')
len(covid_predictions)

29492

In [674]:
covid_patients_ids = covid_predictions[covid_predictions['predicted_proba'] > 0.825]['Id'].values

# Create Condition Features

In [656]:
conditions_subset = [65710008, 69896004, 195967001]
covid_patient_conditions = ddf_conditions[ddf_conditions.PATIENT.isin(covid_patients_ids)]
condition_dictionary = {code: covid_patient_conditions[covid_patient_conditions['CODE']==code]['PATIENT'].unique().compute() for code in conditions_subset}
condition_description = pd.DataFrame(ddf_conditions['DESCRIPTION'].values.compute(), index=ddf_conditions['CODE'].values.compute()).to_dict()[0]

In [657]:
df_features = create_condition_feature_table(conditions_subset)
for x in combinations(range(8), 2):
    df_features[f'comorbidity_{x}'] = df_features.iloc[:, x[0]]*df_features.iloc[:, x[1]]

# Create Procedure Features

In [624]:
df = ddf_procedures[['PATIENT', 'CODE']].compute()

In [625]:
codes = df['CODE'].value_counts()[df['CODE'].value_counts()>1000].index

In [626]:
df['Value'] = 1

In [627]:
df = df[df['CODE'].isin(codes)].pivot_table(index='PATIENT', columns='CODE', values='Value', aggfunc='mean').fillna(0)

In [628]:
df = df[df.index.isin(covid_patients_ids)]

In [629]:
df_features = pd.merge(df, df_features, left_index=True, right_index=True)

# Create Observation Features

In [630]:
df = ddf_observations[['PATIENT', 'CODE', 'VALUE']].compute()
df = df[df['PATIENT'].isin(covid_patients_ids)]
df = df[df['VALUE'].map(numeric_like)]
feature_codes = ['14804-9', '1960-4', '1988-5', '19994-3', '2019-8', '2157-6',
       '2160-0', '2276-4', '2703-7', '2708-6', '2744-1', '2885-2',
       '33959-8', '4548-4', '48065-7', '5902-2', '6301-6', '704-7',
       '711-2', '731-0', '8310-5', '89579-7']
df = df[df['CODE'].isin(feature_codes)]
df['VALUE'] = df['VALUE'].astype(float)
df = df.pivot_table(index='PATIENT', columns='CODE', values='VALUE', aggfunc='mean').fillna(0)

In [631]:
df_features = pd.merge(df, df_features, left_index=True, right_index=True)

# Create Demographic Features

In [632]:
temp = (np.datetime64('NOW') - ddf_patients['BIRTHDATE'].astype('M8[D]').compute()).astype('timedelta64[Y]').astype(float)
df_age = pd.DataFrame(temp.values, index = ddf_patients.compute().Id)
df_age.columns = ['AGE']

In [633]:
df = ddf_patients[ddf_patients['Id'].isin(covid_patients_ids)][['Id', 'GENDER', 'HEALTHCARE_COVERAGE']].compute()

In [634]:
df['GENDER'] = df['GENDER'].map(lambda x: int(x=='M'))
df.index = df['Id']

In [635]:
df_features = pd.merge(df.iloc[:, 1:], df_features, left_index=True, right_index=True)

In [636]:
df_features = pd.merge(df_age, df_features, left_index=True, right_index=True)

In [637]:
A = ['AGE', 'HEALTHCARE_COVERAGE', '2160-0', '2708-6', '2885-2', '4548-4', '8310-5', 'Acute respiratory failure (disorder)']

In [638]:
X = df_features[['AGE', 'HEALTHCARE_COVERAGE', '2160-0', '2708-6', '2885-2', '4548-4', '8310-5', 'Acute respiratory failure (disorder)']].values

In [646]:
loaded_icu_classification_model.predict_proba(X)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [640]:
pd.Series(reg4.predict(X)).value_counts()

3.744282    6414
5.940154       1
5.945808       1
2.582769       1
dtype: int64

# Conclusion

Our icu screening model seems to think that no one will be admitted to the ICU.  From the train set, we see that ~6% of patients with Covid are admited.  For the purpose of submission, we will take the most likely covid patients and predict that they will be admitted to the ICU and assign a total days admitted equal to 3.74, which our model seems to predict most frequently.  The reason for such bad performance on this set is due to the lack of features available for modelling in the test set.

In [680]:
test_patient_ids = pd.DataFrame(ddf_patients['Id'].compute())
test_patient_ids

Unnamed: 0,Id
0,8c6f61a6-1beb-4b51-980e-381722ca7e33
1,6732e6fc-f8f3-4b1a-924a-47f68f19526c
2,4772a573-1fb1-49ec-be86-dc2c4b176052
3,edb75aa7-7b67-48ce-9990-0095cd7a3b8d
4,e28f36cd-c8d8-4c64-94e7-91c89d1be9d1
...,...
29487,92c45fac-4d7b-4f2c-beaa-c8a3478d5874
29488,3fbae55d-c18e-4c84-88fd-56357953cfea
29489,80c8ab65-5f76-40cf-bbfa-4c42807b21a2
29490,b2eb252e-fb3d-4f45-aeb7-dcfa5fa90057


In [681]:
test_patient_ids['ICU_DAYS'] = (test_patient_ids['Id'].isin(covid_patients_ids))*3.74

In [684]:
test_patient_ids.set_index('Id', inplace=True)

In [685]:
test_patient_ids

Unnamed: 0_level_0,ICU_DAYS
Id,Unnamed: 1_level_1
8c6f61a6-1beb-4b51-980e-381722ca7e33,0.00
6732e6fc-f8f3-4b1a-924a-47f68f19526c,0.00
4772a573-1fb1-49ec-be86-dc2c4b176052,0.00
edb75aa7-7b67-48ce-9990-0095cd7a3b8d,0.00
e28f36cd-c8d8-4c64-94e7-91c89d1be9d1,0.00
...,...
92c45fac-4d7b-4f2c-beaa-c8a3478d5874,0.00
3fbae55d-c18e-4c84-88fd-56357953cfea,0.00
80c8ab65-5f76-40cf-bbfa-4c42807b21a2,0.00
b2eb252e-fb3d-4f45-aeb7-dcfa5fa90057,0.00


In [None]:
with open("out.csv", mode='w', newline='\n') as f:
    test_patient_ids.to_csv(f, sep=",", line_terminator='\n', encoding='utf-8', header=False)