In [60]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from itertools import combinations
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
import pickle

In [107]:
# Open every raw training table as a dask dataframe.
ddf_patients = dd.read_csv('../data/raw/train/patients.csv')
ddf_procedures = dd.read_csv('../data/raw/train/procedures.csv')
ddf_conditions = dd.read_csv('../data/raw/train/conditions.csv')
ddf_encounters = dd.read_csv('../data/raw/train/encounters.csv')
ddf_careplans = dd.read_csv('../data/raw/train/careplans.csv')
# Observations are read as strings; VALUE is filtered for numeric-looking entries further down.
ddf_observations = dd.read_csv('../data/raw/train/observations.csv', dtype=str)
# This used to re-read observations.csv (copy-paste slip); allergies have their own file.
# The variable name keeps its original spelling so existing references still resolve.
ddf_alergies = dd.read_csv('../data/raw/train/allergies.csv')

In [108]:
# Patients with at least one condition row carrying code 840539006
# (presumably the COVID-19 diagnosis code, going by the variable name).
covid_patients_ids = ddf_conditions[ddf_conditions.CODE == 840539006].PATIENT.unique().compute()

# Cohort patients with an encounter coded 305351004 (treated as an ICU stay here).
# De-duplicated so that len(icu_ids) counts patients rather than encounters;
# the isin() lookup below is unaffected by the de-duplication.
icu_ids = ddf_encounters[
    (ddf_encounters.CODE == 305351004) &
    (ddf_encounters.PATIENT.isin(covid_patients_ids))].PATIENT.unique().compute()

# Binary label per cohort patient: 1 if the patient appears in icu_ids, else 0.
icu_target = pd.DataFrame(
    covid_patients_ids.isin(icu_ids).values.astype(int),
    index=covid_patients_ids,
    columns=['Target'],
)
icu_target

Unnamed: 0_level_0,Target
PATIENT,Unnamed: 1_level_1
afa2680f-7f73-46d9-b0cd-2cf3db49724b,0
26e5d262-6cdc-4274-a5cd-d7fd439e35f9,0
90031c21-e8a4-416e-b60b-23e871ee62dc,0
0ac2e031-ab88-41f9-ac5d-d7d588952222,0
45138fb4-dc79-4aec-ba78-7a012d3edca7,0
...,...
43d96255-e3ab-43af-97e6-494681105115,0
12f7b700-0f7a-4e09-8adf-7d7f9c0c2fb0,0
8722685f-ce37-4520-ab95-22b6929526c2,1
62e3e406-8375-4cf5-aae7-0d489e40e73c,0


In [109]:
def numeric_like(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Parameters
    ----------
    x : object
        Any value; typically a string read from a CSV column.

    Returns
    -------
    bool
        True if ``x`` can be converted with ``float``; False if the conversion
        raises ``TypeError``, ``ValueError`` or ``OverflowError``.
    """
    try:
        float(x)
    # Only the errors float() raises for unconvertible input are treated as
    # "not numeric"; KeyboardInterrupt/SystemExit are no longer swallowed.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Create Condition Features

In [110]:
# Condition codes that are turned into patient-level features.
conditions_subset = [65710008, 69896004, 195967001]

# Condition rows restricted to the COVID cohort.
covid_patient_conditions = ddf_conditions[ddf_conditions.PATIENT.isin(covid_patients_ids)]

# code -> unique cohort patients that have a condition row with that code.
condition_dictionary = {
    code: covid_patient_conditions[covid_patient_conditions['CODE'] == code]['PATIENT'].unique().compute()
    for code in conditions_subset
}

# code -> description, built from the full conditions table (one entry per code).
condition_codes = ddf_conditions['CODE'].values.compute()
condition_labels = ddf_conditions['DESCRIPTION'].values.compute()
condition_description = pd.DataFrame(condition_labels, index=condition_codes).to_dict()[0]

In [140]:
# Spot-check the code -> description lookup with one of the codes from conditions_subset.
condition_description[195967001]

'Asthma'

In [111]:
def create_feature_dictionary(ddf, patient_subset, column, value_subset):
    """Map each value in ``value_subset`` to the patients that have it.

    Parameters
    ----------
    ddf : dask dataframe
        Table with a ``PATIENT`` column and the column named by ``column``.
    patient_subset : list-like
        Patient ids to restrict the table to.
    column : str
        Column whose entries are compared against each value.
    value_subset : iterable
        Values to build an entry for.

    Returns
    -------
    dict
        ``{value: unique PATIENT ids (computed) whose `column` equals value}``.
    """
    patient_feature = ddf[ddf.PATIENT.isin(patient_subset)]
    # The key must be the loop variable; the previous key `code` was an
    # undefined name inside this function.
    return {
        value: patient_feature[patient_feature[column] == value]['PATIENT'].unique().compute()
        for value in value_subset
    }

def create_binary_feature(ddf, patient_feature_dictionary, value):
    """Build a one-column 0/1 frame flagging which patients of ``ddf`` have ``value``.

    Parameters
    ----------
    ddf : dask dataframe
        Table with a ``PATIENT`` column; supplies the patients flagged 0.
    patient_feature_dictionary : dict
        Maps a value to the patients that have it (entries must expose ``.values``).
    value : hashable
        Key into ``patient_feature_dictionary`` and into the notebook-level
        ``condition_description`` lookup that provides the column label.

    Returns
    -------
    pandas.DataFrame
        Indexed by patient id, with 1 for patients listed under ``value`` and 0
        for every other patient found in ``ddf``.

    Raises
    ------
    KeyError
        If ``value`` is missing from either lookup.
    """
    positives = patient_feature_dictionary[value]
    negatives = ddf[~ddf.PATIENT.isin(positives.values)].PATIENT.unique().compute()
    flagged = pd.DataFrame(np.ones(len(positives)).astype(int), index=positives)
    unflagged = pd.DataFrame(np.zeros(len(negatives)).astype(int), index=negatives)
    feature = pd.concat([flagged, unflagged])
    # Label by `value`; the previous lookup used the undefined name `condition`.
    feature.columns = [condition_description[value]]
    return feature

In [112]:
# NOTE(review): create_condition_feature_table is not defined in any visible cell;
# it has to be restored (or imported) for this notebook to run on a fresh kernel.
df_features = create_condition_feature_table(conditions_subset)

# Pairwise product ("comorbidity") columns over the leading feature columns.
# The pair range is fixed before the loop starts adding columns: with fewer than
# 8 base columns the old range(8) indexed into the comorbidity columns the loop
# had just appended, producing products of products. With 8 or more base columns
# the result is the same as before.
n_base_columns = min(8, df_features.shape[1])
for x in combinations(range(n_base_columns), 2):
    df_features[f'comorbidity_{x}'] = df_features.iloc[:, x[0]]*df_features.iloc[:, x[1]]

# Create Procedure Features

In [113]:
# Materialise only the PATIENT and CODE columns of the procedures table.
df = ddf_procedures[['PATIENT', 'CODE']].compute()

In [114]:
# Procedure codes that occur more than 1000 times in the procedures table.
code_counts = df['CODE'].value_counts()
codes = code_counts[code_counts > 1000].index

In [115]:
# Constant marker column; it supplies the cell values when the frame is pivoted.
df['Value'] = 1

In [116]:
# One row per patient, one column per frequent procedure code. The mean of the
# constant marker is 1 where the patient had the procedure; missing cells become 0.
frequent_procedures = df[df['CODE'].isin(codes)]
df = frequent_procedures.pivot_table(
    index='PATIENT',
    columns='CODE',
    values='Value',
    aggfunc='mean',
).fillna(0)

In [117]:
# Keep only the rows whose patient id belongs to the COVID cohort.
in_cohort = df.index.isin(covid_patients_ids)
df = df[in_cohort]

In [118]:
# Index-on-index inner join of the procedure columns with the features built so far.
# NOTE(review): an inner join drops patients missing from either side - confirm that is intended.
df_features = df.merge(df_features, left_index=True, right_index=True)

# Create Observation Features

In [119]:
# Observation codes that become numeric features.
feature_codes = ['14804-9', '1960-4', '1988-5', '19994-3', '2019-8', '2157-6',
       '2160-0', '2276-4', '2703-7', '2708-6', '2744-1', '2885-2',
       '33959-8', '4548-4', '48065-7', '5902-2', '6301-6', '704-7',
       '711-2', '731-0', '8310-5', '89579-7']

# Restrict to cohort patients and the selected codes while the data is still a
# dask frame, so only the relevant rows are materialised in memory.
observations_in_scope = ddf_observations[
    ddf_observations['PATIENT'].isin(covid_patients_ids)
    & ddf_observations['CODE'].isin(feature_codes)
]
df = observations_in_scope[['PATIENT', 'CODE', 'VALUE']].compute()

# Keep rows whose VALUE converts with float(), then convert. assign() returns a
# new frame, avoiding the SettingWithCopyWarning triggered by writing a column
# into a filtered slice.
df = df[df['VALUE'].map(numeric_like)]
df = df.assign(VALUE=df['VALUE'].astype(float))

# Mean value per patient and code; combinations without a measurement become 0.
df = df.pivot_table(index='PATIENT', columns='CODE', values='VALUE', aggfunc='mean').fillna(0)

In [120]:
# Index-on-index inner join of the observation columns with the features built so far.
# NOTE(review): patients without any of the selected observations are dropped here - confirm.
df_features = df.merge(df_features, left_index=True, right_index=True)

# Create Demographic Features

In [121]:
# Reference instant for the age calculation: the current time, as before.
# NOTE(review): ages therefore depend on when the notebook is run; consider
# pinning a fixed reference date so the feature is reproducible.
age_reference = pd.Timestamp(np.datetime64('NOW'))

# A single pass over the patients table keeps ids and birth dates row-aligned
# (previously the table was computed twice and the results paired by position).
patients_birth = ddf_patients[['Id', 'BIRTHDATE']].compute()
birthdates = pd.to_datetime(patients_birth['BIRTHDATE'])

# Whole elapsed years as floats, using numpy's mean year length
# (365.2425 days = 31,556,952 s). This replaces the casts to 'M8[D]' and
# 'timedelta64[Y]', which recent pandas versions reject.
SECONDS_PER_YEAR = 31_556_952
age_years = (age_reference - birthdates).dt.total_seconds() // SECONDS_PER_YEAR

df_age = pd.DataFrame(age_years.values, index=patients_birth['Id'])
df_age.columns = ['AGE']

In [122]:
# Id, GENDER and HEALTHCARE_COVERAGE for the patients in the COVID cohort.
df = ddf_patients[ddf_patients['Id'].isin(covid_patients_ids)][['Id', 'GENDER', 'HEALTHCARE_COVERAGE']].compute()

In [123]:
# Encode gender as 1 for 'M' and 0 for anything else.
df['GENDER'] = df['GENDER'].apply(lambda gender: 1 if gender == 'M' else 0)
# Index the frame by patient id; the 'Id' column itself is kept.
df = df.set_index('Id', drop=False)

In [124]:
# Every column after the first (the 'Id' column) joins the feature table on the index.
demographic_columns = df.iloc[:, 1:]
df_features = demographic_columns.merge(df_features, left_index=True, right_index=True)

In [125]:
# Add AGE as the leading column via an index-on-index inner join.
df_features = df_age.merge(df_features, left_index=True, right_index=True)

In [149]:
# Outer join keeps every patient present in either the features or the target;
# any cell left empty by the join is filled with 0.
# NOTE(review): a patient absent from df_features gets 0 for every feature, AGE included - confirm this is intended.
df_training_table = df_features.merge(icu_target, how='outer', left_index=True, right_index=True)
df_training_table = df_training_table.fillna(0)

In [151]:
# Columns kept for modelling; 'Target' stays last because the modelling cell
# splits features from the label by position.
model_columns = [
    'AGE',
    'HEALTHCARE_COVERAGE',
    '2160-0',
    '2708-6',
    '2885-2',
    '4548-4',
    '8310-5',
    'Acute respiratory failure (disorder)',
    'Target',
]
df_training_table = df_training_table[model_columns]

# Modelling

In [174]:
# The last column is the label; everything before it is a feature.
X = df_training_table.iloc[:, :-1].values
y = df_training_table.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

# random_state pins the tree: scikit-learn permutes candidate features at every
# split, so an unseeded fit can differ between runs.
reg = DecisionTreeClassifier(
    min_samples_split=2,
    min_samples_leaf=10,
    max_depth=5,
    random_state=13,
).fit(X_train, y_train)

# Predict once and pass (y_true, y_pred) in the order the metric functions document.
y_pred = reg.predict(X_test)
print(reg.score(X_test, y_test))
print(f1_score(y_test, y_pred))
print(matthews_corrcoef(y_test, y_pred))

0.9515603799185889
0.6063947078280044
0.5870232407622129


In [175]:
# Persist the fitted classifier. The context manager closes the file handle,
# which the bare open() call left open.
filename = 'finalized_icu_classification_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(reg, model_file)

4981

In [176]:
# Number of entries in icu_ids (assumes one entry per ICU patient - TODO confirm).
len(icu_ids)

4981

In [178]:
# Ratio of ICU entries to COVID patients; this is the ICU rate only if icu_ids
# holds no repeated patients - TODO confirm.
len(icu_ids)/len(covid_patients_ids)

0.06758755444590689