In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter # To adjust the ticks formatting
import warnings
warnings.filterwarnings('ignore') # To ignore warning messages
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import random

# Adding filters to dataset

### Full patient data

In [2]:
# Load the cleaned full patient dataset into `df`.
# NOTE(review): the frames displayed further down show an 'Unnamed: 0' column,
# which suggests this CSV was saved together with its row index — confirm
# whether that column is wanted before relying on it.
df = pd.read_csv('fulldata_clean.csv')

### Covid data

In [3]:
# Keep only the rows whose Covid_or_Not label marks a Covid-19 carrier,
# then display the resulting subset.
is_covid_carrier = df['Covid_or_Not'].eq("A Covid 19 Carrier")
df_covid = df.loc[is_covid_carrier]
df_covid

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DEAD,Covid_or_Not
0,2,1,1,1,2.0,1.0,65.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
2,2,1,2,2,1.0,2.0,55.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
4,2,1,2,1,2.0,2.0,68.0,2.0,1.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
5,2,1,1,2,2.0,1.0,40.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
6,2,1,1,1,2.0,2.0,64.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018888,1,13,1,2,2.0,2.0,77.0,2.0,1.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018889,1,13,1,1,2.0,2.0,55.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018890,1,13,2,1,2.0,2.0,70.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018891,2,13,2,1,2.0,2.0,32.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier


In [4]:
# Save the Covid-only subset; index=False keeps the row index out of the file.
df_covid.to_csv('coviddata_clean.csv',index = False)

### Covid data with icu

In [5]:
# Restrict the Covid subset to rows where the ICU column equals 1.
df_covid_icu = df_covid.loc[df_covid['ICU'].eq(1)]

In [6]:
# Display the ICU subset (the cell's last expression is rendered as output).
df_covid_icu

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DEAD,Covid_or_Not
16,2,1,1,2,2.0,1.0,80.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
20,2,1,2,2,2.0,1.0,59.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,3,1.0,0,A Covid 19 Carrier
41,2,1,2,2,1.0,1.0,45.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
319,1,3,2,2,1.0,1.0,90.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,1,A Covid 19 Carrier
320,1,3,2,2,1.0,1.0,61.0,2.0,2.0,2.0,...,1.0,2.0,2.0,1.0,2.0,2.0,3,1.0,1,A Covid 19 Carrier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018691,1,13,2,2,1.0,1.0,55.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018693,1,13,2,2,2.0,2.0,56.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018696,1,13,2,2,2.0,1.0,68.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018706,1,13,2,2,1.0,2.0,35.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier


In [7]:
# Save the Covid ICU subset; index=False keeps the row index out of the file.
df_covid_icu.to_csv('coviddata_icu_clean.csv',index = False)

# Data Split

# Split the converted hospitalized dataset into train and test sets

In [4]:
# Load the converted hospitalized dataset.
# Note: this rebinds `df`, which earlier in the notebook held the contents of
# 'fulldata_clean.csv'; every cell below works on the hospitalized data.
df = pd.read_csv('Hospitalized Data/hospitalized_converted.csv')

In [12]:
# Remove the DEAD column so it is not part of the data written and split below.
df = df.drop(columns=['DEAD'])

In [14]:
# Recode ICU into a binary label: a value of 2 becomes 0, every other value
# (including missing values) becomes 1.
# Caution: this cell is not idempotent — after one run no value equals 2 any
# more, so running it a second time turns every label into 1.
df['ICU'] = df['ICU'].ne(2).astype('int64')

In [17]:
# Save the current frame without its row index.
# NOTE(review): the data was read from 'Hospitalized Data/hospitalized_converted.csv'
# but is written to 'hospitalized_converted.csv' in the working directory —
# confirm which copy downstream code is expected to read.
df.to_csv('hospitalized_converted.csv',index = False)

In [18]:
# Separate the ICU target from the remaining feature columns.
Y = df['ICU']
X = df.drop(columns=['ICU'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

# Write each split to its own CSV file, without the row index.
for split, path in (
    (X_train, 'hospitalized_coverted_xtrain.csv'),
    (X_test, 'hospitalized_coverted_xtest.csv'),
    (Y_train, 'hospitalized_coverted_ytrain.csv'),
    (Y_test, 'hospitalized_coverted_ytest.csv'),
):
    split.to_csv(path, index=False)

### Split the Covid ICU data into four data shards (A–D)

In [34]:
# Separate the ICU target from the remaining feature columns.
Y = df['ICU']
X = df.drop(columns=['ICU'])

# Halve the data once, then halve each half again, giving four shards A-D.
# Note: X_train / X_test / Y_train / Y_test are reassigned here and now hold
# the two halves rather than a 67/33 split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.5, random_state=42
)
A_X, B_X, A_Y, B_Y = train_test_split(
    X_train, Y_train, test_size=0.5, random_state=42
)
C_X, D_X, C_Y, D_Y = train_test_split(
    X_test, Y_test, test_size=0.5, random_state=42
)

In [35]:
def reset_index(dataframe):
    """Return a copy of ``dataframe`` with a fresh 0..n-1 row index.

    The old index is discarded rather than moved into a column, so the
    result has exactly the same columns as the input.

    Parameters
    ----------
    dataframe : pandas.DataFrame or pandas.Series
        Data whose row index should be renumbered. A Series is converted to
        a single-column DataFrame first, so the result is always a DataFrame.

    Returns
    -------
    pandas.DataFrame
        The same data with a default integer index.
    """
    if isinstance(dataframe, pd.Series):
        dataframe = dataframe.to_frame()
    # drop=True discards the old index directly. Materialising it as an
    # 'index' column and dropping that column afterwards raises KeyError when
    # the index has a name, and removes a real data column called 'index'.
    return dataframe.reset_index(drop=True)

In [36]:
# Renumber the rows of every shard so that features and labels line up
# positionally when they are concatenated later.
A_X, B_X, C_X, D_X = [reset_index(features) for features in (A_X, B_X, C_X, D_X)]
A_Y, B_Y, C_Y, D_Y = [reset_index(labels) for labels in (A_Y, B_Y, C_Y, D_Y)]

In [40]:
# Join each shard's features and labels side by side into a single frame.
shardA, shardB, shardC, shardD = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((A_X, A_Y), (B_X, B_Y), (C_X, C_Y), (D_X, D_Y))
)

In [41]:
# "Partially converted" shards: because some deaths occur outside the
# hospitalized group, only the labels of records that are both dead and
# hospitalized were converted.
for shard, path in (
    (shardA, 'partialcon_shardA.csv'),
    (shardB, 'partialcon_shardB.csv'),
    (shardC, 'partialcon_shardC.csv'),
    (shardD, 'partialcon_shardD.csv'),
):
    # index=False keeps the row index out of the saved file.
    shard.to_csv(path, index=False)

In [44]:
def _split_shard(features, labels):
    """Split one shard 67/33 into train and test parts with a fixed seed."""
    return train_test_split(features, labels, test_size=0.33, random_state=42)


# Every shard gets its own independent train/test split.
A_X_train, A_X_test, A_Y_train, A_Y_test = _split_shard(A_X, A_Y)
B_X_train, B_X_test, B_Y_train, B_Y_test = _split_shard(B_X, B_Y)
C_X_train, C_X_test, C_Y_train, C_Y_test = _split_shard(C_X, C_Y)
D_X_train, D_X_test, D_Y_train, D_Y_test = _split_shard(D_X, D_Y)

In [49]:
# Save shard A's train/test splits, one file per split.
# Each file is written from its matching variable; writing A_X_train to all
# four paths would put training features into the test and label files.
A_X_train.to_csv('partialcon_A_X_train.csv', index = False)
A_X_test.to_csv('partialcon_A_X_test.csv', index = False)
A_Y_train.to_csv('partialcon_A_Y_train.csv', index = False)
A_Y_test.to_csv('partialcon_A_Y_test.csv', index = False)

In [50]:
# Save shard B's train/test splits, one file per split.
# Each file is written from its matching variable; writing B_X_train to all
# four paths would put training features into the test and label files.
B_X_train.to_csv('partialcon_B_X_train.csv', index = False)
B_X_test.to_csv('partialcon_B_X_test.csv', index = False)
B_Y_train.to_csv('partialcon_B_Y_train.csv', index = False)
B_Y_test.to_csv('partialcon_B_Y_test.csv', index = False)

In [51]:
# Save shard C's train/test splits, one file per split.
# Each file is written from its matching variable; writing C_X_train to all
# four paths would put training features into the test and label files.
C_X_train.to_csv('partialcon_C_X_train.csv', index = False)
C_X_test.to_csv('partialcon_C_X_test.csv', index = False)
C_Y_train.to_csv('partialcon_C_Y_train.csv', index = False)
C_Y_test.to_csv('partialcon_C_Y_test.csv', index = False)

In [52]:
# Save shard D's train/test splits, one file per split.
# Each file is written from its matching variable; writing D_X_train to all
# four paths would put training features into the test and label files.
D_X_train.to_csv('partialcon_D_X_train.csv', index = False)
D_X_test.to_csv('partialcon_D_X_test.csv', index = False)
D_Y_train.to_csv('partialcon_D_Y_train.csv', index = False)
D_Y_test.to_csv('partialcon_D_Y_test.csv', index = False)