In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter # To adjust the ticks formatting
import warnings
warnings.filterwarnings('ignore') # To ignore warning messages
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import random

# Adding filters to dataset

### Full patient data

In [2]:
df = pd.read_csv('fulldata_clean.csv')

### Covid data

In [3]:
df_covid = df[(df['Covid_or_Not'] == "A Covid 19 Carrier")]
df_covid

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DEAD,Covid_or_Not
0,2,1,1,1,2.0,1.0,65.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
2,2,1,2,2,1.0,2.0,55.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
4,2,1,2,1,2.0,2.0,68.0,2.0,1.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
5,2,1,1,2,2.0,1.0,40.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
6,2,1,1,1,2.0,2.0,64.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018888,1,13,1,2,2.0,2.0,77.0,2.0,1.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018889,1,13,1,1,2.0,2.0,55.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018890,1,13,2,1,2.0,2.0,70.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018891,2,13,2,1,2.0,2.0,32.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier


In [4]:
df_covid.to_csv('coviddata_clean.csv',index = False)

### Covid data with icu

In [5]:
df_covid_icu = df_covid[(df_covid['ICU'] == 1)]

In [6]:
df_covid_icu

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DEAD,Covid_or_Not
16,2,1,1,2,2.0,1.0,80.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
20,2,1,2,2,2.0,1.0,59.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,3,1.0,0,A Covid 19 Carrier
41,2,1,2,2,1.0,1.0,45.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
319,1,3,2,2,1.0,1.0,90.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,1,A Covid 19 Carrier
320,1,3,2,2,1.0,1.0,61.0,2.0,2.0,2.0,...,1.0,2.0,2.0,1.0,2.0,2.0,3,1.0,1,A Covid 19 Carrier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018691,1,13,2,2,1.0,1.0,55.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018693,1,13,2,2,2.0,2.0,56.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018696,1,13,2,2,2.0,1.0,68.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018706,1,13,2,2,1.0,2.0,35.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier


In [7]:
df_covid_icu.to_csv('coviddata_icu_clean.csv',index = False)

# Data Split

### Split full dataset for traintest

In [8]:
X = df.drop(['DEAD'],axis=1)
Y = df['DEAD']

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [10]:
Y_train.value_counts()

0    633728
1     49233
Name: DEAD, dtype: int64

In [11]:
Y_test.value_counts()

0    312408
1     23976
Name: DEAD, dtype: int64

In [12]:
X_train.to_csv('fulldata_xtrain.csv',index = False)
X_test.to_csv('fulldata_xtest.csv',index = False)
Y_train.to_csv('fulldata_ytrain.csv',index = False)
Y_test.to_csv('fulldata_ytest.csv',index = False)

### Split full dataset into data shards

In [13]:
X = df.drop(['DEAD'],axis=1)
Y = df['DEAD']

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

In [15]:
Y_train.value_counts()

0    473029
1     36643
Name: DEAD, dtype: int64

In [16]:
Y_test.value_counts()

0    473107
1     36566
Name: DEAD, dtype: int64

In [17]:
A_train,B_train,A_test,B_test = train_test_split(X_train, Y_train, test_size=0.5, random_state=42)

In [18]:
A_test.value_counts()

0    236534
1     18302
Name: DEAD, dtype: int64

In [19]:
B_test.value_counts()

0    236495
1     18341
Name: DEAD, dtype: int64

In [20]:
C_train,D_train,C_test,D_test = train_test_split(X_train, Y_train, test_size=0.5, random_state=42)

In [21]:
C_test.value_counts()

0    236534
1     18302
Name: DEAD, dtype: int64

In [22]:
D_test.value_counts()

0    236495
1     18341
Name: DEAD, dtype: int64

In [23]:
A_train.to_csv('full_A_train.csv',index=False)
A_train.to_csv('full_A_train.csv',index=False)

B_train.to_csv('full_B_train.csv',index=False)
B_train.to_csv('full_B_train.csv',index=False)

C_train.to_csv('full_C_train.csv',index=False)
C_train.to_csv('full_C_train.csv',index=False)

D_train.to_csv('full_D_train.csv',index=False)
D_train.to_csv('full_D_train.csv',index=False)

### Splite Covid data into train test

In [24]:
X = df_covid.drop(['DEAD'],axis=1)
Y = df_covid['DEAD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

X_train.to_csv('coviddata_xtrain.csv',index = False)
X_test.to_csv('coviddata_xtest.csv',index = False)
Y_train.to_csv('coviddata_ytrain.csv',index = False)
Y_test.to_csv('coviddata_ytest.csv',index = False)

In [25]:
Y_train.value_counts()

0    223673
1     34930
Name: DEAD, dtype: int64

In [26]:
Y_test.value_counts()

0    110219
1     17153
Name: DEAD, dtype: int64

### Split covid data into data shards

In [27]:
X = df_covid.drop(['DEAD'],axis=1)
Y = df_covid['DEAD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)
A_train,B_train,A_test,B_test = train_test_split(X_train, Y_train, test_size=0.5, random_state=42)
C_train,D_train,C_test,D_test = train_test_split(X_train, Y_train, test_size=0.5, random_state=42)
print(A_test.value_counts())
print(B_test.value_counts())
print(C_test.value_counts())
print(D_test.value_counts())

A_train.to_csv('covid_A_train.csv',index=False)
A_train.to_csv('covid_A_train.csv',index=False)

B_train.to_csv('covid_B_train.csv',index=False)
B_train.to_csv('covid_B_train.csv',index=False)

C_train.to_csv('covid_C_train.csv',index=False)
C_train.to_csv('covid_C_train.csv',index=False)

D_train.to_csv('covid_D_train.csv',index=False)
D_train.to_csv('covid_D_train.csv',index=False)

0    83455
1    13038
Name: DEAD, dtype: int64
0    83401
1    13093
Name: DEAD, dtype: int64
0    83455
1    13038
Name: DEAD, dtype: int64
0    83401
1    13093
Name: DEAD, dtype: int64


# Split COVID ICU dataset into train test

In [28]:
X = df_covid_icu.drop(['DEAD'],axis=1)
Y = df_covid_icu['DEAD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

X_train.to_csv('covidicudata_xtrain.csv',index = False)
X_test.to_csv('covidicudata_xtest.csv',index = False)
Y_train.to_csv('covidicudata_ytrain.csv',index = False)
Y_test.to_csv('covidicudata_ytest.csv',index = False)

### split covid icu into data shards

In [29]:
X = df_covid_icu.drop(['DEAD'],axis=1)
Y = df_covid_icu['DEAD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)
A_train,B_train,A_test,B_test = train_test_split(X_train, Y_train, test_size=0.5, random_state=42)
C_train,D_train,C_test,D_test = train_test_split(X_train, Y_train, test_size=0.5, random_state=42)
print(A_test.value_counts())
print(B_test.value_counts())
print(C_test.value_counts())
print(D_test.value_counts())

A_train.to_csv('covidicu_A_train.csv',index=False)
A_train.to_csv('covidicu_A_train.csv',index=False)

B_train.to_csv('covidicu_B_train.csv',index=False)
B_train.to_csv('covidicu_B_train.csv',index=False)

C_train.to_csv('covidicu_C_train.csv',index=False)
C_train.to_csv('covidicu_C_train.csv',index=False)

D_train.to_csv('covidicu_D_train.csv',index=False)
D_train.to_csv('covidicu_D_train.csv',index=False)

1    1477
0    1073
Name: DEAD, dtype: int64
1    1426
0    1124
Name: DEAD, dtype: int64
1    1477
0    1073
Name: DEAD, dtype: int64
1    1426
0    1124
Name: DEAD, dtype: int64


# Split Hospitalized dataset into train test

In [30]:
df = pd.read_csv('hospitalized.csv')

In [28]:
X = df.drop(['ICU'],axis=1)
Y = df['ICU']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

X_train.to_csv('hosplitalized_xtrain.csv',index = False)
X_test.to_csv('hosplitalized_xtest.csv',index = False)
Y_train.to_csv('hosplitalized_ytrain.csv',index = False)
Y_test.to_csv('covidicudata_ytest.csv',index = False)

### split covid icu into data shards

In [29]:
X = df_covid_icu.drop(['DEAD'],axis=1)
Y = df_covid_icu['DEAD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)
A_train,B_train,A_test,B_test = train_test_split(X_train, Y_train, test_size=0.5, random_state=42)
C_train,D_train,C_test,D_test = train_test_split(X_train, Y_train, test_size=0.5, random_state=42)
print(A_test.value_counts())
print(B_test.value_counts())
print(C_test.value_counts())
print(D_test.value_counts())

A_train.to_csv('covidicu_A_train.csv',index=False)
A_train.to_csv('covidicu_A_train.csv',index=False)

B_train.to_csv('covidicu_B_train.csv',index=False)
B_train.to_csv('covidicu_B_train.csv',index=False)

C_train.to_csv('covidicu_C_train.csv',index=False)
C_train.to_csv('covidicu_C_train.csv',index=False)

D_train.to_csv('covidicu_D_train.csv',index=False)
D_train.to_csv('covidicu_D_train.csv',index=False)

1    1477
0    1073
Name: DEAD, dtype: int64
1    1426
0    1124
Name: DEAD, dtype: int64
1    1477
0    1073
Name: DEAD, dtype: int64
1    1426
0    1124
Name: DEAD, dtype: int64
