In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter # To adjust the ticks formatting
import warnings
warnings.filterwarnings('ignore') # To ignore warning messages
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import random

# Adding filters to dataset

### Full patient data

In [2]:
# Load the full patient dataset from a CSV in the working directory.
df = pd.read_csv('fulldata_clean.csv')

### Covid data

In [3]:
# Keep only the rows whose Covid_or_Not label marks a Covid-19 carrier,
# then display the resulting subset.
is_carrier = df['Covid_or_Not'] == "A Covid 19 Carrier"
df_covid = df[is_carrier]
df_covid

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DEAD,Covid_or_Not
0,2,1,1,1,2.0,1.0,65.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
2,2,1,2,2,1.0,2.0,55.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
4,2,1,2,1,2.0,2.0,68.0,2.0,1.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,1,A Covid 19 Carrier
5,2,1,1,2,2.0,1.0,40.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
6,2,1,1,1,2.0,2.0,64.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018888,1,13,1,2,2.0,2.0,77.0,2.0,1.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018889,1,13,1,1,2.0,2.0,55.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018890,1,13,2,1,2.0,2.0,70.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier
1018891,2,13,2,1,2.0,2.0,32.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0,0,A Covid 19 Carrier


In [4]:
# Persist the Covid-only subset; index=False keeps the row index out of the CSV.
df_covid.to_csv('coviddata_clean.csv',index = False)

### Covid data with ICU admission

In [5]:
# Narrow the Covid subset to the rows whose ICU flag equals 1.
df_covid_icu = df_covid.loc[df_covid['ICU'].eq(1)]

In [6]:
# Display the Covid ICU subset for a visual sanity check.
df_covid_icu

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DEAD,Covid_or_Not
16,2,1,1,2,2.0,1.0,80.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
20,2,1,2,2,2.0,1.0,59.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,3,1.0,0,A Covid 19 Carrier
41,2,1,2,2,1.0,1.0,45.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
319,1,3,2,2,1.0,1.0,90.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,1,A Covid 19 Carrier
320,1,3,2,2,1.0,1.0,61.0,2.0,2.0,2.0,...,1.0,2.0,2.0,1.0,2.0,2.0,3,1.0,1,A Covid 19 Carrier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018691,1,13,2,2,1.0,1.0,55.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018693,1,13,2,2,2.0,2.0,56.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018696,1,13,2,2,2.0,1.0,68.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier
1018706,1,13,2,2,1.0,2.0,35.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0,0,A Covid 19 Carrier


In [7]:
# Persist the Covid ICU subset; index=False keeps the row index out of the CSV.
df_covid_icu.to_csv('coviddata_icu_clean.csv',index = False)

# Data Split

In [68]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [69]:
# Load the hospitalized-patient table that the following cells preprocess.
# NOTE(review): this rebinds `df`, which earlier in the notebook held the
# full patient dataset.
df = pd.read_csv('Hospitalized Data/hospitalized_converted.csv')

In [70]:
# Remove the DEAD column from the working frame.
# NOTE(review): presumably dropped so the outcome cannot leak into the ICU
# prediction — confirm.
df = df.drop(columns=['DEAD'])

In [71]:
# Recode the ICU flag into a binary target: the source value 2 becomes 0 and
# every other value (missing values included) becomes 1. This is the
# vectorised equivalent of looping over the column element by element.
# NOTE(review): the mapping assumes the column still holds its raw codes;
# re-running this cell after it has been applied turns every row into 1.
df['ICU'] = df['ICU'].ne(2).astype('int64')

In [72]:
# Remove the PATIENT_TYPE column from the working frame.
# NOTE(review): presumably constant for hospitalized patients — confirm.
df = df.drop(columns=['PATIENT_TYPE'])

In [73]:
# Set the ICU target aside, then keep only the feature columns in df.
label = df.loc[:, 'ICU']
df = df.drop(columns='ICU')

# One-hot encoding

In [74]:
# Columns treated as categorical and expanded by the one-hot encoder.
columns_to_encode = [
    'USMER',
    'MEDICAL_UNIT',
    'SEX',
    'INTUBED',
    'PNEUMONIA',
    'PREGNANT',
    'DIABETES',
    'COPD',
    'ASTHMA',
    'INMSUPR',
    'HIPERTENSION',
    'OTHER_DISEASE',
    'CARDIOVASCULAR',
    'OBESITY',
    'RENAL_CHRONIC',
    'TOBACCO',
    'CLASIFFICATION_FINAL',
    'Covid_or_Not',
]

In [75]:
# One-hot encode the categorical columns, dropping the first category of each
# feature (drop='first').
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# removed the old spelling; try the new name first and fall back so the cell
# runs on both older and current releases.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')
encoded_columns = encoder.fit_transform(df[columns_to_encode])
# Reuse df's index so a later column-wise concat with `label` aligns rows by
# label even if df is ever filtered before this step.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(columns_to_encode),
    index=df.index,
)



In [76]:
# Whatever is left after removing the categorical columns is scaled next.
df_without = df.drop(columns_to_encode, axis=1)

In [77]:
# Display the columns that were not one-hot encoded.
df_without

Unnamed: 0,AGE
0,55.0
1,40.0
2,37.0
3,25.0
4,24.0
...,...
189107,61.0
189108,63.0
189109,23.0
189110,56.0


In [78]:
# Standardise the non-categorical column(s) and display the scaled array.
# NOTE(review): the scaler is fitted on every row before the train/test
# split performed later in the notebook — confirm this is intended.
scaler = StandardScaler()
df_encoded_norm = scaler.fit(df_without).transform(df_without)
df_encoded_norm

array([[ 0.09801362],
       [-0.67590477],
       [-0.83068845],
       ...,
       [-1.55301228],
       [ 0.14960818],
       [-0.10836462]])

In [80]:
# Wrap the scaled array back into a DataFrame. Column names and index are
# taken from df_without rather than hard-coded, so the frame stays correct if
# more numeric columns remain and stays row-aligned with its source.
df_part1 = pd.DataFrame(
    df_encoded_norm,
    columns=df_without.columns,
    index=df_without.index,
)

In [84]:
# Assemble the modelling table side by side: scaled numeric part, one-hot
# columns, and the ICU target as the last column.
df_encoded = pd.concat(
    [df_part1, encoded_df, label],
    axis='columns',
)

In [85]:
# Display the assembled modelling table.
df_encoded

Unnamed: 0,AGE,USMER_2,MEDICAL_UNIT_2,MEDICAL_UNIT_3,MEDICAL_UNIT_4,MEDICAL_UNIT_5,MEDICAL_UNIT_6,MEDICAL_UNIT_7,MEDICAL_UNIT_8,MEDICAL_UNIT_9,...,RENAL_CHRONIC_2.0,TOBACCO_2.0,CLASIFFICATION_FINAL_2,CLASIFFICATION_FINAL_3,CLASIFFICATION_FINAL_4,CLASIFFICATION_FINAL_5,CLASIFFICATION_FINAL_6,CLASIFFICATION_FINAL_7,Covid_or_Not_2,ICU
0,0.098014,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
1,-0.675905,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,-0.830688,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,-1.449823,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,-1.501418,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189107,0.407581,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
189108,0.510770,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
189109,-1.553012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
189110,0.149608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0


In [86]:
# Persist the processed table; index=False keeps the row index out of the CSV.
df_encoded.to_csv('hospitalized_processed.csv',index = False)

In [87]:
# Rebind df to a copy of the processed table so the cells below work on it
# without touching df_encoded.
df = df_encoded.copy()

### Split the processed ICU data into four shards

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
# Target is the ICU column; features are every other column.
Y = df['ICU']
X = df.drop(columns=['ICU'])

# Halve the data, then halve each half again with the same seed, giving the
# four partitions A, B (from the first half) and C, D (from the second).
halving = dict(test_size=0.5, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **halving)
A_X, B_X, A_Y, B_Y = train_test_split(X_train, Y_train, **halving)
C_X, D_X, C_Y, D_Y = train_test_split(X_test, Y_test, **halving)

In [91]:
def reset_index(dataframe):
    """Return ``dataframe`` as a DataFrame renumbered with a fresh 0..n-1 index.

    The old index is discarded directly instead of being materialised as an
    ``index`` column and dropped afterwards. That round trip raised
    ``KeyError`` when the index had a name (or several levels) and silently
    removed a genuine data column called ``index`` when one existed.

    Parameters
    ----------
    dataframe : pandas.DataFrame or pandas.Series
        Object to renumber. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The same values with a default integer index. A Series comes back as
        a one-column DataFrame.
    """
    # A Series is promoted first so callers always receive a DataFrame.
    if isinstance(dataframe, pd.Series):
        dataframe = dataframe.to_frame()
    return dataframe.reset_index(drop=True)

In [92]:
# Renumber every partition from 0 before features and targets are
# concatenated column-wise in the next step.
A_X, B_X, C_X, D_X = (reset_index(features) for features in (A_X, B_X, C_X, D_X))

A_Y, B_Y, C_Y, D_Y = (reset_index(target) for target in (A_Y, B_Y, C_Y, D_Y))

In [93]:
# Re-attach each partition's target to its features to form one shard frame.
shardA = pd.concat([A_X, A_Y], axis='columns')
shardB = pd.concat([B_X, B_Y], axis='columns')
shardC = pd.concat([C_X, C_Y], axis='columns')
shardD = pd.concat([D_X, D_Y], axis='columns')

In [94]:
# Hold out 30% of every shard as its local test set, using one shared seed.
holdout = dict(test_size=0.3, random_state=42)
shardA_train, shardA_test = train_test_split(shardA, **holdout)
shardB_train, shardB_test = train_test_split(shardB, **holdout)
shardC_train, shardC_test = train_test_split(shardC, **holdout)
shardD_train, shardD_test = train_test_split(shardD, **holdout)

In [95]:
# Write each shard's train/test split to its own CSV (row index omitted).
# NOTE(review): these files land in the working directory, whereas the
# combine step reads same-named files from
# '../Covid-19_Mortality_Rate/TrainTestData/' — confirm how they get there.
shard_outputs = [
    (shardA_train, 'cle_train.csv'),
    (shardA_test, 'cle_test.csv'),
    (shardB_train, 'hun_train.csv'),
    (shardB_test, 'hun_test.csv'),
    (shardC_train, 'swi_train.csv'),
    (shardC_test, 'swi_test.csv'),
    (shardD_train, 'vir_train.csv'),
    (shardD_test, 'vir_test.csv'),
]
for shard_frame, csv_name in shard_outputs:
    shard_frame.to_csv(csv_name, index=False)

# Combined train and test data

In [96]:
# Stack the four per-shard training files into one combined training set
# and save it next to them.
train_paths = [
    '../Covid-19_Mortality_Rate/TrainTestData/cle_train.csv',
    '../Covid-19_Mortality_Rate/TrainTestData/hun_train.csv',
    '../Covid-19_Mortality_Rate/TrainTestData/swi_train.csv',
    '../Covid-19_Mortality_Rate/TrainTestData/vir_train.csv',
]
df1, df2, df3, df4 = [pd.read_csv(path) for path in train_paths]
com_train = pd.concat([df1, df2, df3, df4], ignore_index=True).reset_index(drop=True)
com_train.to_csv('../Covid-19_Mortality_Rate/TrainTestData/com_train.csv', index=False)

In [97]:
# Stack the four per-shard test files into one combined test set and save
# it next to them.
test_paths = [
    '../Covid-19_Mortality_Rate/TrainTestData/cle_test.csv',
    '../Covid-19_Mortality_Rate/TrainTestData/hun_test.csv',
    '../Covid-19_Mortality_Rate/TrainTestData/swi_test.csv',
    '../Covid-19_Mortality_Rate/TrainTestData/vir_test.csv',
]
df1, df2, df3, df4 = [pd.read_csv(path) for path in test_paths]
com_test = pd.concat([df1, df2, df3, df4], ignore_index=True).reset_index(drop=True)
com_test.to_csv('../Covid-19_Mortality_Rate/TrainTestData/com_test.csv', index=False)