# COVID Severity Prediction using AI Solution - Zero-based transforming
* Sangwon Baek
* March 10th, 2023

In [1]:
import pandas as pd
import numpy as np
# Notebook-wide pandas configuration.
# Disable pandas' chained-assignment warning for this session.
pd.set_option('mode.chained_assignment', None)
# Allow up to 200 rows and 200 columns when a frame is displayed.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 200)

# Load the preprocessed CRF table and drop the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv call - verify).
df = pd.read_csv(
    '../Data/Preprocessed/CRF_Preprocessed_Original.csv', low_memory=False
).drop(columns='Unnamed: 0')

In [12]:
# Column-name groups used to collapse the per-time-point measurements into a
# single set of columns. Six time points are recorded: Initial (admission,
# day 0-2), FU1 (day 3-5), FU2 (day 6-7), FU3 (day 8-10), FU4 (day 11-14) and
# Last (just before discharge).

# Columns that carry no time-point prefix.
CommonInfo = [
    'diagnosis', 'No', 'ID', 'age', 'sex', 'symptom_date', 'dx_date', 'adm_date',
    'UD_HT', 'UD_DM', 'UD_CVD', 'UD_cancer', 'UD_other',
    'SMT_fever', 'SMT_cough', 'SMT_sputum', 'SMT_dyspnea', 'SMT_myalgia',
    'SMT_sorethroat', 'SMT_mental', 'SMT_GI',
    'steroid', 'O2sup', 'ventilator', 'ECMO', 'ICU_date', 'Mortality', 'discharge_date',
    'Mild', 'Moderate', 'Severe',
    'TX_0', 'TX_1', 'TX_2', 'TX_3', 'TX_4',
    'Smoking_0', 'Smoking_1', 'Smoking_2', 'Smoking_3',
]

# Measurement names without a time-point prefix; these are also the names of
# the collapsed output columns.
Combined = ['BT', 'SBP', 'DBP', 'PR', 'RR', 'SPO2', 'FIO2', 'CXR',
            'WBC', 'ANC', 'ALC', 'PLT', 'CRP', 'LDH', 'DD', 'PCR']


def _prefixed(prefix, names):
    """Return a new list with 'prefix_' prepended to every name."""
    return [prefix + '_' + name for name in names]


# One column list per time point, each in the same order as Combined.
InitialCol = _prefixed('Initial', Combined)
FU1Col = _prefixed('FU1', Combined)
FU2Col = _prefixed('FU2', Combined)
FU3Col = _prefixed('FU3', Combined)
FU4Col = _prefixed('FU4', Combined)
LastCol = _prefixed('Last', Combined)

# Columns shown in the per-hospital summaries.
columnsToView = ['No', 'age', 'ID', 'symptom_date', 'dx_date',
                 'BT', 'SBP', 'DBP', 'PR', 'RR', 'SPO2',
                 'WBC', 'ANC', 'ALC', 'PLT', 'CRP', 'LDH', 'DD', 'PCR',
                 'Mild', 'Moderate', 'Severe']

# (WBC, ANC, ALC) column triple for each time point, in chronological order.
bloodCellCol = [_prefixed(_timepoint, ['WBC', 'ANC', 'ALC'])
                for _timepoint in ('Initial', 'FU1', 'FU2', 'FU3', 'FU4', 'Last')]

# Severity label columns and the columns the outcome cells below filter on.
outcome = ['Mild', 'Moderate', 'Severe']
DefOutcome = ['ventilator', 'ECMO', 'ICU_date', 'Mortality']

### Functions for transformation

In [3]:
def bloodCellTransform(df, bloodCellCol):
    """Fill incomplete WBC/ANC/ALC triples from the first complete time point.

    For every row in which at least one of 'WBC', 'ANC', 'ALC' is missing, the
    (WBC, ANC, ALC) triples in ``bloodCellCol`` are tried in order. The first
    triple whose three values are all present is copied into 'WBC', 'ANC' and
    'ALC' as a set (all three are overwritten, so the values always come from
    the same time point). Rows with no complete triple are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'WBC', 'ANC', 'ALC' and every column named in
        ``bloodCellCol``. Modified in place.
    bloodCellCol : iterable of 3-item sequences
        Column-name triples (WBC, ANC, ALC), one per time point, in the order
        in which they should be tried.

    Returns
    -------
    (pandas.DataFrame, list of int)
        The same ``df`` object and the 0-based row positions that were filled,
        in ascending order.
    """
    targets = ['WBC', 'ANC', 'ALC']
    idx = []

    for WBC, ANC, ALC in bloodCellCol:
        sources = [WBC, ANC, ALC]
        has_full_set = df[sources].notna().all(axis=1)
        # Re-evaluated for every triple: a row filled by an earlier triple is
        # complete by now, so the earliest complete time point wins.
        needs_fill = df[targets].isna().any(axis=1)
        fill_mask = (has_full_set & needs_fill).to_numpy()
        if fill_mask.any():
            # A single .loc assignment on the frame itself; chained
            # df[col].iloc[i] = ... writes are lost under pandas Copy-on-Write.
            df.loc[fill_mask, targets] = df.loc[fill_mask, sources].to_numpy()
            idx.extend(fill_mask.nonzero()[0].tolist())

    # Each row is filled at most once; report positions in row order.
    idx.sort()
    return df, idx

def df_transformation(df, dx_date, colNew, colInitial, colFU1, colFU2, colFU3, colFU4, colLast):
    """Collapse per-time-point columns into single columns, with lab back-fill.

    Every column in ``colNew`` receives the value of its ``colInitial``
    counterpart. For the columns named 'LDH', 'DD', 'CRP' and 'PLT' only, a
    missing initial value is replaced by the FU1 value and, if that is missing
    too, by the FU2 value. FU3, FU4 and Last values are never used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns. Modified in place.
    dx_date : object
        Unused; kept so existing calls keep working.
    colNew, colInitial, colFU1, colFU2 : sequence of str
        Parallel lists: output column name and its Initial/FU1/FU2 sources.
    colFU3, colFU4, colLast : sequence
        Values are not read. They still take part in the ``zip`` below, so the
        shortest of the seven lists limits how many columns are processed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    Each output column is assigned as a whole, so it takes the dtype of its
    source column(s) rather than that of a pre-created placeholder column.
    """
    # Only these measurements may fall back to a later time point.
    backfilled = ('LDH', 'DD', 'CRP', 'PLT')

    for New, Initial, FU1, FU2, _fu3, _fu4, _last in zip(colNew, colInitial, colFU1, colFU2,
                                                         colFU3, colFU4, colLast):
        values = df[Initial]
        if New in backfilled:
            # Initial -> FU1 -> FU2, taking the first non-missing value per row.
            values = values.fillna(df[FU1]).fillna(df[FU2])
        # Whole-column assignment on the frame itself; chained
        # df[col].iloc[i] = ... writes are lost under pandas Copy-on-Write.
        df[New] = values
    return df

def df_transformation_initial(df, dx_date, colNew, colInitial, colFU1, colFU2, colFU3, colFU4, colLast):
    """Copy each Initial column into its collapsed column, with no back-fill.

    Every column in ``colNew`` receives exactly the value of its
    ``colInitial`` counterpart; missing initial values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns. Modified in place.
    dx_date : object
        Unused; kept so existing calls keep working.
    colNew, colInitial : sequence of str
        Parallel lists: output column name and its Initial source column.
    colFU1, colFU2, colFU3, colFU4, colLast : sequence
        Values are not read. They still take part in the ``zip`` below, so the
        shortest of the seven lists limits how many columns are processed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    Each output column is assigned as a whole, so it takes the dtype of its
    source column rather than that of a pre-created placeholder column.
    """
    for New, Initial, *_unused in zip(colNew, colInitial, colFU1, colFU2, colFU3, colFU4, colLast):
        # Whole-column assignment on the frame itself; chained
        # df[col].iloc[i] = ... writes are lost under pandas Copy-on-Write.
        df[New] = df[Initial]
    return df

### Apply transformation to dataset

In [4]:
# Preview the rows whose dx_date lies in the inclusive range [-15, 1].
df[df['dx_date'].between(-15, 1)]

Unnamed: 0,diagnosis,No,ID,age,sex,symptom_date,dx_date,adm_date,hospitalized_date,UD_HT,UD_DM,UD_CVD,UD_cancer,UD_other,SMT_fever,SMT_cough,SMT_sputum,SMT_dyspnea,SMT_myalgia,SMT_sorethroat,SMT_mental,SMT_GI,Initial_BT,Initial_SBP,Initial_DBP,Initial_PR,Initial_RR,Initial_SPO2,Initial_FIO2,Initial_CXR,Initial_CT,Initial_WBC,Initial_ANC,Initial_ALC,Initial_PLT,Initial_CRP,Initial_LDH,Initial_DD,FU1_BT,FU1_SBP,FU1_DBP,FU1_PR,FU1_RR,FU1_SPO2,FU1_FIO2,FU1_CXR,FU1_CT,FU1_WBC,FU1_ANC,FU1_ALC,FU1_PLT,FU1_CRP,FU1_LDH,FU1_DD,FU2_BT,FU2_SBP,FU2_DBP,FU2_PR,FU2_RR,FU2_SPO2,FU2_FIO2,FU2_CXR,FU2_CT,FU2_WBC,FU2_ANC,FU2_ALC,FU2_PLT,FU2_CRP,FU2_LDH,FU2_DD,FU3_BT,FU3_SBP,FU3_DBP,FU3_PR,FU3_RR,FU3_SPO2,FU3_FIO2,FU3_CXR,FU3_CT,FU3_WBC,FU3_ANC,FU3_ALC,FU3_PLT,FU3_CRP,FU3_LDH,FU3_DD,FU4_BT,FU4_SBP,FU4_DBP,FU4_PR,FU4_RR,FU4_SPO2,FU4_FIO2,FU4_CXR,FU4_CT,FU4_WBC,FU4_ANC,FU4_ALC,FU4_PLT,FU4_CRP,FU4_LDH,FU4_DD,Last_BT,Last_SBP,Last_DBP,Last_PR,Last_RR,Last_SPO2,Last_FIO2,Last_CXR,Last_CT,Last_WBC,Last_ANC,Last_ALC,Last_PLT,Last_CRP,Last_LDH,Last_DD,steroid,O2sup,ventilator,ECMO,ICU_date,Mortality,discharge_date,Mild,Moderate,Severe,Initial_PCR,FU1_PCR,FU2_PCR,FU3_PCR,FU4_PCR,Last_PCR,TX_0,TX_1,TX_2,TX_3,TX_4,Smoking_0,Smoking_1,Smoking_2,Smoking_3
0,CO,200001,CNU,20,0,-3.0,0.0,0.0,2020-02-21,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,37.4,125.0,77.0,100.0,20.0,98.0,,0.0,,3810.0,2280.0,1140.0,163000.0,0.30,,,37.6,110.0,68.0,78.0,20.0,99.0,,1.0,,3900.0,2000.0,1400.0,76000.0,0.30,,,36.2,91.0,54.0,76.0,20.0,100.0,,1.0,1.0,3600.0,1800.0,1500.0,148000.0,0.30,,,36.6,102.0,55.0,56.0,18.0,99.0,,1.0,,3820.0,1960.0,1500.0,199000.0,0.40,,,36.2,97.0,44.0,70.0,16.0,99.0,,1.0,,4390.0,2240.0,1780.0,249000.0,0.30,,,,,,,,,,,,,,,,,,,0.0,,,,,0.0,12.0,1,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1
1,CO,200002,CNU,65,0,-5.0,0.0,0.0,2020-02-22,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,37.4,163.0,76.0,76.0,20.0,99.0,,0.0,1.0,4800.0,2900.0,1400.0,163000.0,0.30,,,37.2,106.0,67.0,79.0,20.0,99.0,,1.0,,7100.0,3500.0,2800.0,170000.0,0.30,453.0,,38.0,121.0,66.0,79.0,18.0,96.0,,1.0,,9630.0,6750.0,1820.0,162000.0,3.80,394.0,,37.0,112.0,66.0,60.0,20.0,97.0,,1.0,1.0,9200.0,6300.0,1800.0,200000.0,3.80,432.0,,37.1,118.0,66.0,63.0,18.0,98.0,,1.0,,8000.0,5200.0,1700.0,295000.0,0.60,420.0,,36.9,121.0,68.0,69.0,18.0,97.0,,1.0,,8600.0,5700.0,2300.0,268000.0,0.60,375.0,,0.0,,,,,0.0,25.0,1,0,0,1,1,1,1,1,1,0,0,1,0,0,0,0,0,1
2,CO,200003,CNU,65,1,0.0,0.0,0.0,2020-02-22,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,37.0,146.0,82.0,78.0,20.0,98.0,,0.0,1.0,3900.0,2200.0,900.0,263000.0,0.30,,,36.0,126.0,71.0,63.0,20.0,98.0,,0.0,,4500.0,2300.0,1700.0,208000.0,0.30,,,37.8,121.0,67.0,60.0,18.0,97.0,,0.0,,6800.0,2800.0,3000.0,139000.0,0.80,303.0,,36.9,132.0,74.0,62.0,18.0,96.0,,0.0,,5270.0,2740.0,1810.0,268000.0,2.90,300.0,,36.8,130.0,88.0,75.0,18.0,95.0,,0.0,1.0,5200.0,2900.0,1600.0,368000.0,0.80,303.0,,36.8,145.0,89.0,88.0,18.0,97.0,,0.0,,5400.0,3200.0,1600.0,425000.0,0.70,289.0,,0.0,,,,,0.0,25.0,1,0,0,1,1,1,1,1,1,0,0,0,0,1,1,0,0,0
3,CO,200004,CNU,65,1,,0.0,0.0,2020-02-21,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,36.8,131.0,72.0,70.0,20.0,100.0,2.0,2.0,,5420.0,3410.0,1370.0,144000.0,4.80,,,40.1,125.0,95.0,96.0,17.0,91.0,7.56,2.0,,10800.0,9300.0,900.0,153000.0,20.10,856.0,1.1,35.4,143.0,90.0,109.0,17.0,97.0,5.26,2.0,,9350.0,7960.0,790.0,112000.0,22.40,900.0,1.00,35.7,145.0,86.0,65.0,18.0,99.0,5.44,2.0,,11120.0,9210.0,880.0,49000.0,20.80,939.0,,34.8,121.0,70.0,48.0,13.0,100.0,4.09,2.0,,6700.0,5400.0,500.0,34000.0,35.10,897.0,,37.7,92.0,54.0,118.0,21.0,100.0,11.60,2.0,2.0,11040.0,9740.0,540.0,159000.0,34.40,576.0,1.11,0.0,3.0,6.0,6,4,0.0,68.0,0,1,1,1,1,1,1,1,0,0,0,1,0,0,1,0,0,0
4,CO,200005,CNU,60,1,0.0,0.0,0.0,2020-02-18,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,36.7,121.0,72.0,70.0,18.0,90.0,2.0,2.0,2.0,2800.0,2100.0,600.0,97000.0,12.40,,,37.1,118.0,64.0,50.0,14.0,100.0,8.80,2.0,,3710.0,3050.0,390.0,117000.0,26.50,633.0,,36.5,113.0,88.0,52.0,15.0,95.0,4.10,2.0,,3100.0,2730.0,210.0,66000.0,18.70,586.0,0.78,37.6,105.0,86.0,71.0,12.0,94.0,4.69,2.0,,3510.0,2940.0,290.0,75000.0,12.00,499.0,,37.6,116.0,66.0,101.0,13.0,100.0,5.59,2.0,,2430.0,2000.0,260.0,50000.0,12.00,493.0,,37.1,95.0,58.0,74.0,21.0,100.0,8.06,1.0,1.0,9610.0,7540.0,920.0,139000.0,6.80,372.0,,0.0,6.0,7.0,10,6,0.0,71.0,0,1,1,1,1,0,0,0,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9194,CO,224031,SMC,70,1,-5.0,0.0,0.0,2020-12-29,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,36.7,168.0,146.0,88.0,27.0,88.0,60.0,,,7880.0,7080.0,510.0,151000.0,12.39,1119.0,124.00,36.0,101.0,65.0,66.0,25.0,88.0,90.00,,,11660.0,10630.0,700.0,163000.0,21.73,417.0,,36.4,105.0,69.0,56.0,24.0,98.0,,,,7510.0,6710.0,430.0,152000.0,2.21,,19.19,37.2,87.0,52.0,58,18.0,99.0,,,,9810.0,9250.0,310.0,156000.0,,,3.9,37.5,97.0,54.0,61.0,18.0,97.0,,,,12580.0,11020.0,770.0,212000.0,,346.0,,36.6,77.0,58.0,98.0,26.0,100.0,,,,18630.0,16110.0,1420.0,200000.0,,646.0,,1.0,0,3.0,,,1.0,35.0,0,1,1,1,1,1,1,1,1,0,1,0,0,1,1,0,0,0
9195,CO,224032,SMC,65,1,-15.0,-11.0,0.0,2020-12-23,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.4,96.0,58.0,79.0,19.0,94.0,90.0,,,12280.0,11680.0,320.0,262000.0,1.09,,14.96,36.2,99.0,66.0,68.0,15.0,96.0,60.00,,,,,,,,,,36.6,80.0,52.0,59.0,19.0,93.0,50.00,,,,,,,,,,36.5,118.0,72.0,72,18.0,95.0,60.00,,,11930.0,11300.0,440.0,219000.0,0.16,,,36.6,118.0,66.0,81.0,18.0,99.0,50.00,,,11810.0,11010.0,470.0,160000.0,,,,36.2,117.0,66.0,80.0,18.0,91.0,1.00,,,,,,,,,,1.0,0,,,,0.0,24.0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,1,0,0,0
9196,CO,224033,SMC,80,0,,-1.0,0.0,2020-12-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.4,168.0,84.0,88.0,20.0,95.0,1.0,,,3340.0,1430.0,1490.0,140000.0,0.90,302.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,36.1,127.0,8.0,79.0,17.0,97.0,1.00,,,3740.0,2270.0,1180.0,129000.0,2.43,243.0,,,0,,,,0.0,3.0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,1,0,0,0
9197,CO,224034,SMC,15,0,-35.0,-1.0,0.0,2020-12-20,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,36.1,123.0,74.0,91.0,24.0,98.0,,,,5100.0,4310.0,470.0,194000.0,1.27,471.0,,36.3,138.0,81.0,90.0,18.0,100.0,1.00,,,1980.0,770.0,930.0,163000.0,1.27,471.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,36.3,131.0,86.0,113.0,18.0,96.0,1.00,,,2390.0,980.0,1060.0,187000.0,18700.00,,,,4,,,,0.0,5.0,0,1,0,1,1,1,1,1,1,1,0,0,0,0,1,0,0,0


In [5]:
# Keep only rows with -15 <= dx_date <= 1; rows outside that window are
# treated as outliers. Copy the selection so the new columns are added to an
# independent frame rather than to a slice of df.
df1 = df[(df['dx_date'] >= -15) & (df['dx_date'] <= 1)].copy()
# Add empty columns that will hold the collapsed values.
for item in Combined:
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    df1[item] = np.nan

In [6]:
# Transformation 1: collapse the time-point columns into the Combined columns,
# then fill incomplete WBC/ANC/ALC triples from a complete time point.
df1_transformed = df_transformation(
    df1, 'dx_date', Combined, InitialCol, FU1Col, FU2Col, FU3Col, FU4Col, LastCol)
df1_transformed, idx = bloodCellTransform(df1_transformed, bloodCellCol)

# Drop the columns that are not needed once the data has been combined:
# O2sup, FIO2, the CT columns and every per-time-point source column.
_ct_cols = ['Initial_CT', 'FU1_CT', 'FU2_CT', 'FU3_CT', 'FU4_CT', 'Last_CT']
_timepoint_cols = InitialCol + FU1Col + FU2Col + FU3Col + FU4Col + LastCol
new_df1 = df1_transformed.drop(columns=['O2sup', 'FIO2'] + _ct_cols + _timepoint_cols)

In [7]:
# Build three complete-case cohorts from new_df1. A row is kept only if every
# listed clinical/lab column is non-null.
_required_all = ['SPO2', 'BT', 'SBP', 'DBP', 'PR', 'RR',
                 'WBC', 'ANC', 'ALC', 'PLT', 'CRP', 'LDH', 'DD']

# All columns required.
new_df1_ALL = new_df1.dropna(subset=_required_all)
# D-Dimer (the last entry) not required.
new_df1_DD_removed = new_df1.dropna(subset=_required_all[:-1])
# D-Dimer and LDH (the last two entries) not required.
new_df1_DD_LDH_removed = new_df1.dropna(subset=_required_all[:-2])

# Class balance of the Severe label in each cohort.
for _cohort in (new_df1_ALL, new_df1_DD_removed, new_df1_DD_LDH_removed):
    print(_cohort['Severe'].value_counts())

0    3002
1     700
Name: Severe, dtype: int64
0    5309
1     901
Name: Severe, dtype: int64
0    6125
1    1090
Name: Severe, dtype: int64


In [8]:
# Transformation 2: collapsed columns take the Initial values only (no
# back-fill, no blood-cell imputation).
# Work on a copy: df_transformation_initial writes into the frame it is given,
# and df1 is the same object as df1_transformed, whose collapsed columns would
# otherwise be silently overwritten.
df2_transformed = df_transformation_initial(df1.copy(), 'dx_date', Combined, InitialCol, FU1Col, FU2Col, FU3Col, FU4Col, LastCol)
# df2_transformed, idx = bloodCellTransform(df2_transformed, bloodCellCol)

# Drop the columns that are not needed once the data has been combined.
new_df2 = df2_transformed.drop(columns = ['O2sup','FIO2','Initial_CT','FU1_CT','FU2_CT',
                                         'FU3_CT','FU4_CT','Last_CT']+InitialCol+FU1Col+FU2Col+
                                          FU3Col+FU4Col+LastCol)

In [9]:
#Include all columns
#Drop all null values under subset of Combined clinical and lab data
# Build three complete-case cohorts from new_df2. A row is kept only if every
# listed clinical/lab column is non-null.
_required_all = ['SPO2', 'BT', 'SBP', 'DBP', 'PR', 'RR',
                 'WBC', 'ANC', 'ALC', 'PLT', 'CRP', 'LDH', 'DD']

# All columns required.
new_df2_ALL = new_df2.dropna(subset=_required_all)
# D-Dimer (the last entry) not required.
new_df2_DD_removed = new_df2.dropna(subset=_required_all[:-1])
# D-Dimer and LDH (the last two entries) not required.
new_df2_DD_LDH_removed = new_df2.dropna(subset=_required_all[:-2])

# Class balance of the Severe label in each cohort.
for _cohort in (new_df2_ALL, new_df2_DD_removed, new_df2_DD_LDH_removed):
    print(_cohort['Severe'].value_counts())

0    2789
1     621
Name: Severe, dtype: int64
0    5106
1     839
Name: Severe, dtype: int64
0    5966
1    1066
Name: Severe, dtype: int64


In [23]:
# Show only the columns listed in DefOutcome.
new_df2_DD_removed.loc[:, DefOutcome]

Unnamed: 0,ventilator,ECMO,ICU_date,Mortality
11,7.0,,,0.0
12,,,,0.0
13,,,,0.0
14,,,,0.0
15,3.0,3,3,0.0
...,...,...,...,...
9193,,,,0.0
9194,3.0,,,1.0
9196,,,,0.0
9197,,,,0.0


In [41]:
# Count rows where ventilator is the only indicator set: ventilator recorded,
# ECMO and ICU_date missing, Mortality equal to 0.
_ventilator_only = (
    new_df2_DD_removed['ventilator'].notnull()
    & new_df2_DD_removed['ECMO'].isnull()
    & new_df2_DD_removed['ICU_date'].isnull()
    & (new_df2_DD_removed['Mortality'] == 0.0)
)
count = int(_ventilator_only.sum())
# The label previously said "ECMO", but the filter selects ventilator-only rows.
print("Number of people selected only through ventilator not null:", count)


Number of people selected only through ECMO not null: 69


In [40]:
# Rows with ventilator recorded, ECMO and ICU_date missing, and Mortality 0.
_ventilator_only_rows = (
    new_df2_DD_removed['ventilator'].notna()
    & new_df2_DD_removed['ECMO'].isna()
    & new_df2_DD_removed['ICU_date'].isna()
    & new_df2_DD_removed['Mortality'].eq(0.0)
)
new_df2_DD_removed[_ventilator_only_rows]

Unnamed: 0,diagnosis,No,ID,age,sex,symptom_date,dx_date,adm_date,hospitalized_date,UD_HT,UD_DM,UD_CVD,UD_cancer,UD_other,SMT_fever,SMT_cough,SMT_sputum,SMT_dyspnea,SMT_myalgia,SMT_sorethroat,SMT_mental,SMT_GI,steroid,ventilator,ECMO,ICU_date,Mortality,discharge_date,Mild,Moderate,Severe,TX_0,TX_1,TX_2,TX_3,TX_4,Smoking_0,Smoking_1,Smoking_2,Smoking_3,BT,SBP,DBP,PR,RR,SPO2,CXR,WBC,ANC,ALC,PLT,CRP,LDH,DD,PCR
11,CO,200012,CNU,35,1,-4.0,0.0,0.0,2020-02-26,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,7.0,,,0.0,29.0,0,1,1,0,0,1,0,0,1,0,0,0,37.7,120.0,88.0,94.0,18.0,97.0,1.0,6560.0,4410.0,1110.0,228000.0,2.3,340.0,,1.0
22,CO,200023,CNU,60,0,-12.0,0.0,0.0,2020-02-26,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,10.0,,,0.0,50.0,0,1,1,0,0,0,0,0,0,0,0,1,37.6,104.0,62.0,74.0,24.0,100.0,2.0,9020.0,6440.0,1930.0,258000.0,11.3,454.0,,1.0
75,CO,200076,CNU,60,0,-5.0,0.0,0.0,2020-06-15,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,,,0.0,37.0,0,1,1,0,0,0,0,1,0,0,0,1,37.4,141.0,84.0,78.0,20.0,98.0,2.0,4040.0,3420.0,510.0,188000.0,12.0,439.0,,1.0
78,CO,200079,CNU,55,1,-1.0,0.0,0.0,2020-06-16,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,6.0,,,0.0,28.0,0,1,1,0,0,0,0,1,1,0,0,0,39.0,107.0,59.0,85.0,18.0,95.0,1.0,6250.0,4020.0,1600.0,222000.0,2.1,432.0,0.06,1.0
92,CO,200093,CNU,70,0,-2.0,0.0,0.0,2020-06-17,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,,,0.0,35.0,0,1,1,0,0,0,0,1,0,0,0,1,37.8,138.0,80.0,72.0,19.0,96.0,1.0,5590.0,3640.0,1580.0,211000.0,8.4,812.0,0.16,1.0
100,CO,200101,CNU,70,0,-9.0,0.0,0.0,2020-06-20,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,,0.0,41.0,0,1,1,0,0,0,0,1,1,0,0,0,36.9,132.0,68.0,89.0,25.0,89.0,2.0,6920.0,5880.0,670.0,187000.0,11.3,1027.0,,1.0
106,CO,200107,CNU,55,1,-5.0,0.0,0.0,2020-06-22,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,,,0.0,22.0,0,1,1,0,0,0,0,1,1,0,0,0,36.7,117.0,96.0,110.0,18.0,95.0,2.0,5800.0,4100.0,800.0,90000.0,6.6,774.0,0.24,1.0
140,CO,200141,CNU,70,1,-5.0,0.0,0.0,2020-07-04,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6.0,,,0.0,37.0,0,1,1,0,0,0,0,1,0,0,1,0,37.0,150.0,80.0,74.0,20.0,99.0,1.0,5230.0,3500.0,1100.0,130000.0,11.1,432.0,0.34,1.0
164,CO,200165,CNU,60,1,-8.0,0.0,0.0,2020-07-13,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,,,0.0,28.0,0,1,1,0,0,0,0,1,1,0,0,0,36.7,132.0,75.0,110.0,22.0,97.0,2.0,10800.0,10200.0,200.0,198000.0,24.8,742.0,0.35,1.0
167,CO,200168,CNU,60,1,-3.0,0.0,0.0,2020-07-14,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,5.0,,,0.0,23.0,0,1,1,0,0,0,0,1,1,0,0,0,38.5,125.0,87.0,103.0,18.0,96.0,2.0,6800.0,4600.0,1400.0,87000.0,12.3,438.0,0.16,1.0


In [45]:
# Rows with ECMO recorded, ventilator and ICU_date missing, and Mortality 0.
_ecmo_only_rows = (
    new_df2_DD_removed['ventilator'].isna()
    & new_df2_DD_removed['ECMO'].notna()
    & new_df2_DD_removed['ICU_date'].isna()
    & new_df2_DD_removed['Mortality'].eq(0.0)
)
new_df2_DD_removed[_ecmo_only_rows]

Unnamed: 0,diagnosis,No,ID,age,sex,symptom_date,dx_date,adm_date,hospitalized_date,UD_HT,UD_DM,UD_CVD,UD_cancer,UD_other,SMT_fever,SMT_cough,SMT_sputum,SMT_dyspnea,SMT_myalgia,SMT_sorethroat,SMT_mental,SMT_GI,steroid,ventilator,ECMO,ICU_date,Mortality,discharge_date,Mild,Moderate,Severe,TX_0,TX_1,TX_2,TX_3,TX_4,Smoking_0,Smoking_1,Smoking_2,Smoking_3,BT,SBP,DBP,PR,RR,SPO2,CXR,WBC,ANC,ALC,PLT,CRP,LDH,DD,PCR


In [44]:
# Rows with both ventilator and ECMO recorded, ICU_date missing, and Mortality 0.
_ventilator_and_ecmo_rows = (
    new_df2_DD_removed['ventilator'].notna()
    & new_df2_DD_removed['ECMO'].notna()
    & new_df2_DD_removed['ICU_date'].isna()
    & new_df2_DD_removed['Mortality'].eq(0.0)
)
new_df2_DD_removed[_ventilator_and_ecmo_rows]

Unnamed: 0,diagnosis,No,ID,age,sex,symptom_date,dx_date,adm_date,hospitalized_date,UD_HT,UD_DM,UD_CVD,UD_cancer,UD_other,SMT_fever,SMT_cough,SMT_sputum,SMT_dyspnea,SMT_myalgia,SMT_sorethroat,SMT_mental,SMT_GI,steroid,ventilator,ECMO,ICU_date,Mortality,discharge_date,Mild,Moderate,Severe,TX_0,TX_1,TX_2,TX_3,TX_4,Smoking_0,Smoking_1,Smoking_2,Smoking_3,BT,SBP,DBP,PR,RR,SPO2,CXR,WBC,ANC,ALC,PLT,CRP,LDH,DD,PCR
48,CO,200049,CNU,40,1,-1.0,0.0,0.0,2020-03-18,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,6.0,13,,0.0,47.0,0,1,1,0,0,0,1,1,0,0,1,0,39.5,150.0,79.0,90.0,22.0,96.0,1.0,4200.0,2710.0,1230.0,133000.0,5.2,559.0,0.17,1.0
2394,CO,210428,BRH,70,0,-1.0,0.0,0.0,2020-06-02,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,6.0,Positive,,0.0,27.0,0,1,1,0,0,1,0,1,1,0,0,0,37.4,138.0,82.0,88.0,18.0,98.0,1.0,5910.0,3469.17,1672.53,187000.0,2.68,221.0,0.46,1.0
3563,CO,210210,BRH,60,1,-2.0,-1.0,0.0,2020-02-25,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,8.0,8,,0.0,28.0,0,1,1,0,0,1,0,1,1,0,0,0,36.8,133.0,90.0,102.0,20.0,98.0,0.0,3980.0,2054.0,1429.0,115000.0,1.07,184.0,,1.0
9181,CO,224018,SMC,65,0,-12.0,-7.0,0.0,2020-12-17,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,26.0,,0.0,60.0,0,1,1,0,0,0,0,1,1,0,0,0,37.6,137.0,83.0,87.0,24.0,93.0,,17420.0,15330.0,1050.0,165000.0,6.28,518.0,0.59,1.0


In [48]:
# Show every row whose Severe column equals 1.
new_df2_DD_removed.loc[new_df2_DD_removed['Severe'] == 1]

Unnamed: 0,diagnosis,No,ID,age,sex,symptom_date,dx_date,adm_date,hospitalized_date,UD_HT,UD_DM,UD_CVD,UD_cancer,UD_other,SMT_fever,SMT_cough,SMT_sputum,SMT_dyspnea,SMT_myalgia,SMT_sorethroat,SMT_mental,SMT_GI,steroid,ventilator,ECMO,ICU_date,Mortality,discharge_date,Mild,Moderate,Severe,TX_0,TX_1,TX_2,TX_3,TX_4,Smoking_0,Smoking_1,Smoking_2,Smoking_3,BT,SBP,DBP,PR,RR,SPO2,CXR,WBC,ANC,ALC,PLT,CRP,LDH,DD,PCR
11,CO,200012,CNU,35,1,-4.0,0.0,0.0,2020-02-26,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,7.0,,,0.0,29.0,0,1,1,0,0,1,0,0,1,0,0,0,37.7,120.0,88.0,94.0,18.0,97.0,1.0,6560.0,4410.0,1110.0,228000.0,2.30,340.0,,1.0
15,CO,200016,CNU,60,0,-7.0,0.0,0.0,2020-02-26,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,3,3,0.0,31.0,0,1,1,0,0,1,0,0,0,0,0,1,38.4,110.0,96.0,81.0,26.0,95.0,2.0,3080.0,2120.0,740.0,121000.0,7.40,648.0,,1.0
22,CO,200023,CNU,60,0,-12.0,0.0,0.0,2020-02-26,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,10.0,,,0.0,50.0,0,1,1,0,0,0,0,0,0,0,0,1,37.6,104.0,62.0,74.0,24.0,100.0,2.0,9020.0,6440.0,1930.0,258000.0,11.30,454.0,,1.0
25,CO,200026,CNU,75,0,-7.0,0.0,0.0,2020-03-10,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1,1,1.0,53.0,0,1,1,0,0,1,0,0,0,0,0,1,36.5,89.0,53.0,80.0,30.0,100.0,2.0,11760.0,10430.0,950.0,269000.0,8.40,1461.0,3.00,1.0
48,CO,200049,CNU,40,1,-1.0,0.0,0.0,2020-03-18,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,6.0,13,,0.0,47.0,0,1,1,0,0,0,1,1,0,0,1,0,39.5,150.0,79.0,90.0,22.0,96.0,1.0,4200.0,2710.0,1230.0,133000.0,5.20,559.0,0.17,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9170,CO,224007,SMC,65,1,0.0,-12.0,0.0,2020-12-26,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4.0,,,0.0,95.0,0,1,1,0,0,0,0,1,0,1,0,0,36.8,135.0,57.0,89.0,28.0,82.0,,7630.0,7240.0,230.0,179000.0,16.88,729.0,22.74,1.0
9181,CO,224018,SMC,65,0,-12.0,-7.0,0.0,2020-12-17,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,26.0,,0.0,60.0,0,1,1,0,0,0,0,1,1,0,0,0,37.6,137.0,83.0,87.0,24.0,93.0,,17420.0,15330.0,1050.0,165000.0,6.28,518.0,0.59,1.0
9188,CO,224025,SMC,75,1,-6.0,-2.0,0.0,2021-01-03,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,,0.0,,,0.0,60.0,0,1,1,0,1,0,0,1,0,0,0,1,36.3,132.0,81.0,98.0,20.0,90.0,,3770.0,3510.0,190.0,208000.0,17.38,632.0,15.45,1.0
9192,CO,224029,SMC,70,0,-11.0,0.0,0.0,2021-01-02,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,,,0.0,58.0,0,1,1,0,0,0,0,1,0,0,0,1,36.4,100.0,59.0,58.0,18.0,99.0,,7860.0,7230.0,470.0,314000.0,13.84,451.0,4.27,1.0


In [37]:
# Count rows where ICU_date is the only indicator set: ICU_date recorded,
# ventilator and ECMO missing, Mortality equal to 0.
_icu_only = (
    new_df2_DD_removed['ventilator'].isnull()
    & new_df2_DD_removed['ECMO'].isnull()
    & new_df2_DD_removed['ICU_date'].notnull()
    & (new_df2_DD_removed['Mortality'] == 0.0)
)
count = int(_icu_only.sum())
# The label previously said "ventilator", but the filter selects ICU_date-only rows.
print("Number of people selected only through ICU_date not null:", count)

Number of people selected only through ventilator not null: 265


In [38]:
# Count rows where Mortality is the only indicator set: Mortality equal to 1
# while ventilator, ECMO and ICU_date are all missing.
_mortality_only = (
    new_df2_DD_removed['ventilator'].isnull()
    & new_df2_DD_removed['ECMO'].isnull()
    & new_df2_DD_removed['ICU_date'].isnull()
    & (new_df2_DD_removed['Mortality'] == 1.0)
)
count = int(_mortality_only.sum())
# The label previously said "ventilator", but the filter selects Mortality-only rows.
print("Number of people selected only through Mortality == 1:", count)

Number of people selected only through ventilator not null: 89


In [20]:
# Frequency of each Mortality value in the cohort.
new_df2_DD_removed['Mortality'].value_counts()

0.0    5355
1.0     326
Name: Mortality, dtype: int64

### Conduct hospital by hospital analysis

In [7]:
# Non-null count of every viewed column per hospital (ID), ordered by the
# 'No' count in ascending order.
_viewed_by_hospital = new_df1[columnsToView].groupby('ID')
_viewed_by_hospital.count().sort_values(by='No')

Unnamed: 0_level_0,No,age,symptom_date,dx_date,BT,SBP,DBP,PR,RR,SPO2,WBC,ANC,ALC,PLT,CRP,LDH,DD,PCR,Mild,Moderate,Severe
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KWU,42,42,37,42,42,42,42,42,42,42,42,42,42,42,42,21,34,42,42,42,42
DCM,127,127,119,127,127,127,127,104,101,126,127,127,127,127,127,115,24,127,127,127,127
YNU,129,129,129,129,0,0,0,0,0,129,0,129,129,129,124,123,39,129,129,129,129
SCH,209,209,180,209,162,162,162,162,162,156,163,201,201,200,201,201,6,209,209,209,209
AUH,290,290,231,290,290,290,290,288,285,277,287,275,287,283,271,202,192,290,290,290,290
CAU,290,290,262,290,290,290,290,290,290,290,290,290,290,289,290,290,274,290,290,290,290
BSH,294,294,257,294,294,288,288,294,294,290,293,293,293,293,289,40,188,294,294,294,294
JBU,296,296,245,296,246,241,241,246,245,290,238,288,288,288,282,257,177,296,296,296,296
BRH,296,296,293,296,296,296,296,296,296,296,296,295,296,295,296,291,170,296,296,296,296
SMC,331,331,286,331,331,331,331,329,330,329,326,326,326,326,321,320,240,331,331,331,331


In [8]:
# Mean of every viewed column per hospital (ID).
_viewed_by_hospital = new_df1.loc[:, columnsToView].groupby('ID')
_viewed_by_hospital.mean()

Unnamed: 0_level_0,No,age,symptom_date,dx_date,BT,SBP,DBP,PR,RR,SPO2,WBC,ANC,ALC,PLT,CRP,LDH,DD,PCR,Mild,Moderate,Severe
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AUH,223819.575862,66.741379,-5.08658,-2.2,36.838966,128.003448,71.327586,83.104167,20.375439,96.480144,9178.04878,6693.992727,960.222997,191226.14841,8.246716,374.019802,4.096818,0.975862,0.262069,0.737931,0.293103
BRH,210356.040541,46.030405,-2.641638,-0.371622,36.937162,137.425676,85.628378,87.888514,18.827703,97.013514,4952.118243,2998.56722,1464.120541,210664.40678,1.400304,219.666667,0.811412,0.851351,0.841216,0.158784,0.02027
BSH,217334.12585,59.795918,-5.077821,-2.540816,37.155102,130.930556,76.111111,91.0,21.42517,95.531034,8087.406143,6592.710887,863.289352,212952.21843,7.487578,398.35,3.13584,0.989796,0.391156,0.608844,0.289116
CAU,221817.303448,61.675862,-6.625954,-2.889655,36.888276,133.017241,75.903448,85.886207,21.575862,97.296552,8264.724138,6319.413793,1160.5,200972.66436,8.783207,414.406897,9.428029,1.0,0.117241,0.882759,0.589655
CBU,217863.618619,60.405405,-5.457014,-3.807808,36.998187,133.302115,74.045317,84.743202,21.376147,95.018462,6797.765244,6173.250646,1892.270554,199048.318043,7.000399,624.823529,3.612459,0.960961,0.384384,0.615616,0.183183
CNU,205258.247449,53.80102,-4.722045,-0.55102,37.297194,132.461735,82.602041,84.701531,19.293367,97.145408,5252.653061,3484.540816,1264.107143,204234.693878,2.735204,422.565891,0.32939,0.961735,0.778061,0.221939,0.109694
DCM,211423.401575,58.622047,-7.680672,-2.96063,36.961417,126.834646,77.771654,81.701923,20.49505,96.595238,6016.220472,4161.496063,1246.614173,221480.314961,27.219134,312.513043,4.167917,1.0,0.527559,0.472441,0.125984
GIL,215417.245387,56.725092,-4.748219,-1.876384,37.071033,127.667897,78.404059,78.066421,20.042435,96.450185,5585.166974,3828.239852,1241.998152,207432.749077,4.519258,330.127863,2.105208,1.0,0.588561,0.411439,0.092251
JBU,216670.85473,52.314189,-4.902041,-2.679054,37.413008,132.456432,78.958506,83.609756,20.62449,97.617241,5478.319328,3934.791667,1094.895833,196052.083333,4.945486,594.023346,1.296439,1.0,0.530405,0.469595,0.233108
JNU,215081.436503,49.707029,-2.832742,-1.189014,36.717011,130.08848,81.682898,86.33156,19.316194,96.557612,5378.077446,3285.236704,1551.623669,216516.28415,2.052992,431.269964,3.092988,0.992321,0.794448,0.205552,0.052569


### Save the best performing df to new csv files

In [9]:
# Write the transformation-1 frame and its three complete-case cohorts to CSV.
# The index is written as well (pandas default).
for _frame, _csv_path in (
    (new_df1, "../Data/Preprocessed/CRF_Optimal.csv"),
    (new_df1_ALL, "../Data/Preprocessed/CRF_ALL_included.csv"),
    (new_df1_DD_removed, "../Data/Preprocessed/CRF_DD_removed.csv"),
    (new_df1_DD_LDH_removed, "../Data/Preprocessed/CRF_DDLDH_removed.csv"),
):
    _frame.to_csv(_csv_path)

In [24]:
# Write the Initial-only cohort (D-Dimer not required) to CSV, index included.
new_df2_DD_removed.to_csv(path_or_buf="../Data/Preprocessed/CRF_Initial.csv")