In [128]:
# Import
%matplotlib inline

import os
import os.path as op
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Loading data

In [129]:
# Open file
data_folder = op.join(os.getcwd(), "data", "results")
mylist = []
for chunk in pd.read_csv(op.join(data_folder, 'df_final_I-DataSelection.csv'), sep=',', low_memory=False, chunksize=5000, index_col=0):
    mylist.append(chunk)
df = pd.concat(mylist, axis=0)
df.name = 'df'
del mylist

## 2. Stratification per country

In [130]:
continent_list = ['CONTINENT_AF', 'CONTINENT_AS', 'CONTINENT_EU','CONTINENT_NA', 'CONTINENT_OC', 'CONTINENT_SA']

In [131]:
#data afrique
df_africa = df[df['CONTINENT_AF'] == 1]
df_africa.drop(continent_list, axis=1, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_africa.drop(continent_list, axis=1, inplace=True)


In [132]:
#data asia
df_asia = df[df['CONTINENT_AS'] == 1]
df_asia.drop(continent_list, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_asia.drop(continent_list, axis=1, inplace=True)


In [133]:
#data europe
df_europe = df[df['CONTINENT_EU'] == 1]
df_europe.drop(continent_list, axis=1, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_europe.drop(continent_list, axis=1, inplace=True)


In [134]:
#data south america
df_south_america = df[df['CONTINENT_SA'] == 1]
df_south_america.drop(continent_list, axis=1, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_south_america.drop(continent_list, axis=1, inplace=True)


In [135]:
#data north america
df_north_america = df[df['CONTINENT_NA'] == 1]
df_north_america.drop(continent_list, axis=1, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_north_america.drop(continent_list, axis=1, inplace=True)


In [136]:
#data oceania 
df_oceania = df[df['CONTINENT_OC'] == 1]
df_oceania.drop(continent_list, axis=1, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oceania.drop(continent_list, axis=1, inplace=True)


In [137]:
#relative size of each slice of data
print(f"Data size AFRICA: {df_africa.size}")
print(f"Data size ASIA: {df_asia.size}")
print(f"Data size EUROPE: {df_europe.size}")
print(f"Data size NORTH AMERICA: {df_north_america.size}")
print(f"Data size SOUTH AMERICA: {df_south_america.size}")
print(f"Data size OCEANIA: {df_oceania.size}")

Data size AFRICA: 47985994
Data size ASIA: 772534
Data size EUROPE: 9024526
Data size NORTH AMERICA: 595252
Data size SOUTH AMERICA: 605640
Data size OCEANIA: 3332


In [138]:
print("The two bigger datasets are AFRICA and EUROPE. We are going to keep working with these lines")

The two bigger datasets are AFRICA and EUROPE. We are going to keep working with these lines


## 3. Forcing the dataset to have an equal number of death and survival

### 3.1 Check the outcome repartition in each dataset

In [139]:
#africa repartition
df_africa_death = df_africa[df_africa['DSDECOD']== 1]
df_africa_alive = df_africa[df_africa['DSDECOD']== 0]
print(f"AFRICA death: {100*df_africa_death.size/df_africa.size}%")
print(f"AFRICA survival: {100-100*df_africa_death.size/df_africa.size}%")

AFRICA death: 21.60448317481972%
AFRICA survival: 78.39551682518028%


In [140]:
#europe repartition
df_europe_death = df_europe[df_europe['DSDECOD']== 1]
df_europe_alive = df_europe[df_europe['DSDECOD']== 0]
print(f"EUROPE death: {100*df_europe_death.size/df_europe.size}%")
print(f"EUROPE survival: {100-100*df_europe_death.size/df_europe.size}%")

EUROPE death: 25.678977488679184%
EUROPE survival: 74.32102251132082%


The data is unbalanced. We would want to force the data repaprtion towards an 50-50% repartition

### 3.2 Forcing the balance

In [141]:
data_set_size = 20000

In [142]:
#africa
df_africa_death = df_africa_death.sample(frac = 1).iloc[0:int(data_set_size/2+1)]
df_africa_alive = df_africa_alive.sample(frac = 1).iloc[0:int(data_set_size/2+1)]
df_africa = df_africa_death.append(df_africa_alive, ignore_index = True)

  df_africa = df_africa_death.append(df_africa_alive, ignore_index = True)


In [143]:
#europe
df_europe_death = df_europe_death.sample(frac = 1).iloc[0:int(data_set_size/2)]
df_europe_alive = df_europe_alive.sample(frac = 1).iloc[0:int(data_set_size/2)]
df_europe = df_europe_death.append(df_europe_alive, ignore_index = True)

  df_europe = df_europe_death.append(df_europe_alive, ignore_index = True)


## 4. Feature selection

In [147]:
categorical = ['DSDECOD',
               'SEX',
               'HODECOD', 
               'IETEST_Acute_Respiratory_Infection', 'IETEST_Cough', 'IETEST_Covid_ICU', 'IETEST_Covid_admission', 'IETEST_Dyspnoea_Tachypnoea', 'IETEST_Fever', 
               'IETEST_Inflammatory_MultiSystem_Syndrome', 'IETEST_noCovid_ICU',               
               'INCLAS_VACCINES', 
               'MBTEST_ADENOVIRUS', 'MBTEST_BACTERIA','MBTEST_INFLUENZA', 'MBTEST_MB_SEVERE_ACUTE_RESP_SYND_CORONAVIRUS', 'MBTEST_OTHER PATHOGENS', 
               'MBTEST_OTHER RESPIRATORY PATHOGENS', 'MBTEST_RSV', 
               'RPSTRESC', 
               'RSCAT_AVPU', 
               'SACAT_COMORBIDITIES', 'SACAT_COMPLICATIONS', 'SACAT_PREVIOUS_COVID-19_INFECTION', 
               'SCTEST_Appropriate_Developmental_Age_Indicator', 'SCTEST_Breast_Fed_Indicator', 'SCTEST_Infant_Less_Than_One_Year_Indicator', 
               'SCTEST_Premature_Birth_Indicator']

In [148]:
# First for some columns in particular, it is more "logical" to put 0 (baseline) to fill NA

NA_to_0 = ['HODECOD', 
           'IETEST_Acute_Respiratory_Infection', 'IETEST_Cough', 'IETEST_Covid_ICU', 'IETEST_Covid_admission', 'IETEST_Dyspnoea_Tachypnoea', 'IETEST_Fever', 
           'IETEST_Inflammatory_MultiSystem_Syndrome', 'IETEST_noCovid_ICU',  
           'INCLAS_VACCINES', 
           'MBTEST_ADENOVIRUS', 'MBTEST_BACTERIA','MBTEST_INFLUENZA', 'MBTEST_MB_SEVERE_ACUTE_RESP_SYND_CORONAVIRUS', 'MBTEST_OTHER PATHOGENS', 
           'MBTEST_OTHER RESPIRATORY PATHOGENS', 'MBTEST_RSV', 
           'RPSTRESC', 
           'RSCAT_AVPU', 
           'SACAT_COMORBIDITIES', 'SACAT_COMPLICATIONS']

In [161]:
# Fill NA and standardize

def preProcess(df_train, df_test, categorical, NA_to_0) :
    
    # 1. Replace NA by 0 in NA_to_0 columns
    
    df_train[df_train.columns[df_train.columns.isin(NA_to_0)]] = df_train[df_train.columns[df_train.columns.isin(NA_to_0)]].fillna(0)
    df_test[df_test.columns[df_test.columns.isin(NA_to_0)]] = df_test[df_test.columns[df_test.columns.isin(NA_to_0)]].fillna(0)
    
    # 2. Separate in categorical and continuous columns
    
    df_train_cat = df_train[df_train.columns[df_train.columns.isin(categorical)]]
    df_train_con = df_train[df_train.columns[~df_train.columns.isin(categorical)]]
    df_test_cat = df_test[df_test.columns[df_test.columns.isin(categorical)]]
    df_test_con = df_test[df_test.columns[~df_test.columns.isin(categorical)]]
    
    cols_cat = df_train_cat.columns
    cols_con = df_train_con.columns
    print(cols_con)
    
    # 3. Fill the missing values 
        
        # For categorical variables
    imp_cat = SimpleImputer(strategy = "most_frequent")
    imp_cat = imp_cat.fit(df_train_cat)
    
    df_train_cat = imp_cat.transform(df_train_cat)
    df_test_cat = imp_cat.transform(df_test_cat)
    df_train_cat = pd.DataFrame(df_train_cat, columns=cols_cat)
    df_test_cat = pd.DataFrame(df_test_cat, columns=cols_cat)
    
        # For continuous variables
    imp_con = SimpleImputer(strategy = "median")
    imp_con = imp_con.fit(df_train_con)
    print(df_train_con.shape)
    df_train_con = imp_con.transform(df_train_con)
    print(np.shape(df_train_con))
    df_test_con = imp_con.transform(df_test_con)
    df_train_con = pd.DataFrame(df_train_con, columns=cols_con)
    df_test_con = pd.DataFrame(df_test_con, columns=cols_con)

    # 4. Standardization of continuous data
    print('hey3')
    scaler = StandardScaler()
    scaler = scaler.fit(df_train_con)
    print('hey4')
    df_train_con = scaler.transform(df_train_con)
    df_test_con = scaler.transform(df_test_con)
    df_train_con = pd.DataFrame(df_train_con, columns=cols_con)
    df_test_con = pd.DataFrame(df_test_con, columns=cols_con)
    
    # 5. Creation of X and y matrixes
    print('hey')
    df_train = pd.concat([df_train_cat, df_train_con], axis=1)
    df_test = pd.concat([df_test_cat, df_test_con], axis=1)
    print('nana')
    X_train = df_train.loc[:, df_train.columns!='DSDECOD']
    y_train = df_train['DSDECOD']
    X_test = df_test.loc[:, df_test.columns!='DSDECOD']
    y_test = df_test['DSDECOD']
    
    # 6. Get feature names
    print('iccii')
    feature_names = df_train.loc[:, df_train.columns!='DSDECOD'].columns
    print('laa')
    return X_train, y_train, X_test, y_test, feature_names

### 4.1 Without treatments

In [145]:
treatment_list = ['INCLAS_AGENTS_ACTING_ON_THE_RENIN-ANGIOTENSIN_SYSTEM', 'INCLAS_ANALGESICS', 'INCLAS_ANESTHETICS', 
               'INCLAS_ANTIBACTERIALS_FOR_SYSTEMIC_USE', 'INCLAS_ANTIHELMINTICS', 'INCLAS_ANTIINFLAMMATORY_AND_ANTIRHEUMATIC_PRODUCTS,_NON-STEROIDS',
               'INCLAS_ANTIMALARIALS', 'INCLAS_ANTIMYCOTICS_FOR_SYSTEMIC_USE', 'INCLAS_ANTITHROMBOTIC_AGENTS',
               'INCLAS_ANTIVIRALS_FOR_SYSTEMIC_USE', 'INCLAS_ARTIFICIAL_RESPIRATION', 'INCLAS_BETA_BLOCKING_AGENTS',
               'INCLAS_BLOOD_SUBSTITUTES_AND_PERFUSION_SOLUTIONS', 'INCLAS_BRONCHOSCOPY', 'INCLAS_CARDIAC_PACING',
               'INCLAS_CARDIAC_THERAPY', 'INCLAS_CARDIOPULMONARY_RESUSCITATION', 'INCLAS_CHEMOTHERAPY', 'INCLAS_CORTICOSTEROIDS_FOR_SYSTEMIC_USE',
               'INCLAS_DIURETICS', 'INCLAS_DRUGS_FOR_ACID_RELATED_DISORDERS', 'INCLAS_DRUGS_FOR_OBSTRUCTIVE_AIRWAY_DISEASES', 'INCLAS_EXTRACORPOREAL_MEMBRANE_OXYGENATION', 
               'INCLAS_HIGH_FLOW_OXYGEN_NASAL_CANNULA', 'INCLAS_IMMUNOGLOBULINS', 'INCLAS_IMMUNOSTIMULANTS', 'INCLAS_IMMUNOSUPPRESSANTS', 
               'INCLAS_INSERTION_OF_TRACHEOSTOMY_TUBE', 'INCLAS_INTUBATION', 'INCLAS_LIPID_MODIFYING_AGENTS', 'INCLAS_MUSCLE_RELAXANTS', 
               'INCLAS_NONINVASIVE_POSITIVE_PRESSURE_VENTILATION', 'INCLAS_NONINVASIVE_VENTILATION', 'INCLAS_OTHER_RESPIRATORY_SYSTEM_PRODUCTS', 
               'INCLAS_OXYGEN', 'INCLAS_PERCUTANEOUS_ENDOSCOPIC_GASTROSTOMY', 'INCLAS_PRONE_BODY_POSITION', 'INCLAS_PSYCHOLEPTICS', 
               'INCLAS_REMOVAL_OF_ENDOTRACHEAL_TUBE', 'INCLAS_RENAL_REPLACEMENT', 'INCLAS_REPLACEMENT_AGENT', 'INCLAS_TOTAL_PARENTERAL_NUTRITION', 
               'INCLAS_TRANSFUSION_OF_BLOOD_PRODUCT']

#### 4.1.1 Africa

In [146]:
df_africa.drop(treatment_list, axis=1, inplace=True)

In [149]:
df_africa_train, df_africa_test = train_test_split(df_africa, test_size=0.3, random_state=16) 

In [162]:
X_africa_train, y_africa_train, X_africa_test, y_afica_test, features = preProcess(df_africa_train, df_africa_test, categorical, NA_to_0) 

Index(['AGE', 'LBTEST_ALT', 'LBTEST_APTT', 'LBTEST_APTTSTND', 'LBTEST_AST',
       'LBTEST_BASEEXCS', 'LBTEST_BICARB', 'LBTEST_BILI', 'LBTEST_CD4',
       'LBTEST_CK', 'LBTEST_CREAT', 'LBTEST_CRP', 'LBTEST_FERRITIN',
       'LBTEST_GLUC', 'LBTEST_HCT', 'LBTEST_HGB', 'LBTEST_INR', 'LBTEST_K',
       'LBTEST_LACTICAC', 'LBTEST_LDH', 'LBTEST_LYM', 'LBTEST_NEUT',
       'LBTEST_PCT', 'LBTEST_PH', 'LBTEST_PLAT', 'LBTEST_PT', 'LBTEST_SODIUM',
       'LBTEST_UREAN', 'LBTEST_WBC', 'RSCAT_GCS_NINDS_VERSION',
       'VSTEST_CPLRFLT', 'VSTEST_DIABP', 'VSTEST_HEIGHT', 'VSTEST_HR',
       'VSTEST_MAP', 'VSTEST_OXYSAT', 'VSTEST_RESP', 'VSTEST_SYSBP',
       'VSTEST_TEMP', 'VSTEST_WEIGHT'],
      dtype='object')
(14001, 40)
(14001, 31)
hey1
hey2


ValueError: Shape of passed values is (14001, 31), indices imply (14001, 40)

In [163]:
# Fit data into the model
clf = LogisticRegression().fit(X_train, y_train)

# Predicting values
y_pred = clf.predict(X_test)

# Calculate performance scores
accuracy = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test)
precision = precision_score(y_pred, y_test)
recall = recall_score(y_pred, y_test)
roc_auc = roc_auc_score(y_pred, y_test)

# Print performance
print('Performance for Logistic regression:')
print('  - Accuracy score = {:.2f}'.format(accuracy))
print('  - F1 score = {:.2f}'.format(f1))
print('  - Precision score = {:.2f}'.format(precision))
print('  - Recall score = {:.2f}'.format(recall))
print('  - ROC AUC score = {:.2f}'.format(roc_auc))

# Add performance to df_results
df_results = df_results.append(pd.Series({"Model" : "Logistic regression", 
                                          "Accuracy" : accuracy,
                                          "F1" : f1,
                                          "Precision": precision,
                                          "Recall" : recall,
                                          "ROC AUC" : roc_auc}), ignore_index = True)

NameError: name 'X_train' is not defined