# Project 2: Covid

In [110]:
# Import
%matplotlib inline

import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Literature

In 2019, the first COVID-19 cases were observed in China. The SARS-CoV-2 virus then spread rapidly worldwide, pushing governments to impose strict measures on the lives of their citizens, such as lockdowns, to protect the population. Indeed, some COVID-19 patients ended up in intensive care, and some of them died.

**The aim of our model is to predict, from parameters that are easy to obtain at the beginning of the study, whether a patient is likely to die or has a high chance of survival.** The purpose of this study is to help hospitals organise themselves when the number of cases is high.


The dataset studied stems from the IDDO Data Repository of COVID-19 data. The data were pulled from the underlying data collection projects on 2022-09-01. They come from 1,200 institutions in over 45 countries and gather various information on 700,000 hospitalised individuals.

To keep only the relevant features, we first reviewed the literature, relying on meta-analysis papers. We started by looking for aggravating factors that are likely to lead a patient to the ICU.

- **Obesity**: according to a meta-analysis by Sales-Peres, there is a correlation between obesity and ICU admission. This paper also concluded that co-morbidities in obese patients, such as hypertension, type 2 diabetes, smoking, lung disease, and/or cardiovascular disease, lead to a higher chance of ICU admission.
- **Age**: patients aged 70 years and above have a higher risk of infection and a higher need for intensive care than patients younger than 70.
- **Sex**: men, when infected, have a higher risk of severe COVID-19 disease and a higher need for intensive care than women (Pijls et al., 2021).
- **Ethnicity**: the risk of contamination was higher in most ethnic minority groups than in their White counterparts in North America and Europe. Among people with confirmed infection, African-Americans and Hispanic Americans were also more likely than White Americans to be hospitalised with SARS-CoV-2 infection. However, the probability of ICU admission was equivalent for all groups. Thus, ethnicity is not relevant to our question.
- **Blood tests**: patients with increased pancreatic enzymes, including elevated serum lipase or amylase of either type, had worse clinical outcomes. Lower levels of lymphocytes and hemoglobin; elevated levels of leukocytes, aspartate aminotransferase, alanine aminotransferase, blood creatinine, blood urea nitrogen, high-sensitivity troponin, creatine kinase, high-sensitivity C-reactive protein, interleukin 6, D-dimer, ferritin, lactate dehydrogenase, and procalcitonin; and a high erythrocyte sedimentation rate were also associated with severe COVID-19.

Out of a total of 3009 citations, 17 articles (22 studies, 21 from China and one study from Singapore) with a total of 3396 patients (ranging from 12 to 1099 patients per study) were included. Our meta-analyses showed a significant decrease in lymphocyte, monocyte, eosinophil, hemoglobin, platelet, albumin, serum sodium, lymphocyte to C-reactive protein ratio (LCR), leukocyte to C-reactive protein ratio (LeCR), and leukocyte to IL-6 ratio (LeIR), and an increase in neutrophil, alanine aminotransferase (ALT), aspartate aminotransferase (AST), total bilirubin, blood urea nitrogen (BUN), creatinine (Cr), erythrocyte sedimentation rate (ESR), C-reactive protein (CRP), procalcitonin (PCT), lactate dehydrogenase (LDH), fibrinogen, prothrombin time (PT), D-dimer, glucose level, and neutrophil to lymphocyte ratio (NLR) in the severe group compared with the non-severe group.

No significant changes in white blood cells (WBC), Creatine Kinase (CK), troponin I, myoglobin, IL-6 and K between the two groups were observed. 

## 2. Load data that come from data_selection file

In [111]:
# Read the CSV in chunks of 5000 rows and concatenate the chunks
# row-wise into one DataFrame.
data_folder = './data/results/'
csv_path = data_folder + 'df_matrixAll_woTime_woNA.csv'
chunk_reader = pd.read_csv(csv_path, sep=',', low_memory=False, chunksize=5000)
df_all = pd.concat(list(chunk_reader), axis=0)
df_all.name = 'df_all'  # label the frame with its variable name
del chunk_reader  # release the reader once all chunks are consumed

In [112]:
# Delete rows where DSDECOD is NA.
# NaN never compares equal to anything (itself included), so the previous
# `df_all.DSDECOD != np.nan` was True for every row and filtered nothing.
# `notna()` is the proper missing-value test.
df_all = df_all[df_all['DSDECOD'].notna()]

## 3. Feature selection

In [None]:
# Write the selected features to disk as CSV (row index included).
# NOTE(review): `df_selection` is not created in the visible cells — confirm
# it exists before running this cell.
df_selection.to_csv(path_or_buf='./data/results/df_selection.csv')

## 3. Model 1: Logistic regression

This model requires a method to fill in missing values (NA), which carries a risk of bias because the values are not missing at random.

### Basic logistic regression

In [None]:
# Perform Logistic regression: build the train/test matrices.

# Split dataset in features and target variable
# NOTE(review): `df` is not defined in the visible cells — presumably the
# frame produced by the feature-selection step; confirm before running.
X_reg = df.loc[:, df.columns != 'DSDECOD'].to_numpy()
y_reg = df['DSDECOD'].to_numpy()

# Split X and y into training and testing sets
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=16)

# Fill NA values.
# The imputer returns a new array (it does not modify its input), so the
# result must be kept. It is fitted on the training rows only and then
# applied to the test rows, so no test information leaks into imputation.
imputer = KNNImputer(n_neighbors=2, weights="uniform")
X_reg_train = imputer.fit_transform(X_reg_train)
X_reg_test = imputer.transform(X_reg_test)

In [None]:
# Create pipeline to standardize and make logistic regression
pipe_reg = Pipeline([('scl', StandardScaler()),
                     ('clf', LogisticRegression())])

# Fit data into the model
pipe_reg.fit(X_reg_train, y_reg_train)

# Predicting values
y_reg_pred = pipe_reg.predict(X_reg_test)

# Calculate accuracy score.
# The result is stored under its own name: assigning it to `accuracy_score`
# would overwrite the metric function and break every later call to it.
# Arguments follow the documented (y_true, y_pred) order.
acc_reg = accuracy_score(y_reg_test, y_reg_pred)
print('Accuracy score for logistic regression : ', acc_reg)

In [None]:
# Display the coefficients learned by the 'clf' step of the fitted pipeline
pipe_reg.named_steps.clf.coef_

### Play with logistic regression parameters

In [None]:
# Perform Logistic regression with cross-validation

# Split dataset in features and target variable
# NOTE(review): `df_reg` is not defined in the visible cells (the basic
# model above reads `df`) — confirm which frame is intended.
X_reg = df_reg.loc[:, df_reg.columns != 'DSDECOD'].to_numpy()
y_reg = df_reg['DSDECOD'].to_numpy()

# Split X and y into training and testing sets
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=16)

# Fill NA values.
# Keep the imputer's output (it returns a new array) and fit it on the
# training rows only so the test set does not influence imputation.
imputer = KNNImputer(n_neighbors=2, weights="uniform")
X_reg_train = imputer.fit_transform(X_reg_train)
X_reg_test = imputer.transform(X_reg_test)

# Create pipeline to standardize and make logistic regression.
# 'saga' is used because it supports both the l1 and l2 penalties searched below.
pipe_reg = Pipeline([('scl', StandardScaler()),
                     ('clf', LogisticRegression(solver='saga'))])

# Set parameters to test.
# The previous grid (loss / alpha / n_iter) listed SGDClassifier parameters
# that LogisticRegression does not accept; these are the equivalent
# LogisticRegression knobs: penalty type, inverse regularisation strength C,
# and the solver iteration budget.
param_reg = {'clf__penalty': ['l1', 'l2'],
             'clf__C': np.logspace(-3, 2, 20),
             'clf__max_iter': [100, 500, 1000]}

# Cross-validation (random_state makes the sampled candidates reproducible)
cv_reg = RandomizedSearchCV(estimator=pipe_reg,
                            param_distributions=param_reg,
                            cv=3, n_iter=30, n_jobs=-1,
                            random_state=16)

# Fit data into the model
cv_reg.fit(X_reg_train, y_reg_train)

# Predicting values (on this cell's own test split)
y_reg_pred = cv_reg.predict(X_reg_test)

# Calculate accuracy score.
# Stored under its own name so the `accuracy_score` function is not overwritten.
acc_cv_reg = accuracy_score(y_reg_test, y_reg_pred)
print('Accuracy score for logistic regression : ', acc_cv_reg)

In [None]:
# See best parameters of the model.
# The fitted search object is `cv_reg`; no `g_search` exists in this notebook.
cv_reg.best_params_

In [None]:
# See coeffs of the model.
# The search object itself has no `named_steps`; the refitted best pipeline
# is exposed as `best_estimator_`.
cv_reg.best_estimator_.named_steps['clf'].coef_

## 4. Model 2: HistGradientBoostingClassifier

This classifier is one of the estimators that accept NaN values natively, so no imputation step is needed.

In [None]:
# Perform HistGradientBoostingClassifier

# Split dataset in features and target variable
# NOTE(review): `df` is not defined in the visible cells — confirm which
# frame is intended before running.
X_hgbc = df.loc[:, df.columns != 'DSDECOD'].to_numpy()
y_hgbc = df['DSDECOD'].to_numpy()

# Split X and y into training and testing sets
X_hgbc_train, X_hgbc_test, y_hgbc_train, y_hgbc_test = train_test_split(
    X_hgbc, y_hgbc, test_size=0.3, random_state=16)

# Create pipeline to standardize and run the gradient-boosting classifier
pipe_hgbc = Pipeline([('scl', StandardScaler()),
                      ('clf', HistGradientBoostingClassifier())])

# Fit data into the model
pipe_hgbc.fit(X_hgbc_train, y_hgbc_train)

# Predicting values
y_hgbc_pred = pipe_hgbc.predict(X_hgbc_test)

# Calculate accuracy score.
# Stored under its own name so the `accuracy_score` function is not
# overwritten; the printed label now names the model actually evaluated.
acc_hgbc = accuracy_score(y_hgbc_test, y_hgbc_pred)
print('Accuracy score for HistGradientBoostingClassifier : ', acc_hgbc)

In [None]:
# See which features drive the model.
# HistGradientBoostingClassifier is tree-based and exposes no `coef_`, and the
# fitted object is `pipe_hgbc` (no `cv_hgbc` exists in this notebook), so the
# features are ranked by permutation importance on the held-out split instead.
perm_hgbc = permutation_importance(pipe_hgbc, X_hgbc_test, y_hgbc_test,
                                   n_repeats=5, random_state=16, n_jobs=-1)
perm_hgbc.importances_mean