# Project 2: Covid

In [11]:
# Import
%matplotlib inline

import os
import numpy as np
from matplotlib import pyplot as plt

import pandas as pd
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Literature

- Résumé littérature
- Quel question on se pose

## 2. Load data that come from data_selection file

In [12]:
# Open file
data_folder = './data/results/'
mylist = []
for chunk in pd.read_csv(data_folder + 'df.csv', sep=',', low_memory=False, chunksize=5000):
    mylist.append(chunk)
df = pd.concat(mylist, axis=0)
df.name = 'df'
del mylist

In [13]:
# Delete row where DSDECOD is NA
df = df[df.DSDECOD != np.nan]

In [27]:
df.iloc[914, 29:60]

INCLAS_DRUGS_FOR_OBSTRUCTIVE_AIRWAY_DISEASES        0.0
INCLAS_EXTRACORPOREAL_MEMBRANE_OXYGENATION          1.0
INCLAS_HIGH_FLOW_OXYGEN_NASAL_CANNULA               1.0
INCLAS_IMMUNOGLOBULINS                              0.0
INCLAS_IMMUNOSTIMULANTS                             0.0
INCLAS_IMMUNOSUPPRESSANTS                           1.0
INCLAS_INSERTION_OF_TRACHEOSTOMY_TUBE               1.0
INCLAS_INTUBATION                                   0.0
INCLAS_LIPID_MODIFYING_AGENTS                       0.0
INCLAS_MUSCLE_RELAXANTS                             0.0
INCLAS_NONINVASIVE_POSITIVE_PRESSURE_VENTILATION    0.0
INCLAS_NONINVASIVE_VENTILATION                      1.0
INCLAS_OTHER_RESPIRATORY_SYSTEM_PRODUCTS            1.0
INCLAS_OXYGEN                                       1.0
INCLAS_PERCUTANEOUS_ENDOSCOPIC_GASTROSTOMY          0.0
INCLAS_PRONE_BODY_POSITION                          1.0
INCLAS_PSYCHOLEPTICS                                0.0
INCLAS_REMOVAL_OF_ENDOTRACHEAL_TUBE             

In [23]:
df = df.apply(pd.to_numeric)

ValueError: Unable to parse string "<7" at position 914

## 3. Model 1: Logistic regression

Model that use method to fill NA => risk of bias because NA not at random.

### Basic logistic regression

In [14]:
# Perform Logistic regression

# Split dataset in features and target variable
X_reg = df.loc[:, df.columns != 'DSDECOD'].to_numpy()
y_reg = df['DSDECOD'].to_numpy()

# Fill NA values
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputer.fit_transform(X_reg)

# Split X and y into training and testing sets
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.3, random_state=16)

# Create pipeline to standardize and make logistic regression
pipe_reg = Pipeline([('scl', StandardScaler()), 
                     ('clf', LogisticRegression())])

# Fit data into the model
pipe_reg.fit(X_reg_train, y_reg_train)

# Predicting values
y_reg_pred = pipe_reg.predict(X_reg_test)

# Calculate accuracy score
accuracy_score = accuracy_score(y_reg_pred, y_reg_test)
print('Accuracy score for logistic regression : ',accuracy_score)

ValueError: could not convert string to float: '<7'

In [None]:
# See coeffs of the model
pipe_reg.named_steps['clf'].coef_

### Play with logistic regression parameters

In [None]:
# Perform Logistic regression with cross-validation

# Split dataset in features and target variable
X_reg = df_reg.loc[:, df_reg.columns != 'DSDECOD'].to_numpy()
y_reg = df_reg['DSDECOD'].to_numpy()

# Fill NA values
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputer.fit_transform(X_reg)

# Split X and y into training and testing sets
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.3, random_state=16)

# Create pipeline to standardize and make logistic regression
pipe_reg = Pipeline([('scl', StandardScaler()), ('clf', LogisticRegression())])

# Set parameters to test
param_reg = {'clf__loss': ['log'],
             'clf__penalty': [None, 'l1', 'l2', 'elasticnet'],
             'clf__alpha': np.linspace(0.15, 0.35),
             'clf__n_iter': [3, 5, 7]}

# Cross-validation
cv_reg = RandomizedSearchCV(estimator = pipe_reg, 
                                         param_distributions=param_reg, 
                                         cv=3, n_iter=30, n_jobs=-1)

# Fit data into the model
cv_reg.fit(X_reg_train, y_reg_train)

# Predicting values
y_reg_pred = cv_reg.predict(X_test)

# Calculate accuracy score
accuracy_score = accuracy_score(y_reg_pred, y_reg_test)
print('Accuracy score for logistic regression : ',accuracy_score)

In [None]:
# See best parameters of the model
g_search.best_params_

In [None]:
# See coeffs of the model
cv_reg.named_steps['clf'].coef_

## 4. Model 2: HistGradientBoostingClassifier

Estimators that allow NaN values for type classifier.

In [None]:
# Perform HistGradientBoostingClassifier

# Split dataset in features and target variable
X_hgbc = df.loc[:, df.columns != 'DSDECOD'].to_numpy()
y_hgbc = df['DSDECOD'].to_numpy()

# Split X and y into training and testing sets
X_hgbc_train, X_hgbc_test, y_hgbc_train, y_hgbc_test = train_test_split(X_hgbc, y_hgbc, test_size=0.3, random_state=16)

# Create pipeline to standardize and make logistic regression
pipe_hgbc = Pipeline([('scl', StandardScaler()), 
                     ('clf', HistGradientBoostingClassifier())])

# Fit data into the model
pipe_hgbc.fit(X_hgbc_train, y_hgbc_train)

# Predicting values
y_hgbc_pred = pipe_hgbc.predict(X_hgbc_test)

# Calculate accuracy score
accuracy_score = accuracy_score(y_hgbc_pred, y_hgbc_test)
print('Accuracy score for logistic regression : ',accuracy_score)

In [None]:
# See coeffs of the model
cv_hgbc.named_steps['clf'].coef_