In [1]:
# Import
%matplotlib inline

import os
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

%load_ext autoreload
%autoreload 2

## 1. Loading the data

In [2]:
# Open file
data_folder = './data/results/'
mylist = []
for chunk in pd.read_csv(data_folder + 'df_final_I-DataSelection.csv', sep=',', low_memory=False, chunksize=5000):
    mylist.append(chunk)
df = pd.concat(mylist, axis=0)
df.name = 'df'
del mylist
df = df[df.DSDECOD != np.nan]

## 2. Processing of the data

In [3]:
categorical_data = pd.Series(['SEX', 'CONTINENT_AF', 'CONTINENT_AS', 'CONTINENT_EU', 'CONTINENT_NA', 'CONTINENT_OC',
                    'CONTINENT_SA','HODECOD', 'INCLAS_AGENTS_ACTING_ON_THE_RENIN-ANGIOTENSIN_SYSTEM', 'INCLAS_ANALGESICS',
                    'INCLAS_ANESTHETICS', 'INCLAS_ANTIBACTERIALS_FOR_SYSTEMIC_USE', 'INCLAS_ANTIHELMINTICS', 'INCLAS_ANTIINFLAMMATORY_AND_ANTIRHEUMATIC_PRODUCTS,_NON-STEROIDS',
                    'INCLAS_ANTIMALARIALS', 'INCLAS_ANTIMYCOTICS_FOR_SYSTEMIC_USE', 'INCLAS_ANTITHROMBOTIC_AGENTS',
                    'INCLAS_ANTIVIRALS_FOR_SYSTEMIC_USE', 'INCLAS_ARTIFICIAL_RESPIRATION', 'INCLAS_BETA_BLOCKING_AGENTS',
                    'INCLAS_BLOOD_SUBSTITUTES_AND_PERFUSION_SOLUTIONS', 'INCLAS_BRONCHOSCOPY', 'INCLAS_CARDIAC_PACING',
                    'INCLAS_CARDIAC_THERAPY', 'INCLAS_CARDIOPULMONARY_RESUSCITATION', 'INCLAS_CHEMOTHERAPY', 'INCLAS_CORTICOSTEROIDS_FOR_SYSTEMIC_USE',
                    'INCLAS_DIURETICS', 'INCLAS_DRUGS_FOR_ACID_RELATED_DISORDERS', 'INCLAS_DRUGS_FOR_OBSTRUCTIVE_AIRWAY_DISEASES', 'INCLAS_EXTRACORPOREAL_MEMBRANE_OXYGENATION', 'INCLAS_HIGH_FLOW_OXYGEN_NASAL_CANNULA'
                    'INCLAS_IMMUNOGLOBULINS', 'INCLAS_IMMUNOSTIMULANTS', 'INCLAS_IMMUNOSUPPRESSANTS', 'INCLAS_INSERTION_OF_TRACHEOSTOMY_TUBE', 'INCLAS_INTUBATION',
                    'INCLAS_LIPID_MODIFYING_AGENTS', 'INCLAS_MUSCLE_RELAXANTS', 'INCLAS_NONINVASIVE_POSITIVE_PRESSURE_VENTILATION', 'INCLAS_NONINVASIVE_VENTILATION', 'INCLAS_OTHER_RESPIRATORY_SYSTEM_PRODUCTS', 'INCLAS_OXYGEN',
                    'INCLAS_PERCUTANEOUS_ENDOSCOPIC_GASTROSTOMY', 'INCLAS_PRONE_BODY_POSITION', 'INCLAS_PSYCHOLEPTICS', 'INCLAS_REMOVAL_OF_ENDOTRACHEAL_TUBE', 'INCLAS_RENAL_REPLACEMENT', 'INCLAS_REPLACEMENT_AGENT',
                    'INCLAS_TOTAL_PARENTERAL_NUTRITION', 'INCLAS_TRANSFUSION_OF_BLOOD_PRODUCT', 'INCLAS_VACCINES',
                    'MBTEST_ADENOVIRUS', 'MBTEST_BACTERIA','MBTEST_INFLUENZA', 'MBTEST_MB_SEVERE_ACUTE_RESP_SYND_CORONAVIRUS',
                    'MBTEST_OTHER PATHOGENS', 'MBTEST_OTHER RESPIRATORY PATHOGENS', 'MBTEST_RSV', 'RPSTRESC', 'RSCAT_AVPU', 'SACAT_COMORBIDITIES',
                    'SACAT_COMPLICATIONS', 'SACAT_PREVIOUS_COVID-19_INFECTION', 'SCTEST_Appropriate_Developmental_Age_Indicator', 'SCTEST_Breast_Fed_Indicator', 
                    'SCTEST_Infant_Less_Than_One_Year_Indicator', 'SCTEST_Premature_Birth_Indicator'])

In [4]:
X_ = df.loc[:, df.columns != 'DSDECOD']
y_class = df['DSDECOD']

In [5]:
#check if labels in dataFrame
columns_label = X_.columns
labels_in = np.isin(categorical_data, columns_label)
categorical_data = categorical_data[labels_in]

In [6]:
#continuous data
labels_in = np.isin(columns_label, categorical_data)
continuous_data = columns_label[labels_in ==False]

### 2.1 Filling the missing data

#### 2.1.1 Categorical data

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
imp_cat = SimpleImputer(strategy = "most_frequent")
df_cat = imp_cat.fit_transform(X_[categorical_data])

#### 2.1.2 Continuous data

In [9]:
imp_con = SimpleImputer(strategy = "median")
df_con = imp_cat.fit_transform(X_[continuous_data])

### 2.2 Standardization of the continuous data

In [10]:
df_con = (df_con - df_con.mean())/df_con.std()

### 2.3 Separating the features from the outcome

In [11]:
X_cat = pd.DataFrame(df_cat, columns = categorical_data)
X_con = pd.DataFrame(df_con, columns = continuous_data)

In [12]:
X_class = pd.concat([X_cat, X_con], axis=1)

les propositions sont faites à partir de: https://scikit-learn.org/stable/modules/feature_selection.html

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron

In [14]:
classifiers = ['LogisticRegression()', 'GradientBoostingClassifier()',
              'KNeighborsClassifier()', 'GaussianNB()', 'RandomForestClassifier()',
              'svm.SVC()', 'svm.LinearSVC()', 'MLPClassifier()', 'DecisionTreeClassifier()',
              'DecisionTreeClassifier()', 'ExtraTreesClassifier()', 'LinearDiscriminantAnalysis()',
              'QuadraticDiscriminantAnalysis()', 'SGDClassifier()', 'xgb.XGBClassifier(objective="binary:logistic", random_state=42)',
              'GaussianProcessClassifier()', 'PassiveAggressiveClassifier()', 'Perceptron()']

## 3. Recursive feature elimination

In [33]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification

In [19]:
# Build a classification task using 3 informative features
X, y = X_class, y_class

# Create the RFE object and compute a cross-validated score.
classifier = #à remplir
# The "accuracy" scoring shows the proportion of correct classifications

min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(estimator=classifier, step=1, cv=KFold(2),scoring="average_precision",
    min_features_to_select=min_features_to_select)

feature_names = np.array(X.columns)
print(f"Features selected by Recursive feature elimination: {feature_names[rfecv.get_support()]}")

print("Optimal number of features : %d" % rfecv.n_features_)

KeyboardInterrupt: 

In [None]:
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (average precision)")
plt.plot(range(min_features_to_select, len(rfecv.grid_scores_) + min_features_to_select),rfecv.grid_scores_)
plt.show()

In [None]:
df_sfm = df[feature_names[sfm.get_support()]]
df_sfm = pd.concat([df_sfm, y_reg], axis=1)
df_sfm.to_csv('./data/results/df_forward-RFECV.csv')

## 4. Sequential Feature Selection

In [33]:
from sklearn.feature_selection import SelectFromModel

In [35]:
X, y = X_class, y_class

classification = #à remplir.fit(X, y)
importance = np.abs(classification.coef_)

threshold = np.sort(importance)[-3] + 0.01
feature_names = np.array(X.columns)
sfm = SelectFromModel(classification, threshold=threshold).fit(X, y)
print(f"Features selected by SelectFromModel: {feature_names[sfm.get_support()]}")

Features selected by SelectFromModel: []


or

In [50]:
from sklearn.feature_selection import SequentialFeatureSelector

In [52]:
sfs_forward = SequentialFeatureSelector(ridge, n_features_to_select=20, direction="forward").fit(X, y)

sfs_backward = SequentialFeatureSelector(ridge, n_features_to_select=20, direction="backward").fit(X, y)

In [15]:
feature_names = X.columns
print("Features selected by forward sequential selection: "f"{feature_names[sfs_forward.get_support()]}")

print("Features selected by backward sequential selection: "f"{feature_names[sfs_backward.get_support()]}")

NameError: name 'X' is not defined

In [None]:
df_sfm = df[feature_names[sfm.get_support()]]
df_back = df[feature_names[sfs_backward.get_support()]]
df_forw = df[feature_names[sfs_forward.get_support()]]

In [None]:
df_sfm = pd.concat([df_sfm, y_reg], axis=1)
df_back = pd.concat([df_back, y_reg], axis=1)
df_forw = pd.concat([df_forw, y_reg], axis=1)

In general, the two methods would lead to different results.

We also note that the features selected by SFS differ from those selected by feature importance.

To finish with, we should note that SelectFromModel is significantly faster than SFS. Indeed, SelectFromModel only needs to fit a model once, while SFS needs to cross-validate many different models for each of the iterations. SFS however works with any model, while SelectFromModel requires the underlying estimator to expose a coef_ attribute or a feature_importances_ attribute. The forward SFS is faster than the backward SFS because it only needs to perform n_features_to_select = 2 iterations, while the backward SFS needs to perform n_features - n_features_to_select = 8 iterations.

In [None]:
#register the files
df_forw.to_csv('./data/results/df_forward-FeatureSelection.csv')
df_back.to_csv('./data/results/df_backward-FeatureSelection.csv')
df_sfm.to_csv('./data/results/df_final_I-DataSelection.csv')