In [1]:
import pandas as pd
import pathlib
import numpy as np
import json
import re
from matplotlib import pyplot
from tqdm import tqdm
import itertools
from collections import Counter

In [2]:
import tensorflow as tf 
tf.__version__

'2.1.0-rc1'

In [3]:
# Render up to 200 columns when a DataFrame is displayed, so wide feature tables are not truncated.
pd.options.display.max_columns = 200

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score,classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Import des données et nettoyage

In [7]:
# Load the already-cleaned CSV export, then discard its 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export — confirm).
df_swt = pd.read_csv("./swt_v2_cleaned.csv")
df_swt = df_swt.drop(columns=['Unnamed: 0'])

In [8]:
# At this stage of the study, feature selection is run on a reduced group of patients.
# Other patient ids noted in the original comment: 9578, 9104, 11077.
patient_kept = [8544,9839,10418]

In [9]:
# Keep only the rows belonging to the selected patient(s).
# .copy() makes the subset an independent DataFrame, so the in-place edits applied to it
# afterwards (dropna on 'label', label re-assignment) do not raise SettingWithCopyWarning
# and do not depend on whether pandas returned a view or a copy.
df_swt = df_swt[df_swt['Patient'].isin(patient_kept)].copy()

In [10]:
# Discard every row whose label is missing.
df_swt = df_swt.dropna(subset=['label'])

In [11]:
# Binarise the label: 0 stays 0, any other value becomes 1.
df_swt['label'] = df_swt['label'].ne(0).astype('int64')

In [12]:
# Turn +inf / -inf into NaN so they are handled like any other missing value.
df_swt = df_swt.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [13]:
# Impute the remaining NaN values exam by exam: within each 'cle_exam' group, the
# missing entries of a column are filled with that column's mean over the group.
# A column that is entirely NaN inside a group stays NaN (its group mean is NaN).
# NOTE(review): x.mean() is applied to every transformed column — presumably all of
# them are numeric; confirm, since non-numeric columns can make mean() fail.

df_swt2 = df_swt.groupby(df_swt['cle_exam']).transform(lambda x: x.fillna(x.mean()))

In [14]:
# The grouping key is missing after the groupby/transform step, so put it back.
# Values are copied by position (raw array), not aligned on the index.
df_swt2 = df_swt2.assign(cle_exam=df_swt['cle_exam'].values)

## Temporalisation des données

In [15]:
# Helper that turns a time series into a table of lagged / forecast columns.
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Re-frame a time series as a table of lagged and forecast columns.

    Every column ``c`` of ``data`` is repeated as ``c(t-n_in) ... c(t-1)``
    (past observations), ``c(t)`` (current observation) and
    ``c(t+1) ... c(t+n_out-1)`` (future observations), each built with
    ``DataFrame.shift``.

    Parameters
    ----------
    data : list, array-like or pandas.DataFrame
        Observations, one row per time step, in chronological order.
        Anything accepted by ``pd.DataFrame(data)``.
    n_in : int, default 1
        Number of lagged steps ``(t-1 ... t-n_in)`` to add.
    n_out : int, default 1
        Number of steps starting at ``t`` ``(t, t+1, ... t+n_out-1)`` to add.
    dropnan : bool, default True
        Remove the edge rows left incomplete by the shifts: the first ``n_in``
        rows and the last ``n_out - 1`` rows. Rows are removed by position
        rather than with ``dropna()``, so NaN values already present in
        ``data`` never cause a row to be dropped.

    Returns
    -------
    pandas.DataFrame
        One row per retained time step, columns ordered from the oldest lag
        to the furthest forecast step.
    """
    df = pd.DataFrame(data)
    cols, names = [], []

    # Input sequence (t-n_in, ..., t-1): oldest lag first.
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['%s(t-%d)' % (col, lag) for col in df.columns]

    # Forecast sequence (t, t+1, ..., t+n_out-1).
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['%s(t)' % col for col in df.columns]
        else:
            names += ['%s(t+%d)' % (col, step) for col in df.columns]

    # Put it all together.
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    if dropnan:
        # Backward shifts blank the first n_in rows, forward shifts blank the last
        # n_out - 1 rows; trim both edges (previously only the leading edge was trimmed).
        trailing = max(n_out - 1, 0)
        stop = max(len(agg) - trailing, n_in)
        agg = agg.iloc[n_in:stop]
    return agg

In [16]:
# Number of past periods (lags) taken into account for each row.
nb_period = 12

In [17]:
# Build the temporalised dataset one exam at a time, so that the lag window of a row
# never spans two different exams.
# The per-exam frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies everything accumulated so far at every iteration.
exam_frames = [
    series_to_supervised(df_swt2[df_swt2['cle_exam'] == exam_key], nb_period, 1)
    for exam_key in df_swt.cle_exam.unique()
]

# pd.concat raises on an empty list; fall back to an empty frame, as the loop version produced.
df_swt_temp = pd.concat(exam_frames) if exam_frames else pd.DataFrame()

## Cleaning des colonnes du data set après avoir appliqué la temporalisation

###### Liste des dropped_columns/ features avec toutes les variables

In [18]:
# Base names of the columns that are removed from the temporalised dataset
# (their lagged "(t-k)" and "(t)" variants are generated and dropped further down).
dropped_columns=['interval_index','Dossier Patient','exam_duration','Pan_vs_SWT','SWT_vs_XQRS',
                 'interval_start_time','max_duration','mean_nni%','csi%','hf%']

# Columns used only at time t (no lagged variants are selected for them).
features_non_temp =['cle_exam']

# Numeric base features; combinations of these are what the feature search iterates over.
numerical_feat = ['mean_nni','sdnn','sdsd','nni_50','pnni_50','nni_20','pnni_20','rmssd',
                  'median_nni','range_nni','cvsd','cvnni','mean_hr','max_hr','min_hr','std_hr',
                  'lf','hf','vlf','lf_hf_ratio',
                  'csi','cvi','Modified_csi','sd1','sd2','ratio_sd2_sd1','sampen']

# Categorical base features (one-hot encoded before training).
categorical_feat = ['Patient']
# Every base feature that is selected at each lag and at time t.
features_temp = numerical_feat+categorical_feat

In [20]:
# Names of the columns to drop, generated for every lag (t-1 ... t-nb_period) and for (t),
# so the list follows nb_period automatically.
dropped_columns_temp = []
for base_name in dropped_columns:
    for lag in range(1, nb_period + 1):
        dropped_columns_temp.append(base_name + "(t-" + str(lag) + ")")
for base_name in dropped_columns:
    dropped_columns_temp.append(base_name + "(t)")
# Not enabled: the lagged Patient columns ("Patient(t-1)" ... "Patient(t-nb_period)")
# could be added to this list as well.

In [21]:
# Remove the unwanted columns (all lags) from the temporalised dataset.
df_swt_temp = df_swt_temp.drop(columns=dropped_columns_temp)

## Génération des X et Y, puis séparation en Train et Test

In [22]:
# Model inputs: every temporal feature at each lag (t-1 ... t-nb_period) and at (t),
# plus the non-temporal columns at (t) only. The target is the label at time t.
features_list = []
for feat in features_temp:
    for lag in range(1, nb_period + 1):
        features_list.append('%s(t-%d)' % (feat, lag))
for feat in features_temp:
    features_list.append('%s(t)' % feat)
for feat in features_non_temp:
    features_list.append('%s(t)' % feat)
features_list.sort()

target_variable = "label(t)"

X = df_swt_temp.loc[:, features_list]
Y = df_swt_temp.loc[:, target_variable]

In [23]:
# Train/test split: 20 % of the rows go to the test set, stratified on the label, fixed seed.
# NOTE(review): the split is row-wise, so rows of the same exam can end up on both
# sides (the recorded output reports 107 exams in train and 107 in test). Presumably a
# split grouped by 'cle_exam(t)' would give a more honest test score — confirm intent.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)


In [24]:
# Basic figures to report alongside the results: number of exams and of rows per split.
nb_exams_train = len(X_train['cle_exam(t)'].unique())
nb_exams_test = len(X_test['cle_exam(t)'].unique())

print("nb exams sur le train: ", nb_exams_train)
print("nb exams sur le test: ", nb_exams_test)
print("nb lignes sur le train: ", len(X_train))
print("nb lignes sur le test: ", len(X_test))

nb exams sur le train:  107
nb exams sur le test:  107
nb lignes sur le train:  5476
nb lignes sur le test:  1370


In [25]:
# 'cle_exam(t)' was only kept to compute the exam and row counts; it is not a model input.
cle_exam_list = ['cle_exam(t)']

X_train = X_train.drop(columns=cle_exam_list)
X_test = X_test.drop(columns=cle_exam_list)

# Entraînement en boucle d'un Decision Tree en testant des combinaisons de N features (2, 3, 4… ; N = 6 dans cette exécution, avec au plus 5000 combinaisons tirées au hasard)

In [29]:
# Build the list of feature combinations (N features each) that will be evaluated.
from random import Random, sample  # `sample` stays imported in case other cells call it

# Size of each combination: the larger N is, the longer the search takes.
N = 6
# Upper bound on the number of combinations that are actually trained.
MAX_COMBINATIONS = 5000
# Fixed seed so the sampled subset — and therefore the results — is the same on every run.
SAMPLING_SEED = 42

list_permuts = list(itertools.combinations(numerical_feat, N))

# Too many combinations to train them all: keep a random subset of MAX_COMBINATIONS.
if len(list_permuts) > MAX_COMBINATIONS:
    list_permuts = Random(SAMPLING_SEED).sample(list_permuts, MAX_COMBINATIONS)

print(len(list_permuts))

5000


In [31]:
# Encoding helper: picks the lagged columns of the requested features, scales the numeric
# ones and one-hot encodes the categorical ones.
def my_normalisation_ecoding(X_train, X_test, numeric_feat, categoric_feat, nb_t):
    """Standardise the numeric columns and one-hot encode the categorical ones.

    For each base feature name, the columns ``name(t-1)`` ... ``name(t-nb_t)`` and
    ``name(t)`` are looked up in ``X_train``. The transformers are fitted on
    ``X_train`` only and then applied to ``X_test``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Train and test inputs; ``X_train`` must contain every lagged column of
        the requested features (``get_loc`` raises ``KeyError`` otherwise).
    numeric_feat : iterable of str
        Base names of the features scaled with ``StandardScaler``.
    categoric_feat : iterable of str
        Base names of the features encoded with ``OneHotEncoder(drop='first')``.
    nb_t : int
        Number of lags available for every feature.

    Returns
    -------
    tuple
        ``(X_train_fit, X_test_fit)``, the ``ColumnTransformer`` outputs for the
        train and test sets: encoded categorical columns first, scaled numeric
        columns after.
    """

    def column_positions(base_names):
        # Positions in X_train of every lagged column, then of the (t) column, sorted.
        positions = []
        for base in base_names:
            for lag in range(1, nb_t + 1):
                positions.append(X_train.columns.get_loc('%s(t-%d)' % (base, lag)))
        for base in base_names:
            positions.append(X_train.columns.get_loc('%s(t)' % base))
        positions.sort()
        return positions

    cat_positions = column_positions(categoric_feat)
    num_positions = column_positions(numeric_feat)

    encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first'), cat_positions),
            ('num', StandardScaler(), num_positions),
        ]
    )

    # Fit on the training set only, then reuse the fitted transformers on the test set.
    encoded_train = encoder.fit_transform(X_train)
    encoded_test = encoder.transform(X_test)
    return encoded_train, encoded_test


In [32]:
# Train one Decision Tree per feature combination and record its train/test metrics.
# The per-combination rows are accumulated in a plain list and assembled once after the
# loop: DataFrame.append re-copied the whole table at every iteration and no longer
# exists in pandas >= 2.0.
result_rows = []

# The target does not depend on the feature combination, so it is selected once.
Y = df_swt_temp.loc[:, target_variable]

for list_i in tqdm(list_permuts):
    # Expand the selected base features (plus the categorical ones) into lagged column names.
    list_i_full = list(list_i) + categorical_feat
    features_list = ['%s(t-%d)' % (i, j) for i in list_i_full for j in range(1, nb_period + 1)]
    features_list += ['%s(t)' % i for i in list_i_full]
    features_list = sorted(features_list)

    X = df_swt_temp.loc[:, features_list]

    # Same seed and stratification at every iteration, so every combination is
    # evaluated on the same train/test rows.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42, stratify=Y
    )

    X_train_fit, X_test_fit = my_normalisation_ecoding(
        X_train, X_test, list_i, categorical_feat, nb_period
    )

    # Alternative model tried previously:
    # XGBClassifier(n_jobs=8, verbosity=0, learning_rate=0.5, max_depth=10, n_estimators=500)
    # random_state is fixed so that the ranking of combinations is reproducible.
    classifier = DecisionTreeClassifier(random_state=42)
    classifier.fit(X_train_fit, Y_train)
    Y_train_pred = classifier.predict(X_train_fit)
    Y_test_pred = classifier.predict(X_test_fit)

    # One single-row frame per combination; the column names (including the
    # "confustion" spelling) are kept as-is because they end up in the saved CSV.
    result_rows.append(pd.DataFrame({"iteration": [list_i],
                   "confustion matrix train": [confusion_matrix(Y_train, Y_train_pred)],
                   "confustion matrix test": [confusion_matrix(Y_test, Y_test_pred)],
                   "recall score train": [recall_score(Y_train, Y_train_pred)],
                   "recall score test": [recall_score(Y_test, Y_test_pred)],
                   "f1_score train": [f1_score(Y_train, Y_train_pred)],
                   "f1_score test": [f1_score(Y_test, Y_test_pred)],
                   }))

# Assemble every result in a single DataFrame "results_features".
results_features = pd.concat(result_rows, ignore_index=True)
# Sort by decreasing test F1 score.
results_features = results_features.sort_values(by='f1_score test', ascending=False)
# Save the results, one file per combination size.
results_features.to_csv('./results_{}features.csv'.format(N))

100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [35:44<00:00,  2.33it/s]


##### Résultats : meilleures combinaisons de features

In [None]:
# Count how many times each feature appears among the 30 best combinations
# (results_features is sorted by decreasing test F1 score, so head(30) is the top 30).
top_combinations = results_features.head(30)['iteration']
count_features = [feature for combo in top_combinations for feature in combo]

Counter(count_features).most_common()