# Projet Machine Learning

Antonin Arsac \
Mathilde Perez

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py

from keras.models import load_model
from tensorflow.keras.optimizers import Adam 

### Chargements des dataframes
\
Ici, df_dnn est le dataframe donné par le réseau de neuronne décrit dans l'article Nature

In [2]:
df_cardio1=pd.read_csv("./data/annotations/cardiologist1.csv")
df_cardio2=pd.read_csv("./data/annotations/cardiologist2.csv")
df_cardior=pd.read_csv("./data/annotations/cardiology_residents.csv")
df_dnn=pd.read_csv("./data/annotations/dnn.csv")
df_emergency=pd.read_csv("./data/annotations/emergency_residents.csv")
df_gold_s=pd.read_csv("./data/annotations/gold_standard.csv")
df_students=pd.read_csv("./data/annotations/medical_students.csv")
df_attributes=pd.read_csv("./data/attributes.csv")

### Predictions

In [3]:
import argparse
#warnings.filterwarnings("ignore")
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from datasets import ECGSequence


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Get performance on test set from hdf5')
    parser.add_argument('path_to_hdf5', type=str,
                        help='path to hdf5 file containing tracings')
    parser.add_argument('path_to_model',  # or model_date_order.hdf5
                        help='file containing training model.')
    parser.add_argument('--dataset_name', type=str, default='tracings',
                        help='name of the hdf5 dataset containing tracings')
    parser.add_argument('--output_file', default="./dnn_output.npy",  # or predictions_date_order.csv
                        help='output csv file.')
    parser.add_argument('-bs', type=int, default=32,
                        help='Batch size.')

    args, unk = parser.parse_known_args(["./data/ecg_tracings.hdf5","./data/model/model.hdf5"])
    if unk:
        warnings.warn("Unknown arguments:" + str(unk) + ".")

    # Import data
    seq = ECGSequence(args.path_to_hdf5, args.dataset_name, batch_size=args.bs)
    # Import model
    model = load_model(args.path_to_model, compile=False)
    model.compile(loss='binary_crossentropy', optimizer=Adam())
    y_score = model.predict(seq,  verbose=1)

    # Generate dataframe
    np.save(args.output_file, y_score)

    print("Output predictions saved")


Output predictions saved


In [4]:
df_predictions = pd.DataFrame(data = y_score, columns = ["1dAVb","RBBB","LBBB","SB","AF","ST"])
df_predictions

Unnamed: 0,1dAVb,RBBB,LBBB,SB,AF,ST
0,1.424320e-06,1.071004e-07,2.633703e-07,4.537750e-07,9.485395e-07,6.413525e-09
1,2.889732e-02,2.006710e-03,3.177863e-01,2.827818e-05,4.834345e-02,3.205240e-04
2,3.112853e-04,2.940376e-05,4.175250e-06,1.971315e-05,9.349018e-03,2.493309e-05
3,2.396912e-09,1.734494e-09,6.939355e-10,8.173876e-10,5.682145e-09,2.767275e-10
4,5.306005e-04,3.533459e-06,3.394174e-07,1.430142e-06,2.242625e-04,4.707765e-06
...,...,...,...,...,...,...
822,2.662169e-06,2.271635e-07,1.546323e-07,1.969674e-07,1.191892e-05,3.100990e-08
823,4.383028e-04,1.032207e-05,1.012705e-04,4.085054e-08,8.352697e-04,6.998936e-07
824,6.078317e-07,3.985351e-08,1.985758e-08,1.092500e-08,1.017582e-06,3.607297e-08
825,3.941979e-08,2.405508e-09,8.350511e-10,9.778340e-09,6.802806e-08,3.952344e-09


Pour un soucis de lisibilité nous allons arrondir les valeurs.

In [5]:
df_predictions_arrondies=np.round(df_predictions,decimals=4)

In [6]:
df_predictions_arrondies

Unnamed: 0,1dAVb,RBBB,LBBB,SB,AF,ST
0,0.0000,0.0000,0.0000,0.0,0.0000,0.0000
1,0.0289,0.0020,0.3178,0.0,0.0483,0.0003
2,0.0003,0.0000,0.0000,0.0,0.0093,0.0000
3,0.0000,0.0000,0.0000,0.0,0.0000,0.0000
4,0.0005,0.0000,0.0000,0.0,0.0002,0.0000
...,...,...,...,...,...,...
822,0.0000,0.0000,0.0000,0.0,0.0000,0.0000
823,0.0004,0.0000,0.0001,0.0,0.0008,0.0000
824,0.0000,0.0000,0.0000,0.0,0.0000,0.0000
825,0.0000,0.0000,0.0000,0.0,0.0000,0.0000


### Comparaisons avec les humains

Les résultats des humains sont des 1 ou des 0 donc pour les comparer avec les prédictions nous devont arrondir à 0 ou 1 chaque résultats.

In [7]:
df_predictions_arrondies2=np.round(df_predictions,decimals=0)

In [8]:
def stats(df):
    score=0
    for idy in range(6):
        for idx in range(827):
            if df.values[idx,idy]==df_gold_s.values[idx,idy]:
                score+=1
    return score/(827*6)

In [9]:
print("Score du cardiologue 1 : ",stats(df_cardio1))
print("Score du cardiologue 2 : ",stats(df_cardio2))
print("Score du résident en cardiologie : ",stats(df_cardior))
print("Score dnn : ",stats(df_dnn))
print("Score du résident en urgence : ",stats(df_emergency))
print("Score étudiant : ",stats(df_students))
print("Score de notre dnn : ",stats(df_predictions_arrondies))

Score du cardiologue 1 :  0.9943571140669085
Score du cardiologue 2 :  0.9965739621120516
Score du résident en cardiologie :  0.992341797662233
Score dnn :  0.7847642079806529
Score du résident en urgence :  0.9903264812575574
Score étudiant :  0.988714228133817
Score de notre dnn :  0.7089883111648528


### Etude du NN

##  Comparaison avec d'autres classifieurs

Les données que l'on doit traiter sont ici des données binaires. 

In [10]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn.svm import SVC

###  On prépare ici les donnée d'entraînement et de test

In [49]:
with h5py.File('./data/ecg_tracings.hdf5', "r") as f:
    x = np.array(f['tracings'])
    

In [50]:
y_gold_s = df_gold_s['1dAVb'] + df_gold_s.RBBB + df_gold_s.LBBB + df_gold_s.SB + df_gold_s.AF + df_gold_s.ST

In [51]:
Y = y_gold_s


On remarque qu'il y a des valeurs supérieurs à 1 dans le dataframe.

In [52]:
Y[Y.values>1]=1

In [53]:
X_train, X_test, Y_train, Y_test = train_test_split(x, Y, test_size = 0.2, random_state = 0)

In [54]:
nb_index, nbx,nby = np.shape(X_train)
x_train = np.reshape(X_train,((nb_index,nbx*nby)))

nb_index_test, nbx_test,nby_test = np.shape(X_test)
x_test = np.reshape(X_test,((nb_index_test,nbx_test*nby_test)))

### Essai avec un algorithme de type SVM

In [55]:
csvm = SVC(kernel = 'linear', random_state = 2)
csvm.fit(x_train,Y_train)

SVC(kernel='linear', random_state=2)

In [56]:
y_pred_svm = csvm.predict(x_test)

In [59]:
cm = confusion_matrix(Y_test,y_pred_svm)
print(cm)
accuracy_score(Y_test,y_pred_svm)

[[119  14]
 [ 29   4]]


0.7409638554216867

In [60]:
print(classification_report(Y_test,y_pred_svm))

              precision    recall  f1-score   support

           0       0.80      0.89      0.85       133
           1       0.22      0.12      0.16        33

    accuracy                           0.74       166
   macro avg       0.51      0.51      0.50       166
weighted avg       0.69      0.74      0.71       166

