# Projet Machine Learning

Antonin Arsac \
Mathilde Perez

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py

from keras.models import load_model
from tensorflow.keras.optimizers import Adam 

### Chargements de dataframe

In [22]:
df_cardio1=pd.read_csv("./data/annotations/cardiologist1.csv")
df_cardio2=pd.read_csv("./data/annotations/cardiologist2.csv")
df_cardior=pd.read_csv("./data/annotations/cardiology_residents.csv")
df_dnn=pd.read_csv("./data/annotations/dnn.csv")
df_emergency=pd.read_csv("./data/annotations/emergency_residents.csv")
df_gold_s=pd.read_csv("./data/annotations/gold_standard.csv")
df_students=pd.read_csv("./data/annotations/medical_students.csv")
df_attributes=pd.read_csv("./data/attributes.csv")

### Predictions

In [13]:
import argparse
#warnings.filterwarnings("ignore")
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from datasets import ECGSequence


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Get performance on test set from hdf5')
    parser.add_argument('path_to_hdf5', type=str,
                        help='path to hdf5 file containing tracings')
    parser.add_argument('path_to_model',  # or model_date_order.hdf5
                        help='file containing training model.')
    parser.add_argument('--dataset_name', type=str, default='tracings',
                        help='name of the hdf5 dataset containing tracings')
    parser.add_argument('--output_file', default="./dnn_output.npy",  # or predictions_date_order.csv
                        help='output csv file.')
    parser.add_argument('-bs', type=int, default=32,
                        help='Batch size.')

    args, unk = parser.parse_known_args(["./data/ecg_tracings.hdf5","./model/model.hdf5"])
    if unk:
        warnings.warn("Unknown arguments:" + str(unk) + ".")

    # Import data
    seq = ECGSequence(args.path_to_hdf5, args.dataset_name, batch_size=args.bs)
    # Import model
    model = load_model(args.path_to_model, compile=False)
    model.compile(loss='binary_crossentropy', optimizer=Adam())
    y_score = model.predict(seq,  verbose=1)

    # Generate dataframe
    np.save(args.output_file, y_score)

    print("Output predictions saved")


Output predictions saved


In [39]:
df_predictions = pd.DataFrame(data = y_score, columns = ["1dAVb","RBBB","LBBB","SB","AF","ST"])
df_predictions

Unnamed: 0,1dAVb,RBBB,LBBB,SB,AF,ST
0,1.424320e-06,1.071004e-07,2.633708e-07,4.537746e-07,9.485395e-07,6.413513e-09
1,2.889732e-02,2.006710e-03,3.177863e-01,2.827815e-05,4.834345e-02,3.205240e-04
2,3.112853e-04,2.940376e-05,4.175250e-06,1.971317e-05,9.349018e-03,2.493307e-05
3,2.396902e-09,1.734487e-09,6.939341e-10,8.173860e-10,5.682133e-09,2.767264e-10
4,5.306005e-04,3.533459e-06,3.394171e-07,1.430139e-06,2.242625e-04,4.707761e-06
...,...,...,...,...,...,...
822,2.662169e-06,2.271630e-07,1.546320e-07,1.969670e-07,1.191893e-05,3.100990e-08
823,4.383028e-04,1.032207e-05,1.012705e-04,4.085047e-08,8.352697e-04,6.998916e-07
824,6.078311e-07,3.985351e-08,1.985754e-08,1.092495e-08,1.017582e-06,3.607297e-08
825,3.941971e-08,2.405503e-09,8.350479e-10,9.778303e-09,6.802794e-08,3.952344e-09


Pour un soucis de lisibilité nous allons arrondir les valeurs.

In [40]:
df_predictions_arrondies=np.round(df_predictions,decimals=4)

In [41]:
df_predictions_arrondies

Unnamed: 0,1dAVb,RBBB,LBBB,SB,AF,ST
0,0.0000,0.0000,0.0000,0.0,0.0000,0.0000
1,0.0289,0.0020,0.3178,0.0,0.0483,0.0003
2,0.0003,0.0000,0.0000,0.0,0.0093,0.0000
3,0.0000,0.0000,0.0000,0.0,0.0000,0.0000
4,0.0005,0.0000,0.0000,0.0,0.0002,0.0000
...,...,...,...,...,...,...
822,0.0000,0.0000,0.0000,0.0,0.0000,0.0000
823,0.0004,0.0000,0.0001,0.0,0.0008,0.0000
824,0.0000,0.0000,0.0000,0.0,0.0000,0.0000
825,0.0000,0.0000,0.0000,0.0,0.0000,0.0000


### Comparaisons avec les humains

Les résultats des humains sont des 1 ou des 0 donc pour les comparer avec les prédictions nous devont arrondir à 0 ou 1 chaque résultats.

In [42]:
df_predictions_arrondies2=np.round(df_predictions,decimals=0)

In [54]:
def stats(df):
    score=0
    for idy in range(6):
        for idx in range(827):
            if df.values[idx,idy]==df_gold_s.values[idx,idy]:
                score+=1
    return score/(827*6)

In [62]:
print("Score du cardiologue 1 : ",stats(df_cardio1))
print("Score du cardiologue 2 : ",stats(df_cardio2))
print("Score du résident en cardiologie : ",stats(df_cardior))
print("Score dnn : ",stats(df_dnn))
print("Score du résident en urgence : ",stats(df_emergency))
print("Score étudiant : ",stats(df_students))
print("Score de notre dnn : ",stats(df_predictions_arrondies))

Score du cardiologue 1 :  0.9943571140669085
Score du cardiologue 2 :  0.9965739621120516
Score du résident en cardiologie :  0.992341797662233
Score dnn :  0.7847642079806529
Score du résident en urgence :  0.9903264812575574
Score étudiant :  0.988714228133817
Score de notre dnn :  0.7089883111648528


### Comparaison avec d'autres classifieurs (linear regression, svm, random forest)

In [63]:
#a finir