In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.metrics import recall_score,precision_score,classification_report
from sklearn.metrics import confusion_matrix


In [116]:
#importation des données
data=pd.read_excel("Coeur.xlsx")

In [117]:
#copie des données 
df=data.copy()

In [118]:
df.head(10)

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,40,homme,AA,140,289,0,Normal,172,Non,0.0,Ascendant,0
1,49,femme,DNA,160,180,0,Normal,156,Non,1.0,Plat,1
2,37,homme,AA,130,283,0,ST,98,Non,0.0,Ascendant,0
3,48,femme,ASY,138,214,0,Normal,108,Oui,1.5,Plat,1
4,54,homme,DNA,150,195,0,Normal,122,Non,0.0,Ascendant,0
5,39,homme,DNA,120,339,0,Normal,170,Non,0.0,Ascendant,0
6,45,femme,AA,130,237,0,Normal,170,Non,0.0,Ascendant,0
7,54,homme,AA,110,208,0,Normal,142,Non,0.0,Ascendant,0
8,37,homme,ASY,140,207,0,Normal,130,Oui,1.5,Plat,1
9,48,femme,AA,120,284,0,Normal,120,Non,0.0,Ascendant,0


# 1-verification de doublon et suppression 

In [119]:
#duplicated permet de verifier si des lignes sont identique et sum permet de compter toute les lignes qui sont dupliqué
df.duplicated().sum()

0

# 2- vérification des valeurs manquantes 

In [120]:
#vérification de valeurs manquantes
df.isna().sum()

AGE            0
SEXE           0
TDT            0
PAR            0
CHOLESTEROL    0
GAJ            0
ECG            0
FCMAX          0
ANGINE         0
DEPRESSION     0
PENTE          0
CŒUR           0
dtype: int64

# 3-vérification des constantes et si oui suppression

In [121]:
#vérification de constante
def constante(df):
    list_const=[]
    for col in df.columns:
        if len(df[col].unique())<1:
            list_const.append(col)
        else:
            pass
    return len(list_const)
            
        
#df.nunique()    

In [122]:
constante(df)

0

# 4- récodons les variables qualitative

In [123]:
#division de nos feature en variable qualitative et numerique
var_numer=df._get_numeric_data().columns
var_qual=list(set(df.columns) - set(var_numer))

In [124]:
# l'encodage des variables qualitative
def recoder(serie):
    return serie.astype('category').cat.codes

In [125]:
def encodage(df):
    for  i in df.select_dtypes("object").columns:
        df[i]=recoder(df[i])
    return df

In [126]:
encodage(df)
    

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,40,1,0,140,289,0,1,172,0,0.0,0,0
1,49,0,3,160,180,0,1,156,0,1.0,2,1
2,37,1,0,130,283,0,2,98,0,0.0,0,0
3,48,0,1,138,214,0,1,108,1,1.5,2,1
4,54,1,3,150,195,0,1,122,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,2,110,264,0,1,132,0,1.2,2,1
914,68,1,1,144,193,1,1,141,0,3.4,2,1
915,57,1,1,130,131,0,1,115,1,1.2,2,1
916,57,0,0,130,236,0,0,174,0,0.0,2,1


# NORMALISATION DES VARIABLES QUANTITATIVES

In [127]:
def normalisation(df):
    for col in var_numer:
        if col=='CŒUR':
            pass
        else:
             
            df[col]=df[col]/df[col].max()
    return df

In [128]:
normalisation(df)

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,0.519481,1,0,0.70,0.479270,0.0,1,0.851485,0,0.000000,0,0
1,0.636364,0,3,0.80,0.298507,0.0,1,0.772277,0,0.161290,2,1
2,0.480519,1,0,0.65,0.469320,0.0,2,0.485149,0,0.000000,0,0
3,0.623377,0,1,0.69,0.354892,0.0,1,0.534653,1,0.241935,2,1
4,0.701299,1,3,0.75,0.323383,0.0,1,0.603960,0,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.584416,1,2,0.55,0.437811,0.0,1,0.653465,0,0.193548,2,1
914,0.883117,1,1,0.72,0.320066,1.0,1,0.698020,0,0.548387,2,1
915,0.740260,1,1,0.65,0.217247,0.0,1,0.569307,1,0.193548,2,1
916,0.740260,0,0,0.65,0.391376,0.0,0,0.861386,0,0.000000,2,1


# Division des données en train et test

In [129]:
# division de notre dataset en feature et target
X=df.drop("CŒUR",axis=1)

In [130]:
Y=df["CŒUR"]

In [131]:
# repatition de nos données en données d'entrainement train et de test 
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=5)

In [132]:
X_test

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE
236,0.532468,1,1,0.60,0.557214,0.0,1,0.584158,1,0.483871,2
151,0.623377,1,0,0.50,0.263682,0.0,1,0.495050,0,0.000000,0
329,0.779221,1,1,0.65,0.000000,1.0,2,0.643564,1,0.177419,1
416,0.818182,1,1,0.70,0.431177,0.0,2,0.554455,1,0.483871,2
795,0.545455,1,3,0.60,0.398010,1.0,1,0.960396,0,0.129032,1
...,...,...,...,...,...,...,...,...,...,...,...
803,0.805195,0,1,0.70,0.653400,0.0,0,0.777228,0,0.193548,2
521,0.792208,1,1,0.60,0.467662,0.0,2,0.668317,1,0.645161,1
249,0.636364,1,1,0.65,0.565506,0.0,1,0.594059,1,0.161290,2
588,0.870130,1,1,0.70,0.363184,0.0,2,0.603960,1,0.322581,2


In [133]:
X_test.to_csv(r"C:\Users\cyberTechs\Desktop\DATASET\X_test.csv",index=None,header=True)

In [134]:
# les dimensions de nos jeux données repartie
print("les dimensions de X_train sont {}".format(X_train.shape))
print("les dimensions de X_test sont {}".format(X_test.shape))
print("les dimensions de Y_train sont {}".format(Y_train.shape))
print("les dimensions de Y_test sont {}".format(Y_test.shape))
X_testt=np.array(X_test)
X_testt.ndim

les dimensions de X_train sont (642, 11)
les dimensions de X_test sont (276, 11)
les dimensions de Y_train sont (642,)
les dimensions de Y_test sont (276,)


2

# ENTRAINEMENT DU MODEL SANS STANDARDISER LES DONNEES 

# ENTRAINEMENT DU MODEL

In [135]:
#choisir un model et l'instancié arbre_deci = tree.DecisionTreeClasifier()
decision_tree = tree.DecisionTreeClassifier()

In [136]:
#entrainement de notre model
decision_tree.fit(X_train, Y_train)

DecisionTreeClassifier()

In [137]:
prediction= decision_tree.predict(X_test)

In [138]:
#le score de notre model
score = decision_tree.score(X_test, Y_test)
print(score)

0.8188405797101449


# EVALUATION DE NOTRE MODEL

In [139]:
precision = precision_score(Y_test, prediction)
recall = recall_score(Y_test, prediction)

print('Precision: ',precision)
print('Recall: ',recall)


Precision:  0.8689655172413793
Recall:  0.802547770700637


# CONFUSION MATRIX

In [140]:
# la matrice de confusion pour mieux apprecier la performance de notre model
CM = confusion_matrix(Y_test, prediction)
#predict_value=pd.Series(logisticRegr.predict(X_test),name="prediction")
#df_confusion = pd.crosstab(Y_test, predict_value)
#df_confusion
CM

array([[100,  19],
       [ 31, 126]], dtype=int64)

# ENTRAINEMENT DU MODEL SUR LES DONNEES STANDARDISER 

In [141]:
# STANTARDISONS NOTRE DF
standa_scaler=StandardScaler()
X_train2=standa_scaler.fit_transform(X_train)
X_test2=standa_scaler.fit_transform(X_test)
len(X_test2)

276

In [142]:
#Model arbre de décision 2
model2=tree.DecisionTreeClassifier()

In [143]:
model2.fit(X_train2,Y_train)

DecisionTreeClassifier()

In [144]:
model2.score(X_test2,Y_test)

0.8007246376811594

In [151]:
prediction2=model2.predict(X_test)

In [155]:
precision2 = precision_score(Y_test, prediction2)
recall2 = recall_score(Y_test, prediction2)

print('Precision: ',precision2)
print('Recall: ',recall2)


Precision:  0.8793103448275862
Recall:  0.6496815286624203


# ENREGISTREMENT DE NOTRE MODEL AVEC PICKLE

In [146]:
#pickle.dump(logisticRegr,open("model.pkl","wb"))

In [147]:
#Z=np.array([0.24,0.4,1,0.12,0,1,0.5,0.78,1,0,1])
#Z=Z.reshape(1,-1)
#Z

In [148]:
#logisticRegr = logisticRegr.predict(Z)

In [149]:
#x=[int(x) for x in range(11)]

In [150]:
#x_np=np.array(x).reshape(-1,1)
#x_np.ndim