# Importation de Numpy, Pandas et Matplotlib

In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Charger le fichier de données

In [103]:
# Read the Excel workbook, then work on a copy so that `data` keeps the frame as loaded.
data = pd.read_excel('coeur1.xlsx')
df = data.copy()

# Vérifier s'il y a des doublons ; si oui, supprimez-les

In [104]:
# Flag each row that repeats an earlier row (True = duplicate); the mask is only displayed.
df.duplicated()


0      False
1      False
2      False
3      False
4      False
       ...  
913    False
914    False
915    False
916    False
917    False
Length: 918, dtype: bool

In [105]:
# drop_duplicates() returns a new frame and leaves the original untouched;
# reassign it so the removal actually takes effect, then display the result.
df = df.drop_duplicates()
df

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,40,homme,AA,140,289,0,Normal,172,Non,0.0,Ascendant,0
1,49,femme,DNA,160,180,0,Normal,156,Non,1.0,Plat,1
2,37,homme,AA,130,283,0,ST,98,Non,0.0,Ascendant,0
3,48,femme,ASY,138,214,0,Normal,108,Oui,1.5,Plat,1
4,54,homme,DNA,150,195,0,Normal,122,Non,0.0,Ascendant,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,homme,AT,110,264,0,Normal,132,Non,1.2,Plat,1
914,68,homme,ASY,144,193,1,Normal,141,Non,3.4,Plat,1
915,57,homme,ASY,130,131,0,Normal,115,Oui,1.2,Plat,1
916,57,femme,AA,130,236,0,LVH,174,Non,0.0,Plat,1


# Vérifier s'il y a des données manquantes ; si oui, supprimez-les

In [106]:
# df.isnull().any().any() reduces the check to a single boolean:
# False means there is no missing data, True means at least one value is missing.

#df.isnull().any().any()
# Element-wise mask: True wherever a value is missing.
df.isna()

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
913,False,False,False,False,False,False,False,False,False,False,False,False
914,False,False,False,False,False,False,False,False,False,False,False,False
915,False,False,False,False,False,False,False,False,False,False,False,False
916,False,False,False,False,False,False,False,False,False,False,False,False


In [107]:
# Alternatively, run a quick check to see whether any missing values remain.

# Another option that can be used:

df.isnull().values.any()

False

# Vérifier s'il n'y a pas de constantes

In [108]:
# Count distinct values per column; a column with a single distinct value would be constant.
df.nunique()

AGE             50
SEXE             2
TDT              4
PAR             67
CHOLESTEROL    222
GAJ              2
ECG              3
FCMAX          119
ANGINE           2
DEPRESSION      53
PENTE            3
CŒUR             2
dtype: int64

# Vérifier les valeurs manquantes

In [109]:
# Element-wise missing-value mask (True where a value is missing).
df.isna()

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
913,False,False,False,False,False,False,False,False,False,False,False,False
914,False,False,False,False,False,False,False,False,False,False,False,False
915,False,False,False,False,False,False,False,False,False,False,False,False
916,False,False,False,False,False,False,False,False,False,False,False,False


In [110]:
# Count how many rows share each combination of missing / non-missing flags.
df.isna().value_counts()

AGE    SEXE   TDT    PAR    CHOLESTEROL  GAJ    ECG    FCMAX  ANGINE  DEPRESSION   PENTE  CŒUR 
False  False  False  False  False        False  False  False  False   False        False  False    918
dtype: int64

# 5. Normaliser les variables quantitatives

# Cette opération nous permet de diviser chaque colonne par la valeur maximale de la colonne :
`df[col] / df[col].max()`


# `df['CŒUR'] = df['CŒUR'].astype('object')` nous permet de changer le type de la colonne CŒUR en `object`, afin qu'elle ne soit pas normalisée par la fonction ci-dessous

In [111]:
# Temporarily cast the target column to object dtype: ah() skips object
# columns, so the label is left unscaled by the normalization.
df['CŒUR'] = df['CŒUR'].astype('object')

In [112]:

def ah(df):
    """Scale every non-object column of ``df`` by that column's maximum.

    The frame is modified in place and also returned, so callers may either
    rely on the mutation or use the return value.

    Args:
        df: DataFrame whose non-object columns are divided by their own
            maximum. Object-dtype columns are left untouched.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Determine the object-dtype columns once instead of re-running
    # select_dtypes on every loop iteration.
    object_columns = df.select_dtypes('object').columns
    for col in df.columns:
        if col in object_columns:
            continue
        col_max = df[col].max()
        # Dividing by a zero maximum would fill the column with NaN/inf;
        # leave such a column unchanged instead.
        if col_max == 0:
            continue
        df[col] = df[col] / col_max
    return df

# Normalize the non-object columns of df in place; the returned frame is shown as the cell output.
ah(df)
    

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,0.519481,homme,AA,0.70,0.479270,0.0,Normal,0.851485,Non,0.000000,Ascendant,0
1,0.636364,femme,DNA,0.80,0.298507,0.0,Normal,0.772277,Non,0.161290,Plat,1
2,0.480519,homme,AA,0.65,0.469320,0.0,ST,0.485149,Non,0.000000,Ascendant,0
3,0.623377,femme,ASY,0.69,0.354892,0.0,Normal,0.534653,Oui,0.241935,Plat,1
4,0.701299,homme,DNA,0.75,0.323383,0.0,Normal,0.603960,Non,0.000000,Ascendant,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.584416,homme,AT,0.55,0.437811,0.0,Normal,0.653465,Non,0.193548,Plat,1
914,0.883117,homme,ASY,0.72,0.320066,1.0,Normal,0.698020,Non,0.548387,Plat,1
915,0.740260,homme,ASY,0.65,0.217247,0.0,Normal,0.569307,Oui,0.193548,Plat,1
916,0.740260,femme,AA,0.65,0.391376,0.0,LVH,0.861386,Non,0.000000,Plat,1


# 6. Recoder les variables qualitatives

In [113]:
df['CŒUR'] = df['CŒUR'].astype('int') # cast the target column back to an integer dtype

In [114]:
# def recode_serie(serie):
#     return serie.astype("category").cat.codes

# Cette fonction nous permet de recoder une colonne du DataFrame (une série == une colonne).

In [115]:
def recode_serie(serie):
    """Encode a column as integer category codes.

    The series is cast to the pandas ``category`` dtype and the integer code
    of each value's category is returned, aligned on the original index.
    """
    as_category = serie.astype("category")
    codes = as_category.cat.codes
    return codes

In [116]:
def recode_variable_qualitative(df):
    """Replace every object-dtype column of ``df`` with integer codes.

    Each object column is passed through ``recode_serie`` and written back
    under the same name. The frame is modified in place and also returned.
    """
    object_columns = df.select_dtypes("object").columns
    for name in object_columns:
        df[name] = recode_serie(df[name])
    return df
# Recode every object column of df in place; the returned frame is shown as the cell output.
recode_variable_qualitative(df)

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,0.519481,1,0,0.70,0.479270,0.0,1,0.851485,0,0.000000,0,0
1,0.636364,0,3,0.80,0.298507,0.0,1,0.772277,0,0.161290,2,1
2,0.480519,1,0,0.65,0.469320,0.0,2,0.485149,0,0.000000,0,0
3,0.623377,0,1,0.69,0.354892,0.0,1,0.534653,1,0.241935,2,1
4,0.701299,1,3,0.75,0.323383,0.0,1,0.603960,0,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.584416,1,2,0.55,0.437811,0.0,1,0.653465,0,0.193548,2,1
914,0.883117,1,1,0.72,0.320066,1.0,1,0.698020,0,0.548387,2,1
915,0.740260,1,1,0.65,0.217247,0.0,1,0.569307,1,0.193548,2,1
916,0.740260,0,0,0.65,0.391376,0.0,0,0.861386,0,0.000000,2,1


# Découpage : variables explicatives dans X

In [117]:
# Feature matrix: every column except the target, selected by name so the
# split does not silently depend on the target being the last column.
X = df.drop(columns=['CŒUR'])
X

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE
0,0.519481,1,0,0.70,0.479270,0.0,1,0.851485,0,0.000000,0
1,0.636364,0,3,0.80,0.298507,0.0,1,0.772277,0,0.161290,2
2,0.480519,1,0,0.65,0.469320,0.0,2,0.485149,0,0.000000,0
3,0.623377,0,1,0.69,0.354892,0.0,1,0.534653,1,0.241935,2
4,0.701299,1,3,0.75,0.323383,0.0,1,0.603960,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...
913,0.584416,1,2,0.55,0.437811,0.0,1,0.653465,0,0.193548,2
914,0.883117,1,1,0.72,0.320066,1.0,1,0.698020,0,0.548387,2
915,0.740260,1,1,0.65,0.217247,0.0,1,0.569307,1,0.193548,2
916,0.740260,0,0,0.65,0.391376,0.0,0,0.861386,0,0.000000,2


# Decoupage dans Y

In [118]:
# Target vector: the CŒUR column.
Y = df['CŒUR']
Y

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: CŒUR, Length: 918, dtype: int32

# Répartir le jeu de données (entraînement / test)

In [119]:
# Hold out 30% of the rows for testing; random_state=0 keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Entraînement du modèle de régression logistique

In [120]:
# Train the model

prototype = LogisticRegression(random_state=0)

prototype.fit(X_train, Y_train)
# Print the model's score on the training set, then on the held-out test set.
print(prototype.score(X_train, Y_train))
print(prototype.score(X_test, Y_test))

0.8660436137071651
0.8260869565217391


In [121]:
# Créer la matrice de confusion, puis calculer la précision et la sensibilité (rappel)
# confusion
# Précision

In [123]:
# Confusion matrix on the test set: true labels first, model predictions second.
confusion_matrix(Y_test, prototype.predict(X_test))

array([[ 91,  22],
       [ 26, 137]], dtype=int64)

In [124]:
# Precision of the test-set predictions.
precision_score(Y_test, prototype.predict(X_test))

0.8616352201257862

In [125]:
# Recall (sensitivity) of the test-set predictions.
recall_score(Y_test, prototype.predict(X_test))

0.8404907975460123