In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Upload dataset

In [73]:
data = pd.read_excel('../data/Coeur.xlsx')
df = data.copy()

df.head()

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,40,homme,AA,140,289,0,Normal,172,Non,0.0,Ascendant,0
1,49,femme,DNA,160,180,0,Normal,156,Non,1.0,Plat,1
2,37,homme,AA,130,283,0,ST,98,Non,0.0,Ascendant,0
3,48,femme,ASY,138,214,0,Normal,108,Oui,1.5,Plat,1
4,54,homme,DNA,150,195,0,Normal,122,Non,0.0,Ascendant,0


# Vérifier s'il existe des doublons

In [74]:
df.duplicated(keep=False).value_counts()

False    918
dtype: int64

# Vérifier s'il existe des données manqunates

In [75]:
df.isna().value_counts()

AGE    SEXE   TDT    PAR    CHOLESTEROL  GAJ    ECG    FCMAX  ANGINE  DEPRESSION   PENTE  CŒUR 
False  False  False  False  False        False  False  False  False   False        False  False    918
dtype: int64

# Suprimer les doublons

In [76]:
df.drop_duplicates(keep=False, inplace=True)

# Recoder les variables quantitatives

In [77]:
def recoder(serie):
    return serie.astype('category').cat.codes

In [78]:
def recoder_data(df):
    for i in df.select_dtypes('object').columns:
        df[i] = recoder(df[i])
    return df

In [79]:
df = recoder_data(df)
df.head()

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,40,1,0,140,289,0,1,172,0,0.0,0,0
1,49,0,3,160,180,0,1,156,0,1.0,2,1
2,37,1,0,130,283,0,2,98,0,0.0,0,0
3,48,0,1,138,214,0,1,108,1,1.5,2,1
4,54,1,3,150,195,0,1,122,0,0.0,0,0


# Noraliser les variables quantitatives

In [80]:
var_quanti = df.select_dtypes(['int64', 'float64'])

# On suprime la serie CŒUR qui n'est pas une variable quantitative
var_quanti = var_quanti.drop('CŒUR', axis=1)

var_quanti

Unnamed: 0,AGE,PAR,CHOLESTEROL,GAJ,FCMAX,DEPRESSION
0,40,140,289,0,172,0.0
1,49,160,180,0,156,1.0
2,37,130,283,0,98,0.0
3,48,138,214,0,108,1.5
4,54,150,195,0,122,0.0
...,...,...,...,...,...,...
913,45,110,264,0,132,1.2
914,68,144,193,1,141,3.4
915,57,130,131,0,115,1.2
916,57,130,236,0,174,0.0


In [81]:
def normalyze_data(df):
    for i in df.select_dtypes(['int64', 'float64']):
        df[i] = df[i] / df[i].max()
    return df

In [82]:
var_quanti = normalyze_data(var_quanti)
var_quanti.head()

Unnamed: 0,AGE,PAR,CHOLESTEROL,GAJ,FCMAX,DEPRESSION
0,0.519481,0.7,0.47927,0.0,0.851485,0.0
1,0.636364,0.8,0.298507,0.0,0.772277,0.16129
2,0.480519,0.65,0.46932,0.0,0.485149,0.0
3,0.623377,0.69,0.354892,0.0,0.534653,0.241935
4,0.701299,0.75,0.323383,0.0,0.60396,0.0


# Créer un dataset X qui a pour valeur de AGE...PENTE et une série Y égale à COEUR

### Datasete X

In [88]:
# On sélectionne toutes les variables sauf COEUR
x = df.drop('CŒUR', axis=1)
x

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE
0,40,1,0,140,289,0,1,172,0,0.0,0
1,49,0,3,160,180,0,1,156,0,1.0,2
2,37,1,0,130,283,0,2,98,0,0.0,0
3,48,0,1,138,214,0,1,108,1,1.5,2
4,54,1,3,150,195,0,1,122,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,2,110,264,0,1,132,0,1.2,2
914,68,1,1,144,193,1,1,141,0,3.4,2
915,57,1,1,130,131,0,1,115,1,1.2,2
916,57,0,0,130,236,0,0,174,0,0.0,2


### Série y

In [89]:
# On sélectionne uniquement là variable COEUR
y = df['CŒUR']
y

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: CŒUR, Length: 918, dtype: int64

# Régression logistique

In [85]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [110]:
model = LogisticRegression(max_iter=1000 ,random_state=0)

# Entrainemanet du model
model.fit(x_train, y_train)

# Retourne la perfonmace
model.score(x_train, y_train)

0.8629283489096573

# Matrice de confusion, de précision, de sensibilitée

### Matrice de confusion

In [99]:
# On prédit y en fonction de nos données x_test
y_pred = model.predict(x_test)

M_confusion = confusion_matrix(y_test, y_pred)

"""
    ligne1, col1 => Vrai Positif
    ligne1, col2 => Faux Négatif
    ligne2, col1 => Faux Positif
    ligne2, col2 => Vrai Négatif
"""
M_confusion

array([[ 91,  22],
       [ 24, 139]])

### Vrai Positif (VP) et Faux Négatif (FN) 

In [108]:
# On sélectionne le vrai positif dans la matrix confusion M_confusion
vp = M_confusion[0, 0]

# On sélectionne le faux négatif dans la matrix confusion M_confusion
fn = M_confusion[0, 1]

# On sélectionne le faux négatif dans la matrix confusion M_confusion
fp = M_confusion[1, 0]

### Sensibilité du modèle

In [105]:
sensibilite = vp / (vp + fn)

sensibilite

0.8053097345132744

### Précision du modèle

In [109]:
precision = vp / (vp + fp)

precision

0.7913043478260869