In [None]:
# Chargement et étude rapide du jeu de données
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
import statistics

from sklearn.metrics import confusion_matrix

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the whitespace-delimited file; it has no header row, so columns are numbered.
# The separator is a raw string: "\s" is not a valid escape in a normal string literal.
data = pd.read_table("heart.dat", sep=r"\s+", header=None)
# Preview only the first rows instead of dumping the whole frame.
data.head()

FileNotFoundError: ignored

Attribute Information:
------------------------
      -- 1. age       
      -- 2. sex       
      -- 3. chest pain type  (4 values)       
      -- 4. resting blood pressure  
      -- 5. serum cholesterol in mg/dl
      -- 6. fasting blood sugar > 120 mg/dl       
      -- 7. resting electrocardiographic results  (values 0,1,2) 
      -- 8. maximum heart rate achieved  
      -- 9. exercise induced angina    
      -- 10. oldpeak = ST depression induced by exercise relative to rest   
      -- 11. the slope of the peak exercise ST segment     
      -- 12. number of major vessels (0-3) colored by fluoroscopy
      -- 13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
      -- 14. label: absence (1) or presence (2) of heart disease

In [None]:
# Give the columns readable names, then count missing values per column
# (the original author notes that there are none).
COLUMN_NAMES = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'serum_cholestoral',
    'fasting_blood_sugar',
    'resting_electrocardiographic_results',
    'maximum_heart_rate_achieved',
    'exercise_induced_angina',
    'oldpeak',
    'slope',
    'number_of_major_vessels',
    'thal',
    'label',
]
data.columns = COLUMN_NAMES
data.isna().sum()

In [None]:
# Print the (rows, columns) dimensions, then display the frame.
n_rows, n_cols = data.shape
print((n_rows, n_cols))
data

In [None]:
# Class balance of the label: counts as text, then as a horizontal count plot.
label_counts = data["label"].value_counts()
print(label_counts)
sns.countplot(data=data, y="label")

In [None]:
# Box plots of a few variables, split by label, drawn side by side on one row.
BOXPLOT_COLUMNS = [
    'age',
    'resting_blood_pressure',
    'serum_cholestoral',
    'maximum_heart_rate_achieved',
]
plt.figure(figsize=(25, 5))
boxplot_axes = []
for position, column in enumerate(BOXPLOT_COLUMNS, start=1):
    plt.subplot(1, 4, position)
    boxplot_axes.append(sns.boxplot(x='label', y=column, data=data))
# Keep the last panel as the cell's final expression, as in the four-call version.
boxplot_axes[-1]

In [None]:
# Pairwise correlation matrix of all columns.
data_corr = data.corr()

# Boolean mask covering the upper triangle (diagonal included), so only the
# lower triangle of the matrix is drawn.
mask = np.triu(np.ones(data_corr.shape, dtype=bool))

# Figure and axes for the heat map.
f, ax = plt.subplots(figsize=(11, 9))

# Draw the masked correlation matrix on the axes created just above.
sns.heatmap(data_corr, mask=mask, cmap='mako', center=0, square=True, ax=ax)

In [None]:
# Feature matrix: every column except the last one.
feature_columns = data.columns[:-1]
X = data[feature_columns]
X

In [None]:
# Target, kept as a one-column DataFrame: the last column only.
label_column = data.columns[-1:]
Y = data[label_column]
Y

## Stratégie 1 : ensemble d’apprentissage, de validation et de test. 

In [None]:
# First split: 30% of the rows are held out as the test set.
X_av, X_t, y_av, y_t = train_test_split(
    X, Y, random_state=42, test_size=0.3
)
# Second split: 33% of the remaining rows become the validation set.
X_a, X_v, y_a, y_v = train_test_split(
    X_av, y_av, random_state=42, test_size=0.33
)
X_a.shape

In [None]:
# Standardise the features. The scaler is fitted on the training split ONLY and
# then applied unchanged to the validation and test splits. Re-fitting it on
# each split would scale them with different statistics and leak held-out data.
std_scaler = StandardScaler()
X_a = pd.DataFrame(std_scaler.fit_transform(X_a), columns=X_a.columns)
X_v = pd.DataFrame(std_scaler.transform(X_v), columns=X_v.columns)
X_t = pd.DataFrame(std_scaler.transform(X_t), columns=X_t.columns)
# Preview the scaled test set.
X_t.head()

In [None]:
# Positional indices of the columns to inspect in the scaled training set.
selected_positions = [0, 3, 4, 7, 9, 11]
X_a.iloc[:, selected_positions]

In [None]:
#k = {1, 5, 10, 15, 20, 25}
def myKNeighClass1(K=1):
    """Fit a K-nearest-neighbours classifier and report its classification errors.

    Reads the notebook-level variables X_a / y_a (training split) and
    X_v / y_v (validation split).

    Parameters
    ----------
    K : int, default 1
        Number of neighbours passed to KNeighborsClassifier.

    Returns
    -------
    list
        [training error, validation error], each computed as 1 - accuracy
        and rounded to two decimals.
    """
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(X_a, y_a)

    def classification_error(features, targets):
        # Error = 1 - accuracy of the fitted classifier on the given split.
        predictions = knn.predict(features)
        return round(1 - accuracy_score(targets, predictions), 2)

    train_error = classification_error(X_a, y_a)
    validation_error = classification_error(X_v, y_v)
    return [train_error, validation_error]

In [None]:
def compareModels(myModel, myFunction, myParams=None):
    """Build a comparison table of training/validation errors, one row per parameter.

    Parameters
    ----------
    myModel : str
        Label used as prefix of each row name, e.g. 'KNeighClass'.
    myFunction : callable
        Called as myFunction(param); must return a sequence whose first item is
        the training error and whose second item is the validation error.
    myParams : iterable, optional
        Parameter values to evaluate. None (the default) means no parameters
        and yields an empty table.

    Returns
    -------
    pandas.DataFrame
        Columns "Nom du modéle", "Erreur de classification en apprentissage" and
        "Erreur de classification en validation", sorted by ascending validation
        error. Row labels keep the position of each parameter in myParams.
    """
    MLA_columns = ["Nom du modéle","Erreur de classification en apprentissage","Erreur de classification en validation"]
    name_col, train_col, validation_col = MLA_columns

    # None instead of a mutable [] default, so the default is never shared between calls.
    if myParams is None:
        myParams = ()

    # Collect plain records and build the frame once: the error columns then get
    # a numeric dtype instead of the object dtype produced by cell-by-cell .loc writes.
    records = []
    for param in myParams:
        errors = myFunction(param)
        records.append({
            name_col: f"{myModel}({param}) :",
            train_col: errors[0],
            validation_col: errors[1],
        })

    MLA_compare = pd.DataFrame(records, columns=MLA_columns)
    # Stable sort: models with equal validation error keep their input order.
    return MLA_compare.sort_values(by=validation_col, ascending=True, kind="mergesort")

In [None]:
# Compare the candidate neighbour counts on the train/validation splits.
k_candidates = [1, 5, 10, 15, 20, 25]
MLA_compare = compareModels(
    myModel='KNeighClass',
    myFunction=myKNeighClass1,
    myParams=k_candidates,
)
MLA_compare

In [None]:
# Horizontal bar chart of the validation error of each model.
mla_ax = plt.gca()
mla_ax.set_title("MLA")
sns.barplot(
    data=MLA_compare,
    x="Erreur de classification en validation",
    y="Nom du modéle",
    ax=mla_ax,
)

In [None]:
# Grouped bar chart: training and validation error side by side for each model.
# The table is reshaped to long format so that the model name is the categorical
# axis and the data set (training / validation) is the hue. Plotting one error
# against the other as bar positions is hard to read.
mla_long = MLA_compare.melt(
    id_vars="Nom du modéle",
    value_vars=[
        "Erreur de classification en apprentissage",
        "Erreur de classification en validation",
    ],
    var_name="Jeu de données",
    value_name="Erreur de classification",
)
# The comparison table may store the errors with object dtype; seaborn needs numbers.
mla_long["Erreur de classification"] = mla_long["Erreur de classification"].astype(float)

fig, mla_ax = plt.subplots(figsize=(10, 5))
mla_ax.set_title("MLA")
sns.barplot(
    data=mla_long,
    x="Erreur de classification",
    y="Nom du modéle",
    hue="Jeu de données",
    ax=mla_ax,
)

In [None]:
#K=10 est le meilleur paramètre, car il présente l'erreur de validation la plus faible.

In [None]:
# Fit the 10-neighbour classifier on the training split and score it on the test split.
model = KNeighborsClassifier(n_neighbors=10).fit(X_a, y_a)
y_t_pred = model.predict(X_t)
test_error = 1 - accuracy_score(y_t, y_t_pred)
print('L’erreur de classification au test:', round(test_error, 2))

## Stratégie 2 : validation-croisée pour la sélection de k

In [None]:
# Hold out 30% of the rows as the test set; the rest is used for cross-validation.
X_av, X_t, y_av, y_t = train_test_split(X, Y, test_size=0.3, random_state=42)
# Standardise: fit the scaler on the training part only, then apply the same
# transformation to the test part (no re-fitting on held-out data).
std_scaler = StandardScaler()
X_av = pd.DataFrame(std_scaler.fit_transform(X_av), columns=X_av.columns)
X_t = pd.DataFrame(std_scaler.transform(X_t), columns=X_t.columns)
X_av.shape

In [None]:
#k = {1, 5, 10, 15, 20, 25}
def myKNeighClass2(K=1):
    """Return the 5-fold cross-validated classification error of a K-NN classifier.

    Reads the notebook-level variables X_av and y_av.

    Parameters
    ----------
    K : int, default 1
        Number of neighbours passed to KNeighborsClassifier.

    Returns
    -------
    float
        1 minus the mean of the fold scores returned by cross_val_score,
        rounded to two decimals.
    """
    fold_accuracies = cross_val_score(KNeighborsClassifier(n_neighbors=K), X_av, y_av, cv=5)
    mean_accuracy = statistics.mean(fold_accuracies)
    return round(1 - mean_accuracy, 2)

In [None]:
def compareModels1(myModel, myFunction, myParams=None):
    """Build a comparison table with a single error value per parameter.

    Parameters
    ----------
    myModel : str
        Label used as prefix of each row name, e.g. 'CrossVal KNeighClass'.
    myFunction : callable
        Called as myFunction(param); must return the classification error.
    myParams : iterable, optional
        Parameter values to evaluate. None (the default) means no parameters
        and yields an empty table.

    Returns
    -------
    pandas.DataFrame
        Columns "Nom du modéle" and "Erreur de classification", sorted by
        ascending error. Row labels keep the position of each parameter in myParams.
    """
    MLA_columns = ["Nom du modéle","Erreur de classification"]
    name_col, error_col = MLA_columns

    # None instead of a mutable [] default, so the default is never shared between calls.
    if myParams is None:
        myParams = ()

    # Build the frame once from plain records so the error column gets a numeric
    # dtype instead of the object dtype produced by cell-by-cell .loc writes.
    records = [
        {name_col: f"{myModel}({param}) :", error_col: myFunction(param)}
        for param in myParams
    ]

    MLA_compare = pd.DataFrame(records, columns=MLA_columns)
    # Stable sort: models with equal error keep their input order.
    return MLA_compare.sort_values(by=error_col, ascending=True, kind="mergesort")

In [None]:
# Compare the candidate neighbour counts using the cross-validated error.
k_candidates = [1, 5, 10, 15, 20, 25]
MLA_compare = compareModels1(
    myModel='CrossVal KNeighClass',
    myFunction=myKNeighClass2,
    myParams=k_candidates,
)
MLA_compare

In [None]:
# Horizontal bar chart of the cross-validated error of each model.
mla_ax = plt.gca()
mla_ax.set_title("MLA")
sns.barplot(
    data=MLA_compare,
    x="Erreur de classification",
    y="Nom du modéle",
    ax=mla_ax,
)

In [None]:
#10, 15 restent les bons choix

In [None]:
#K=10
# Test error of the model retained for K=10: fit on the training part (X_av)
# and score once on the held-out test set. Cross-validating on X_t itself would
# train new models on test folds and would not measure this model.
model = KNeighborsClassifier(n_neighbors=10)
# np.ravel turns the one-column target frame into the 1-D array that fit() expects.
model.fit(X_av, np.ravel(y_av))
y_t_pred = model.predict(X_t)
print('L’erreur de classification K= 10 est:', round( 1 - accuracy_score(y_t, y_t_pred),2))

In [None]:
#K=15
# Test error of the model retained for K=15: fit on the training part (X_av)
# and score once on the held-out test set. Cross-validating on X_t itself would
# train new models on test folds and would not measure this model.
model = KNeighborsClassifier(n_neighbors=15)
# np.ravel turns the one-column target frame into the 1-D array that fit() expects.
model.fit(X_av, np.ravel(y_av))
y_t_pred = model.predict(X_t)
print('L’erreur de classification K= 15 est:', round( 1 - accuracy_score(y_t, y_t_pred),2))

## On a deux méthodes qui donnent l'erreur de classification en test la plus faible (0.17).    
## Donc, on a le choix entre 2 méthodes:
### +KNeighborsClassifier(15) avec cross_val (cv=5)
### +KNeighborsClassifier(10) sans cross_val

In [None]:
# Toy confusion-matrix example: true labels [0, 1, 0, 1] vs. predictions [1, 1, 1, 0].
example_true = [0, 1, 0, 1]
example_pred = [1, 1, 1, 0]
conf_mat = confusion_matrix(example_true, example_pred)
# Flatten the 2x2 matrix row by row into its four counts.
tn, fp, fn, tp = conf_mat.ravel()
print(tn, fp, fn, tp)
conf_mat

### Matrice de confusion

|                    | Predicted class 1 | Predicted class 2 |
|--------------------|-------------------|-------------------|
| **Actual class 1** | TN                | FP                |
| **Actual class 2** | FN                | TP                |

-- Négatif: Absence (1)
-- Positif: Presence (2) of heart disease

Accuracy = (TP+TN) / (TP+TN+FN+FP)
 
Dans notre cas, prédire une absence d’attaque cardiaque alors qu’en réalité elle a lieu (un faux négatif, FN) est 5 fois plus coûteux.
Pour pénaliser cette erreur, chaque FN est donc compté 5 fois (FN*5).

#### Notre nouvelle formule de l'accuracy: Accuracy = (TP+TN) / (TP+TN+FN*5+FP)  
    

In [None]:
#+KNeighborsClassifier(15) avec cross_val (cv=5)

In [None]:
X_av, X_t, y_av, y_t = train_test_split(X, Y, test_size=0.3, random_state=42)
#Normalisation
X_t = pd.DataFrame(std_scaler.fit_transform(X_t), columns=X_t.columns)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
model = KNeighborsClassifier(n_neighbors=15)
y_pred = cross_val_predict(model, X_t, y_t, cv=5)

tn, fp, fn, tp = confusion_matrix(y_t, y_pred).ravel()
print(tn, fp, fn, tp)
#Affichage du MC
conf_mat = confusion_matrix(y_t, y_pred)
conf_mat

In [None]:
# Cost-sensitive accuracy: each false negative counts five times in the denominator.
correct = tn + tp
weighted_total = correct + fp + (fn * 5)
my_accuracy = correct / weighted_total
print('L’erreur de classification KNeighborsClassifier(15) avec cross_val (cv=5) est:', round(1 - my_accuracy, 2))

In [None]:
#+KNeighborsClassifier(10) sans cross_val

In [None]:
# Hold out 30% of the rows as the test set.
X_a, X_t, y_a, y_t = train_test_split(X, Y, test_size=0.3, random_state=42)
# Standardise: fit the scaler on the training split only, then apply the same
# transformation to the test split (no re-fitting on held-out data).
std_scaler = StandardScaler()
X_a = pd.DataFrame(std_scaler.fit_transform(X_a), columns=X_a.columns)
X_t = pd.DataFrame(std_scaler.transform(X_t), columns=X_t.columns)

In [None]:
# Fit the 10-neighbour model on the training split and predict the test split.
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_a, y_a)
y_t_pred = model.predict(X_t)

# The confusion matrix must use THIS model's predictions (y_t_pred), not the
# `y_pred` variable left over from an earlier cell.
conf_mat = confusion_matrix(y_t, y_t_pred)
tn, fp, fn, tp = conf_mat.ravel()
print(tn, fp, fn, tp)
# Display the confusion matrix.
conf_mat

In [None]:
# Cost-sensitive accuracy: each false negative counts five times in the denominator.
correct = tn + tp
weighted_total = correct + fp + (fn * 5)
my_accuracy = correct / weighted_total
print('L’erreur de classification KNeighborsClassifier(10) sans cross_val est:', round(1 - my_accuracy, 2))