In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# DATA PRE-PROCESSING

In [None]:
# Read the raw dataset; the path is relative to the current working directory
data = pd.read_csv('heart.csv')

# Column groups used by the pre-processing step:
# quantitative columns get standardised, categorical columns get label-encoded.
quantitative_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
categorical_features = [
    'sex', 'exng', 'caa', 'cp',
    'fbs', 'restecg', 'slp', 'thall',
]

# Every input column, categorical ones first
features = [*categorical_features, *quantitative_features]

# Pre-processing function : scale quantitative features, encode categorical ones, split off the target
def datapreprocessing(data):
    """Standardise and encode the feature columns, then separate the target.

    The transformations are applied to a copy, so the frame passed in is
    left exactly as the caller provided it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in the module-level
        ``quantitative_features`` and ``categorical_features`` lists, plus
        the ``'output'`` target column.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is the transformed frame without the
        ``'output'`` column and ``Y`` is the ``'output'`` column.

    Notes
    -----
    No cleaning or missing-value imputation is performed here.
    NOTE(review): the scaler and encoders are fitted on every row they are
    given. In this notebook the function runs before the train/test split,
    so held-out rows influence the fitted transforms -- confirm that is
    acceptable for the analysis.
    """
    processed = data.copy()

    # Feature scaling: a fresh scaler per column, fitted on that column alone
    for column in quantitative_features:
        scaler = StandardScaler()
        processed[column] = scaler.fit_transform(processed[[column]])

    # Encoding categorical features: LabelEncoder works on one 1-D column at a time
    for column in categorical_features:
        labelencoder = LabelEncoder()
        processed[column] = labelencoder.fit_transform(processed[column])

    Y = processed.loc[:, 'output']
    X = processed.drop(['output'], axis=1)

    return (X, Y)

In [None]:
# Pre-process a copy of the dataset so the raw `data` frame stays as loaded
datacopy = data.copy()
X, Y = datapreprocessing(datacopy)

# Hold out 30% of the rows for validation/testing; the other 70% are used for training.
# The split is shuffled with a fixed seed so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    shuffle=True,
    random_state=1,
)

# PART 4 : MODEL SELECTION

In [None]:
# Model selection : candidate supervised classifiers, each paired with a short
# display name and created with its default hyper-parameters
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('DT', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    # ('XGb', XGBClassifier()),  # currently left out of the comparison
]

def model_comparison(models, x, y):
    """Cross-validate each model and compare the accuracy distributions.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Display name and estimator pairs to evaluate.
    x, y
        Features and target forwarded to ``cross_val_score``.

    Returns
    -------
    None
        The mean and standard deviation of the fold accuracies are printed
        per model and a box plot of the fold accuracies is shown.
    """
    names = []
    results = []

    # Cross-validation: the same seeded, shuffled, stratified 10-fold splitter is used for every model
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    for name, model in models:
        cv_results = cross_val_score(model, x, y, cv=kfold, scoring='accuracy')
        results.append(cv_results)
        names.append(name)

        print('%s Cross validation accuracy: %f (SD = %f)' % (name, cv_results.mean(), cv_results.std()))

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot(results)
    # Name the boxes through the axes instead of boxplot's `labels` keyword,
    # which newer Matplotlib releases deprecate in favour of `tick_labels`;
    # this form works on both old and new versions.
    ax.set_xticklabels(names)
    ax.set_title('Models accuracy comparison')
    ax.set_ylabel('Model Accuracy')
    plt.show()

In [None]:
# Compare the candidate models by cross-validation, using only the training split
model_comparison(models, X_train, Y_train)

# PART 5 : MODEL OPTIMIZATION

In [None]:
# Hyper-parameter grids for optimization, written as {parameter name: candidate values}
LR_params = {'C': [0.1, 0.5, 1, 10]}
SVM_params = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
}
RF_params = {'n_estimators': [10, 50, 100]}

# Models to tune, as (display name, estimator, parameter grid) triples
models_opt = [
    ('LogisticRegression', LogisticRegression(), LR_params),
    ('SVM', SVC(), SVM_params),
    ('RandomForest', RandomForestClassifier(), RF_params),
]

def model_optimization(models, x, y):
    """Grid-search each model and report its best cross-validated accuracy and F1.

    Each model is searched once with both metrics scored on the same folds,
    instead of repeating the whole search once per metric.

    Parameters
    ----------
    models : iterable of (str, estimator, dict)
        Display name, estimator and parameter grid for every model to tune.
    x, y
        Features and target the grid search is fitted on.

    Returns
    -------
    list
        One estimator per model, in input order: the estimator refitted by
        the grid search with the parameters that gave the best mean F1.

    Side effects: prints one summary line per model and draws a grouped bar
    chart of the best accuracy and F1 scores.
    """
    names = []
    accuracy_scores = []
    f1_scores = []
    best_estimators = []

    # Gridsearch method for model optimization
    for name, model, params in models:

        # refit='f1' keeps the returned estimator the F1-best one
        model_grid = GridSearchCV(model, params, scoring=['accuracy', 'f1'], refit='f1')
        model_grid.fit(x, y)

        # Best mean score per metric; the two maxima may belong to different
        # parameter settings. nanmax ignores candidates whose fit failed.
        accur = np.nanmax(model_grid.cv_results_['mean_test_accuracy'])
        f1 = np.nanmax(model_grid.cv_results_['mean_test_f1'])

        names.append(name)
        best_estimators.append(model_grid.best_estimator_)
        accuracy_scores.append(accur)
        f1_scores.append(f1)

        print("Cross Validation %s : Accuracy = %f / F1score = %f" % (name, accur, f1))

    # Scores bar plot (bar positions get their own name so the `x` argument is not shadowed)
    positions = np.arange(len(names))
    width = 0.1
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.bar(positions - width, accuracy_scores, 2*width, label='accuracy')
    ax.bar(positions + width, f1_scores, 2*width, label='f1')
    ax.set_ylabel('Scores')
    ax.set_title('Algorithms performance (Training set)')
    ax.set_xticks(positions)
    ax.set_xticklabels(names)
    ax.legend(loc ='lower right')

    return(best_estimators)

In [None]:
# Tune the models listed in models_opt on the training split and keep the refitted best estimators
best_estimators = model_optimization(models_opt, X_train, Y_train)

# PART 6 : TESTING

In [None]:
def modeltesting(models):
    """Fit each model on the training split and score it on the test split.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test`` variables. Each model is (re)fitted in place, its test
    predictions are scored, a summary line is printed and its confusion
    matrix is drawn in its own panel.

    Parameters
    ----------
    models : iterable of estimators
        Estimators exposing ``fit`` and ``predict``.

    Returns
    -------
    tuple of lists
        ``(test_accuracy_scores, test_f1_scores, test_precision_scores)``,
        each holding one value per model in input order.
    """
    models = list(models)

    test_accuracy_scores = []
    test_f1_scores = []
    test_precision_scores = []

    # One panel per model instead of a fixed three, so any number of models
    # can be tested; three models still give the 1x3, 30x10 layout.
    n_panels = max(len(models), 1)
    fig, axes_grid = plt.subplots(1, n_panels, figsize=(10 * n_panels, 10), squeeze=False)
    axs = axes_grid[0]  # squeeze=False always yields a 2-D grid; take its only row

    for i, model in enumerate(models):
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        # Metrics
        accuracy = accuracy_score(Y_test, Y_pred)
        precision = precision_score(Y_test, Y_pred)
        f1 = f1_score(Y_test, Y_pred)

        print('%s: accuracy %f precision %f f1 %f' % (model, accuracy, precision, f1))

        confusion = confusion_matrix(Y_test, Y_pred)
        ConfusionMatrixDisplay(confusion).plot(ax=axs[i])
        axs[i].set_title('%s: Confusion matrix' % (model))

        test_accuracy_scores.append(accuracy)
        test_f1_scores.append(f1)
        test_precision_scores.append(precision)

    return(test_accuracy_scores, test_f1_scores, test_precision_scores)

In [None]:
# Evaluate the tuned estimators on the held-out test split
test_acc, test_f1, test_precision = modeltesting(best_estimators)

In [None]:
# Scores bar plot: accuracy, F1 and precision on the test split, grouped per model
labels = ['Logistic Regression', 'SVM', 'RandomForest']
x = np.arange(len(labels))
width = 0.2
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(x - width, test_acc, width, label='accuracy')
ax.bar(x, test_f1, width, label='f1')
# The precision bars must plot test_precision (not test_f1) so the legend matches the data
ax.bar(x + width, test_precision, width, label='precision')
ax.set_ylabel('Scores')
ax.set_title('Algorithms performance (Testing set)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(loc ='lower right')
fig.tight_layout()