### **LIBRARY IMPORT**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os, random

from sklearn import model_selection                                  
from sklearn.model_selection import train_test_split      
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler      # Scaling
from sklearn.impute import SimpleImputer

# getting methods for confusion matrix, F1 score, Accuracy Score
from sklearn import metrics                                          
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score,classification_report,roc_curve,auc,average_precision_score

from sklearn.linear_model import LogisticRegression     # For logistic Regression
from sklearn.naive_bayes import GaussianNB              # For Naive Bayes classifier
from sklearn.neighbors import KNeighborsClassifier      # For K-NN Classifier
from sklearn.svm import SVC                             # For support vector machine based classifier
from sklearn.tree import DecisionTreeClassifier         # For Decision tree

# for ensemble
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
#from xgboost import XGBClassifier

### **FUNCTION DEFINITIONS**

#### **CONFUSION MATRIX PLOT**

In [None]:
def plot_confusion_matrix(y_test, yhat, ax=None):
    """Draw the confusion matrix of ``yhat`` against ``y_test`` as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    yhat : array-like
        Predicted labels, aligned with ``y_test``.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, a new figure is created so the heatmap
        is never painted over a plot that is already open (``KNN`` draws an
        accuracy curve right before calling this function).

    Returns
    -------
    None
    """
    cm = confusion_matrix(y_test, yhat)

    if ax is None:
        # plt.subplot() hands back the current axes when one already exists,
        # which overlays the heatmap on earlier plots; always start fresh.
        _, ax = plt.subplots()

    sns.heatmap(cm, annot=True, ax=ax, fmt='g', cmap=plt.cm.Blues, cbar=False)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix', size=8)

    # The class names only make sense for a 2x2 matrix; with any other shape
    # (e.g. a class missing from both y_test and yhat) keep the default ticks
    # rather than mislabel them or fail on a tick-count mismatch.
    class_names = ['Healthy', 'Diseased']
    if cm.shape[0] == len(class_names):
        ax.xaxis.set_ticklabels(class_names)
        ax.yaxis.set_ticklabels(class_names)


#### **LOGISTIC REGRESSION MODEL**

In [None]:

# Logistic Regression
def LoR(X_tr,Y_tr,X_te,Y_te):
  """Train a logistic regression classifier and report its test-set results.

  Fits ``LogisticRegression(random_state=42)`` on ``(X_tr, Y_tr)``, predicts
  ``X_te``, draws the confusion matrix and prints the classification report
  against ``Y_te``. Returns nothing.
  """
  model = LogisticRegression(random_state=42).fit(X_tr, Y_tr)
  predictions = model.predict(X_te)

  # Visual summary first, then the per-class precision / recall / F1 table.
  plot_confusion_matrix(Y_te, predictions)
  print("Classification Report : Logistic Regression")
  print(classification_report(Y_te, predictions, digits=2))


#### **KNN MODEL**

In [None]:
# KNN
def KNN(X_train,Y_train,X_test,Y_test):
  """Tune and evaluate a k-nearest-neighbours classifier.

  The number of neighbours is chosen among the odd values 1..39 by 5-fold
  cross-validated accuracy on the training data only. The test set is used
  solely for the final confusion matrix and classification report, so the
  reported metrics are not biased by the choice of ``k``.

  Parameters
  ----------
  X_train, Y_train : training features and labels.
  X_test, Y_test : held-out features and labels used for the final report.
  """
  # Odd neighbour counts only.
  kvalue = list(range(1,40,2))

  # Cross-validate every candidate k on the training split.
  search = GridSearchCV(KNeighborsClassifier(weights='uniform', metric='euclidean'),
                        {'n_neighbors': kvalue},
                        cv=5,
                        scoring='accuracy')
  search.fit(X_train, Y_train)

  # Mean validation accuracy per candidate, in the same order as kvalue.
  acc = list(search.cv_results_['mean_test_score'])

  # determining best k
  bestk = search.best_params_['n_neighbors']
  print("The optimal number of neighbors is %d" % bestk)

  # Show the tuning curve in its own figure so the confusion matrix drawn
  # below does not end up on the same axes.
  _, ax = plt.subplots()
  ax.plot(kvalue, acc)
  ax.set_xlabel('Number of neighbors (k)')
  ax.set_ylabel('Mean cross-validated accuracy')
  ax.set_title('KNN: accuracy vs. k')
  plt.show()

  # GridSearchCV has already refit the best configuration on the full
  # training set, so it can be used directly.
  knn = search.best_estimator_

  # predict the response
  knn_y_pred = knn.predict(X_test)

  # Confusion Matrix for the K-nearest neighbors Model
  plot_confusion_matrix(Y_test,knn_y_pred)

  # Classification Report for the K-nearest neighbors Model
  print("Classification Report : K-nearest neighbors")
  classRep = classification_report(Y_test, knn_y_pred, digits=2)
  print(classRep)


#### **NAIVE BAYES MODEL**

In [None]:
# Naive Bayes
def NB(X_tr,Y_tr,X_te,Y_te):
  """Train a Gaussian naive Bayes classifier and report its test-set results.

  Fits ``GaussianNB()`` on ``(X_tr, Y_tr)``, predicts ``X_te``, draws the
  confusion matrix and prints the classification report against ``Y_te``.
  Returns nothing.
  """
  classifier = GaussianNB()
  classifier.fit(X_tr, Y_tr)
  predicted = classifier.predict(X_te)

  # Heatmap of the confusion matrix, followed by the text report.
  plot_confusion_matrix(Y_te, predicted)
  print("Classification Report : Naive Bayes")
  print(classification_report(Y_te, predicted, digits=2))


#### **DECISION TREE MODEL**

In [None]:
# Decision Tree
def DT(X_tr,Y_tr,X_te,Y_te):
  """Train a decision tree classifier and report its test-set results.

  Fits ``DecisionTreeClassifier(random_state=42)`` on ``(X_tr, Y_tr)``,
  predicts ``X_te``, draws the confusion matrix and prints the classification
  report against ``Y_te``. Returns nothing.
  """
  tree = DecisionTreeClassifier(random_state=42).fit(X_tr, Y_tr)
  tree_pred = tree.predict(X_te)

  # Heatmap of the confusion matrix, followed by the text report.
  plot_confusion_matrix(Y_te, tree_pred)
  print("Classification Report : Decision Tree")
  print(classification_report(Y_te, tree_pred, digits=2))

#### **SVM MODEL**

In [None]:
# SVM
def svm(X_tr,Y_tr,X_te,Y_te):
  """Tune and evaluate a support vector classifier.

  Runs a 5-fold cross-validated grid search (accuracy) over ``C``, ``gamma``
  and the kernel on the training data, prints the winning parameters, then
  reports the confusion matrix and classification report of the best model on
  the test data.

  Parameters
  ----------
  X_tr, Y_tr : training features and labels.
  X_te, Y_te : held-out features and labels used for the final report.
  """
  # C trades margin width against training misclassifications: a larger C
  # gives a narrower margin with fewer misclassifications, and vice versa.
  c_values = [0.5, 1, 5, 10, 100]

  # gamma has no effect on the linear kernel, so that kernel gets its own
  # sub-grid instead of being refit once per gamma value.
  param_grid = [
      {'kernel': ['linear'],
       'C': c_values},
      {'kernel': ['rbf', 'poly', 'sigmoid'],
       'C': c_values,
       'gamma': ['scale', 0.5, 1, 0.1, 0.01, 0.001]},
  ]

  optimal_params = GridSearchCV(SVC(),
                              param_grid,
                              cv=5, # 5-fold cross validation
                              scoring='accuracy',
                              verbose=0,
                              n_jobs=-1)

  optimal_params.fit(X_tr, Y_tr)
  print(optimal_params.best_params_)

  # GridSearchCV already refits the best configuration on the whole training
  # set; reuse it rather than training an identical model again. The local
  # name deliberately differs from the function name.
  svm_model = optimal_params.best_estimator_

  # predict the response
  svm_y_pred = svm_model.predict(X_te)

  # Confusion Matrix for the Support Vector Machine Model
  plot_confusion_matrix(Y_te,svm_y_pred)

  # Classification Report for the Support Vector Machine Model
  print("Classification Report : Support Vector Machine")
  classRep = classification_report(Y_te, svm_y_pred, digits=2)
  print(classRep)

#### **ENSEMBLE MODEL**

In [None]:
# ensemble stacking
def Ensemble(X_tr,Y_tr,X_te,Y_te):
  """Train a stacking ensemble and report its test-set results.

  Five base learners (logistic regression, 1-NN, decision tree, RBF SVM and
  Gaussian naive Bayes) feed a logistic regression meta-learner. No ``cv``
  argument is passed to ``StackingClassifier``, so its default
  cross-validation scheme is used. The fitted stack predicts ``X_te``; the
  confusion matrix is plotted and the classification report printed against
  ``Y_te``. Returns nothing.
  """
  # Level-0 (base) estimators with fixed hyper-parameters.
  base_learners = [
      ('lr', LogisticRegression(random_state=42)),
      ('knn', KNeighborsClassifier(n_neighbors=1, weights='uniform', metric='euclidean')),
      ('cart', DecisionTreeClassifier(random_state=42)),
      ('svm', SVC(C=30, gamma=0.125, kernel='rbf')),
      ('bayes', GaussianNB()),
  ]

  # Level-1 (meta) estimator that combines the base predictions.
  meta_learner = LogisticRegression(random_state=42)

  stacker = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)
  stacker.fit(X_tr, Y_tr)
  stacked_pred = stacker.predict(X_te)

  # Heatmap of the confusion matrix, followed by the text report.
  plot_confusion_matrix(Y_te, stacked_pred)
  print("Classification Report : Stacking")
  print(classification_report(Y_te, stacked_pred, digits=2))

#### **RANDOM FOREST MODEL**

In [None]:
def RF(X_tr,Y_tr,X_te,Y_te):
  """Tune and evaluate a random forest classifier.

  Runs a 10-fold cross-validated grid search (accuracy) over the number of
  trees, the maximum depth and the split criterion on the training data,
  prints the winning parameters, then reports the confusion matrix and
  classification report of the best model on the test data.

  The forest is seeded inside the search as well, so the selected parameters
  are reproducible and the evaluated model is the same configuration
  (including the seed) that was cross-validated.

  Parameters
  ----------
  X_tr, Y_tr : training features and labels.
  X_te, Y_te : held-out features and labels used for the final report.
  """
  # Create the param grid
  param_grid = {'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],# Number of trees in random forest
                'max_depth': list(range(1,10)),# Maximum number of levels in tree
                'criterion':['gini','entropy'] }# measure the quality of a split

  optimal_params = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid,
                              cv=10, # 10-fold cross validation
                              scoring='accuracy',
                              verbose=0,
                              n_jobs=-1)

  optimal_params.fit(X_tr, Y_tr)
  print(optimal_params.best_params_)

  # The search has already refit the best seeded forest on the full training
  # set; training a second, identical model is unnecessary.
  RandomForest = optimal_params.best_estimator_

  # predict the response
  RandomForest_pred = RandomForest.predict(X_te)

  # Confusion Matrix for the Random Forest Model
  plot_confusion_matrix(Y_te,RandomForest_pred)

  # Classification Report for the Random Forest Model
  print("Classification Report : Random Forest")
  print(classification_report(Y_te, RandomForest_pred, digits=2))

#### **ADABOOST MODEL**

In [None]:
def Adaboost(X_tr,Y_tr,X_te,Y_te):
  """Tune and evaluate an AdaBoost classifier.

  Runs a 10-fold cross-validated grid search (accuracy) over the number of
  estimators on the training data, prints the winning value, then reports the
  confusion matrix and classification report of the best model on the test
  data.

  The booster is seeded with ``random_state=42``, matching the other seeded
  models in this notebook, so repeated runs give the same selection and the
  same report.

  Parameters
  ----------
  X_tr, Y_tr : training features and labels.
  X_te, Y_te : held-out features and labels used for the final report.
  """
  param_grid = {'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]}
  optimal_params = GridSearchCV(AdaBoostClassifier(random_state=42),
                              param_grid,
                              cv=10, # 10-fold cross validation
                              scoring='accuracy',
                              verbose=0,
                              n_jobs=-1)

  optimal_params.fit(X_tr, Y_tr)
  print(optimal_params.best_params_)

  # The search has already refit the best configuration on the full training
  # set, so reuse that model.
  AdBs = optimal_params.best_estimator_

  # predict the response
  AdBs_y_pred = AdBs.predict(X_te)

  # Confusion Matrix for the Adaptive Boosting Model
  plot_confusion_matrix(Y_te,AdBs_y_pred)

  # Classification Report for the Adaptive Boosting Model
  print("Classification Report : Adaptive Boosting")
  print(classification_report(Y_te, AdBs_y_pred, digits=2))

#### **XGBOOST MODEL**

In [None]:
def xgboost(X_tr,Y_tr,X_te,Y_te):
  """Tune and evaluate an XGBoost classifier.

  Runs a 10-fold cross-validated grid search (accuracy) over the number of
  estimators on the training data, prints the winning value, then reports the
  confusion matrix and classification report of the best model on the test
  data.

  Parameters
  ----------
  X_tr, Y_tr : training features and labels.
  X_te, Y_te : held-out features and labels used for the final report.

  Raises
  ------
  ImportError
      If the optional ``xgboost`` package is not installed.
  """
  # Imported here because the package is optional for the rest of the
  # notebook; without this the function fails with a NameError.
  try:
    from xgboost import XGBClassifier
  except ImportError as exc:
    raise ImportError("xgboost() requires the 'xgboost' package to be installed") from exc

  param_grid = {'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]}
  optimal_params = GridSearchCV(XGBClassifier(),
                              param_grid,
                              cv=10, # 10-fold cross validation
                              scoring='accuracy',
                              verbose=0,
                              n_jobs=-1)

  optimal_params.fit(X_tr, Y_tr)
  print(optimal_params.best_params_)

  # The search has already refit the best configuration on the full training
  # set, so reuse that model.
  xgBs = optimal_params.best_estimator_

  # predict the response
  xgBs_y_pred = xgBs.predict(X_te)

  # Confusion Matrix for the XGBoost Model
  plot_confusion_matrix(Y_te,xgBs_y_pred)

  # Classification Report for the XGBoost Model
  print("Classification Report : XG Boosting")
  print(classification_report(Y_te, xgBs_y_pred, digits=2))

### **SPLITTING & TRAINING**

#### **LOADING THE FEATURES AND SPLITTING**

In [None]:
# Directory holding the per-class feature tables. Set the ECG_FEATURE_DIR
# environment variable to run the notebook on another machine; the default is
# the original author's location.
FEATURE_DIR = os.environ.get(
    'ECG_FEATURE_DIR',
    '/home/senume/Project/MIS/mis-ECG_analysis_DMD/FEATURE_EXTRACTION/HODMD Paper')

DATA_D = pd.read_csv(os.path.join(FEATURE_DIR, 'Features_Beat_Bundle branch block.csv'))
DATA_H = pd.read_csv(os.path.join(FEATURE_DIR, 'Features_Beat_Health Control.csv'))

# Discard rows with any missing feature value.
DATA_D = DATA_D.dropna()
DATA_H = DATA_H.dropna()

# Diseased (bundle branch block) beats are labelled 1.0 ...
DATA_D_Label_Shape = DATA_D.shape[0]
DATA_D_Label = np.ones((DATA_D_Label_Shape)).tolist()
DATA_D.insert(len(DATA_D.columns),"Label", DATA_D_Label)

# ... and healthy-control beats 0.0.
DATA_H_Label_Shape = DATA_H.shape[0]
DATA_H_Label = np.zeros((DATA_H_Label_Shape)).tolist()
DATA_H.insert(len(DATA_H.columns),"Label", DATA_H_Label)

# Stack both classes; "name" is dropped along with the target so that only
# the remaining columns are used as model inputs.
DATASET = pd.concat([DATA_H, DATA_D], ignore_index= True)
X = DATASET.drop(["name", "Label"], axis =1)
Y = DATASET["Label"]

# Stratified split with a fixed seed: without random_state every run draws a
# different split, so the reports below could not be reproduced or compared.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify= Y, random_state=42)

#### **TRAINING MODEL WITH CONFUSION MATRIX PLOT OUTPUT**

In [None]:
# Logistic regression: fit on the training split, report on the test split.
LoR(X_tr=X_train, Y_tr=Y_train, X_te=X_test, Y_te=Y_test)

In [None]:
# K-nearest neighbours: pick k among odd values 1..39, then report on the test split.
KNN(X_train, Y_train, X_test, Y_test)

In [None]:
# Gaussian naive Bayes: fit on the training split, report on the test split.
NB(X_tr=X_train, Y_tr=Y_train, X_te=X_test, Y_te=Y_test)

In [None]:
# Decision tree: fit on the training split, report on the test split.
DT(X_tr=X_train, Y_tr=Y_train, X_te=X_test, Y_te=Y_test)

In [None]:
# Stacking ensemble of the base classifiers with a logistic regression meta-learner.
Ensemble(X_tr=X_train, Y_tr=Y_train, X_te=X_test, Y_te=Y_test)

In [None]:
# Random forest: grid-search trees / depth / criterion, then report on the test split.
RF(X_tr=X_train, Y_tr=Y_train, X_te=X_test, Y_te=Y_test)

In [None]:
# AdaBoost: grid-search the number of estimators, then report on the test split.
Adaboost(X_tr=X_train, Y_tr=Y_train, X_te=X_test, Y_te=Y_test)

In [None]:
# XGBoost: grid-search the number of estimators, then report on the test split.
xgboost(X_tr=X_train, Y_tr=Y_train, X_te=X_test, Y_te=Y_test)