In [3]:
# Importing the libraries
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

#ensemble models for better performance
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Model evaluation
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score

### Importing Custom Class to perform Classification

In [4]:
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, 'W:\Github\Personal Projects\statistical-model-implementer')

import MiscellaneousFunctions
import MultivariateClassification

### Importing the dataset

In [5]:
df = pd.read_csv('Datasets/HeartDiseaseData.csv')

In [6]:
df.head()

Unnamed: 0,Age,Sex,Chest Pain,trestbps,Cholesterol,fbs,RestECG,MaxHR (thalach),ExAng,Oldpeak,Slope,Cardiac Arrest,thal,Scale 0,Scale 1,Scale 2,Scale 3,Scale 4
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,1,0,0,0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,0,0,1,0,0
2,37,1,3,130,250,0,0,187,0,3.5,3,0,3,1,0,0,0,0
3,41,0,2,130,204,0,2,172,0,1.4,1,0,3,1,0,0,0,0
4,56,1,2,120,236,0,0,178,0,0.8,1,0,3,1,0,0,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              297 non-null    int64  
 1   Sex              297 non-null    int64  
 2   Chest Pain       297 non-null    int64  
 3   trestbps         297 non-null    int64  
 4   Cholesterol      297 non-null    int64  
 5   fbs              297 non-null    int64  
 6   RestECG          297 non-null    int64  
 7   MaxHR (thalach)  297 non-null    int64  
 8   ExAng            297 non-null    int64  
 9   Oldpeak          297 non-null    float64
 10  Slope            297 non-null    int64  
 11  Cardiac Arrest   297 non-null    int64  
 12  thal             297 non-null    int64  
 13  Scale 0          297 non-null    int64  
 14  Scale 1          297 non-null    int64  
 15  Scale 2          297 non-null    int64  
 16  Scale 3          297 non-null    int64  
 17  Scale 4         

In [8]:
df.describe()

Unnamed: 0,Age,Sex,Chest Pain,trestbps,Cholesterol,fbs,RestECG,MaxHR (thalach),ExAng,Oldpeak,Slope,Cardiac Arrest,thal,Scale 0,Scale 1,Scale 2,Scale 3,Scale 4
count,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0
mean,54.542088,0.676768,3.158249,131.693603,247.350168,0.144781,0.996633,149.599327,0.326599,1.055556,1.602694,0.676768,4.73064,0.538721,0.181818,0.117845,0.117845,0.043771
std,9.049736,0.4685,0.964859,17.762806,51.997583,0.352474,0.994914,22.941562,0.469761,1.166123,0.618187,0.938965,1.938629,0.49934,0.386346,0.322969,0.322969,0.204931
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,1.0,3.0,130.0,243.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0
75%,61.0,1.0,4.0,140.0,276.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,1.0,0.0,0.0,0.0,0.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0,1.0,1.0,1.0,1.0


### Converting one-hot-encoded data to a single column
We see that our target is one hot encoded as we used it for neural networks. We need to get the target into 1 column to apply other machine learning algorithms.

In [81]:
val = []
for i in range(y.shape[0]):
    scale = np.where(y[i] == 1)
    val.append(scale[0][0])
    
target = pd.Series(val)
target.head()

0    0
1    2
2    0
3    0
4    0
dtype: int64

In [9]:
cols = df.columns

In [83]:
df['Target'] = target
df.head()

Unnamed: 0,Age,Sex,Chest Pain,trestbps,Cholesterol,fbs,RestECG,MaxHR (thalach),ExAng,Oldpeak,Slope,Cardiac Arrest,thal,Scale 0,Scale 1,Scale 2,Scale 3,Scale 4,Target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,1,0,0,0,0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,0,0,1,0,0,2
2,37,1,3,130,250,0,0,187,0,3.5,3,0,3,1,0,0,0,0,0
3,41,0,2,130,204,0,2,172,0,1.4,1,0,3,1,0,0,0,0,0
4,56,1,2,120,236,0,0,178,0,0.8,1,0,3,1,0,0,0,0,0


In [84]:
categorical_cols = df.select_dtypes(['object'])
categorical_cols = categorical_cols.columns

In [85]:
numeric_columns = df.select_dtypes(['int64', 'float64'])
numeric_columns = numeric_columns.columns

### Getting the features and the target

In [86]:
X = df.iloc[:, 0:13].values #index of columns in the independent (predictor) variables
y = df.iloc[:, 18].values

### Scaling

In [87]:
# Feature Scaling - MUST scale for any NN model
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(X)

### Train Test splitting

In [88]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

In [90]:
print('x_train {}, x_test {}, y_train {}, y_test {}'.format(x_train.shape, x_test.shape, y_train.shape, y_test.shape))

x_train (222, 13), x_test (75, 13), y_train (222,), y_test (75,)


In [135]:
support_vector_machine = SVC()
decision_tree_classifier = DecisionTreeClassifier()
random_forest_classifier = RandomForestClassifier()
adaboost_classifer = AdaBoostClassifier()

all_models = [support_vector_machine, decision_tree_classifier, 
                           random_forest_classifier, adaboost_classifer]
all_model_names = ['Support Vector Classifier', 'Decision Tree Classifier', 
                           'Random Forest Classifier', 'Adaboost Classifier']

In [141]:
def fit_validate(x_train, x_test, y_train, y_test):
        '''
        fits models to data and stores results for metrics
        '''
        for model in all_models:
            model.fit(x_train, y_train)
        
        for name, model in zip(all_model_names, all_models):
            print(name, '\n')
            print(confusion_matrix(y_test, model.predict(x_test), labels=[0, 1, 2, 3, 4]))
            print()
            # Printing the precision and recall, among other metrics
            print(classification_report(y_test, model.predict(x_test), labels=[0, 1, 2, 3, 4]))
            print('\n\n')
            
            
fit_validate(x_train, x_test, y_train, y_test)

Support Vector Classifier 

[[35  0  0  1  0]
 [10  2  2  3  0]
 [ 4  2  1  3  0]
 [ 3  1  2  2  0]
 [ 1  1  1  1  0]]

              precision    recall  f1-score   support

           0       0.66      0.97      0.79        36
           1       0.33      0.12      0.17        17
           2       0.17      0.10      0.12        10
           3       0.20      0.25      0.22         8
           4       0.00      0.00      0.00         4

    accuracy                           0.53        75
   macro avg       0.27      0.29      0.26        75
weighted avg       0.44      0.53      0.46        75




Decision Tree Classifier 

[[30  4  1  1  0]
 [ 8  4  0  4  1]
 [ 4  1  3  1  1]
 [ 2  0  3  2  1]
 [ 0  1  1  1  1]]

              precision    recall  f1-score   support

           0       0.68      0.83      0.75        36
           1       0.40      0.24      0.30        17
           2       0.38      0.30      0.33        10
           3       0.22      0.25      0.24         


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



We see that none of our models managed to outperform the neural network. We will stick to that.