# Model Training

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats

warnings.filterwarnings('ignore')
%matplotlib inline

### Loading the data

In [2]:
# Read the cleaned heart-disease CSV into a DataFrame; the path is relative to
# the notebook's working directory.
data = pd.read_csv("cleaned_heart_disease_data.csv")

In [3]:
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num
0,63,1,3,145,233,1,0,150,0,2.3,0,0,0
1,41,1,1,135,203,0,1,132,0,0.0,1,0,0
2,57,1,0,140,192,0,1,148,0,0.4,1,0,0
3,52,1,3,118,186,0,0,190,0,0.0,1,0,0
4,57,1,0,110,201,0,1,126,1,1.5,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,53,1,0,125,0,0,1,120,0,1.5,2,2,4
904,62,1,0,166,170,0,2,120,1,3.0,1,2,4
905,56,1,2,170,0,0,0,123,1,2.5,0,1,4
906,56,1,2,144,208,1,2,105,1,1.0,0,0,4


### Splitting the dataset into independent (feature) and dependent (target) variables

In [13]:
def dependentIndependentSplit(data, target):
    """Separate a DataFrame into feature columns and the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the features and the target.
    target : str
        Label of the column to predict.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is ``data`` without the
        ``target`` column and ``labels`` is ``data[target]``. ``data`` itself
        is not modified.
    """
    features = data.drop(columns=target)
    labels = data[target]
    return features, labels
    

In [14]:
# Use the 'num' column as the target (y); every other column becomes a feature (X).
X,y = dependentIndependentSplit(data= data,
                                target= 'num')

In [15]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0
1,41,1,1,135,203,0,1,132,0,0.0,1,0
2,57,1,0,140,192,0,1,148,0,0.4,1,0
3,52,1,3,118,186,0,0,190,0,0.0,1,0
4,57,1,0,110,201,0,1,126,1,1.5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
903,53,1,0,125,0,0,1,120,0,1.5,2,2
904,62,1,0,166,170,0,2,120,1,3.0,1,2
905,56,1,2,170,0,0,0,123,1,2.5,0,1
906,56,1,2,144,208,1,2,105,1,1.0,0,0


In [16]:
y

0      0
1      0
2      0
3      0
4      0
      ..
903    4
904    4
905    4
906    4
907    4
Name: num, Length: 908, dtype: int64

### Splitting the data into training and testing sets

In [7]:
from sklearn.model_selection import train_test_split

In [17]:
def trainTestSplit(X, y, size=0.2, random_state=34, stratify=None):
    """Split features and labels into training and testing subsets.

    Thin wrapper around ``sklearn.model_selection.train_test_split`` that
    exposes the seed and stratification instead of hard-coding them.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target values aligned with ``X``.
    size : float or int, default 0.2
        Forwarded as ``test_size`` (fraction or absolute number of test rows).
    random_state : int or None, default 34
        Seed for the shuffle. The default reproduces the split this notebook
        has always used.
    stratify : array-like or None, default None
        Forwarded as ``stratify``. Pass ``y`` to keep the class proportions
        equal in both subsets, which is useful for an imbalanced multi-class
        target. ``None`` keeps the original, unstratified behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, X_test, y_train, y_test

In [18]:
# Hold out 20% of the rows (size=0.2) as the test set; the rest is used for training.
X_train, X_test, y_train, y_test = trainTestSplit(X= X,
                                                  y= y,
                                                  size= 0.2)

In [9]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal
538,65,1,0,140,306,1,1,87,1,1.5,1,1
341,42,1,2,134,240,1,1,160,0,0.0,0,2
13,53,1,2,130,197,1,0,152,0,1.2,0,1
804,48,1,0,124,274,0,0,166,0,0.5,1,2
636,38,1,0,110,196,0,1,166,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
758,41,1,0,125,0,1,1,176,0,1.6,2,1
873,74,1,3,196,216,1,1,120,0,0.4,2,2
490,61,1,0,140,207,0,0,138,1,1.9,2,2
122,41,1,2,112,250,0,1,179,0,0.0,2,1


In [10]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal
48,59,1,2,130,318,0,1,120,1,1.0,1,1
251,56,1,2,130,459,0,1,114,0,0.0,2,1
167,54,1,1,108,309,0,1,156,0,0.0,2,2
880,58,1,0,114,318,0,2,140,0,4.4,0,0
592,57,1,1,140,265,0,2,145,1,1.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
303,45,1,0,140,224,0,1,144,0,0.0,0,1
152,56,1,2,120,0,0,1,97,0,0.0,1,2
179,40,1,3,140,199,0,1,178,1,1.4,2,2
808,66,1,0,150,0,0,1,108,1,2.0,1,2


In [11]:
y_train

538    1
341    0
13     0
804    3
636    1
      ..
758    2
873    3
490    1
122    0
417    1
Name: num, Length: 726, dtype: int64

In [12]:
y_test

48     0
251    0
167    0
880    4
592    1
      ..
303    0
152    0
179    0
808    3
482    1
Name: num, Length: 182, dtype: int64

### Model Training

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
def modelTraining(X_train, y_train):
    """Fit a decision-tree classifier on the training data and return it.

    Parameters
    ----------
    X_train : array-like or DataFrame
        Training features.
    y_train : array-like or Series
        Training labels aligned with ``X_train``.

    Returns
    -------
    DecisionTreeClassifier
        The fitted estimator, built with ``random_state=42`` so repeated runs
        grow the same tree.
    """
    tree = DecisionTreeClassifier(random_state=42)
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly.
    return tree.fit(X_train, y_train)

In [21]:
# Fit the classifier on the training split only; the test split stays unseen until evaluation.
clf_model = modelTraining(X_train= X_train,
                          y_train= y_train)

### Model Evaluation

In [22]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score

In [27]:
def evaluate_model(clf, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    clf : fitted classifier
        Must expose ``predict`` and ``predict_proba``; ``classes_`` is used
        when present.
    X_test : array-like or DataFrame
        Held-out features.
    y_test : array-like or Series
        True labels aligned with ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc_auc)``. Precision, recall and
        F1 are support-weighted averages over the classes. ``roc_auc`` is the
        one-vs-rest ROC AUC for multi-class problems, the ordinary ROC AUC for
        binary problems, and ``None`` when scikit-learn cannot compute it (a
        ``RuntimeWarning`` carrying the reason is issued in that case).
    """
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same value as the library default but makes
    # the choice explicit instead of relying on a (possibly silenced) warning
    # when some class is never predicted.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    try:
        proba = np.asarray(clf.predict_proba(X_test))
        if proba.ndim == 2 and proba.shape[1] == 2:
            # Binary problem: roc_auc_score expects a 1-D score for the class
            # with the greater label, i.e. the second predict_proba column.
            roc_auc = roc_auc_score(y_test, proba[:, 1])
        else:
            # Tie the probability columns to the classifier's own class order
            # rather than to whatever labels happen to occur in y_test.
            roc_auc = roc_auc_score(
                y_test,
                proba,
                multi_class='ovr',
                labels=getattr(clf, 'classes_', None),
            )
    except ValueError as exc:
        # Best-effort metric: keep returning None, but report why.
        warnings.warn(f"ROC AUC could not be computed: {exc}", RuntimeWarning)
        roc_auc = None

    return accuracy, precision, recall, f1, roc_auc

In [28]:
# Score the fitted model on the held-out test split and unpack the five metrics.
accuracy, precision, recall, f1, roc_auc = evaluate_model(clf = clf_model,
                                                          X_test= X_test,
                                                          y_test= y_test)

In [29]:
accuracy

0.45604395604395603

In [30]:
precision

0.44528006709987933

In [31]:
recall

0.45604395604395603

In [32]:
f1

0.4495383940857707

In [33]:
roc_auc

np.float64(0.5914071604878682)