## Decision Tree Implementation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
# Function importing Dataset
def import_data(path="heart.csv"):
    """Load the dataset from a CSV file and print a quick preview.

    Parameters
    ----------
    path : str or path-like, default "heart.csv"
        Location of the CSV file. The default keeps the original behaviour
        of reading "heart.csv" from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    dataset = pd.read_csv(path)
    # Echo the dimensions and the first rows so the load can be sanity-checked.
    print(dataset.shape)
    print(dataset.head())
    return dataset

In [None]:
# Load the CSV into a DataFrame; import_data() also prints its shape and first rows.
dataset = import_data()

(1025, 14)
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


In [None]:
# age: The person’s age in years
# sex: The person’s sex (1 = male, 0 = female)
# cp: chest pain type
# — Value 0: asymptomatic
# — Value 1: atypical angina
# — Value 2: non-anginal pain
# — Value 3: typical angina
# trestbps: The person’s resting blood pressure (mm Hg on admission to the hospital)
# chol: The person’s cholesterol measurement in mg/dl
# fbs: The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
# restecg: resting electrocardiographic results
# — Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria
# — Value 1: normal
# — Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# thalach: The person’s maximum heart rate achieved
# exang: Exercise induced angina (1 = yes; 0 = no)
# oldpeak: ST depression induced by exercise relative to rest (‘ST’ relates to positions on the ECG plot)
# slope: the slope of the peak exercise ST segment
# — Value 0: downsloping; Value 1: flat; Value 2: upsloping
# ca: The number of major vessels (0–3); note that the describe() output for this file shows a maximum of 4
# thal: A blood disorder called thalassemia
# — Value 0: NULL (dropped from the dataset previously; the describe() output for this file still shows a minimum of 0)
# — Value 1: fixed defect (no blood flow in some part of the heart)
# — Value 2: normal blood flow
# — Value 3: reversible defect (a blood flow is observed but it is not normal)
# target: Heart disease (1 = no, 0 = yes)

In [None]:
# info must be *called*: the bare attribute only displays the bound-method
# repr, not the per-column dtype / non-null summary.
dataset.info()

<bound method DataFrame.info of       age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  target  
0         2

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
dataset.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [None]:
# Count how often each distinct full row occurs; counts above 1 mean the same
# record appears several times in the file.
# NOTE(review): repeated rows can end up in both the train and the test split —
# confirm whether duplicates should be dropped before splitting.
dataset.value_counts()

age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal  target
38   1    2   138       175   0    1        173      0      0.0      2      4   2     1         8
61   1    0   148       203   0    1        161      0      0.0      2      1   3     0         4
              138       166   0    0        125      1      3.6      1      1   2     0         4
52   1    0   112       230   0    1        160      0      0.0      2      1   2     0         4
              125       212   0    1        168      0      1.0      2      2   3     0         4
                                                                                               ..
53   0    0   130       264   0    0        143      0      0.4      1      0   2     1         3
              138       234   0    0        160      0      0.0      2      0   2     1         3
          2   128       216   0    0        115      0      0.0      2      0   0     1         3
     1    0   142       2

In [None]:
# Frequency of each chest-pain type (cp).
dataset['cp'].value_counts() # NOTE(review): original note read "we can use more than 1 ...." — presumably more than one column can be counted together; confirm

0    497
2    284
1    167
3     77
Name: cp, dtype: int64

In [None]:
#split the data 
def split_dataset(dataset, target="target", test_size=0.2, random_state=42):
    """Separate features from the label column and make a train/test split.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns and the label column.
    target : str, default "target"
        Name of the label column.
    test_size : float or int, default 0.2
        Passed straight to ``train_test_split``.
    random_state : int or None, default 42
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    y = dataset[target]
    X = dataset.drop(columns=[target])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Split features/labels into train and test parts (see split_dataset for the settings used).
X_train,X_test,y_train,y_test = split_dataset(dataset)

In [None]:
# Function to perform training with giniIndex.
def ginIndex(X_train, y_train, max_depth=3, min_samples_leaf=5, random_state=42):
    """Fit a decision tree that selects splits by Gini impurity.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    max_depth : int or None, default 3
        Maximum depth of the tree.
    min_samples_leaf : int or float, default 5
        Minimum number of samples required in a leaf.
    random_state : int or None, default 42
        Seed for the estimator, so repeated fits give the same tree.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(
        criterion="gini",
        random_state=random_state,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
    )

    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini

In [None]:
# Fit a Gini-criterion tree and show the estimator's repr as the cell output.
# NOTE(review): `temp` is not used again in the visible cells; the same model is
# fitted again later as `clf_gini` — confirm whether this cell is still needed.
temp = ginIndex(X_train,y_train)
temp

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=42)

In [None]:
# Function to perform training with entropy.
def entropy(X_train, y_train, max_depth=3, min_samples_leaf=5, random_state=42):
    """Fit a decision tree that selects splits by entropy (information gain).

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    max_depth : int or None, default 3
        Maximum depth of the tree.
    min_samples_leaf : int or float, default 5
        Minimum number of samples required in a leaf.
    random_state : int or None, default 42
        Seed for the estimator, so repeated fits give the same tree.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy",
        random_state=random_state,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
    )

    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

In [None]:
# Function to make predictions
def prediction(X_test, clf_object):
    """Predict labels for ``X_test`` with a fitted classifier and echo them.

    Works with any object exposing ``predict`` (not only the Gini tree).
    The predictions are printed and then returned unchanged.
    """
    predicted = clf_object.predict(X_test)
    message = "Predicted values: {}".format(predicted)
    print(message)
    return predicted

In [None]:
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    """Print the confusion matrix, accuracy (in percent) and classification report.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout; nothing is returned.
    """
    # The matrix and the report are multi-line strings. Starting them on their
    # own line keeps the first row / header aligned with the rows below it.
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("Accuracy : ", accuracy_score(y_test, y_pred) * 100)

    print("Report :")
    print(classification_report(y_test, y_pred))

In [None]:
# Fit one tree per split criterion on the same training data.
clf_gini = ginIndex(X_train, y_train)
clf_entropy = entropy(X_train, y_train)

print("Results Using Gini Index:")
# Prediction using gini
y_pred_gini = prediction(X_test, clf_gini)
# cal_accuracy prints its own output and returns None, so wrapping it in
# print() would only add a stray "None" line.
cal_accuracy(y_test, y_pred_gini)

print("Results Using Entropy:")
# Prediction using entropy
y_pred_entropy = prediction(X_test, clf_entropy)
cal_accuracy(y_test, y_pred_entropy)
      

Results Using Gini Index:
Predicted values: [1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0 1 1 1 0 1 0 1 1 1 1 1 0
 1 1 1 0 0 0 1 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0 1
 1 1 0 1 1 1 0 0 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 0 1 1
 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1
 1 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 0 0 0 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 0
 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1]
Confusion Matrix:  [[69 33]
 [12 91]]
Accuracy :  78.04878048780488
Report :                precision    recall  f1-score   support

           0       0.85      0.68      0.75       102
           1       0.73      0.88      0.80       103

    accuracy                           0.78       205
   macro avg       0.79      0.78      0.78       205
weighted avg       0.79      0.78      0.78       205

None
Results Using Entropy:
Predicted values: [1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 0
 1 1 1 0 0 0 