### AccelerateAI - Tree based models for Classification and Regression

In this notebook we will look at various ways of pruning a decision tree. 

In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score,confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
df = pd.read_excel("04_heart_disease.xlsx", sheet_name=1)
df.head()

Unnamed: 0,age,sex,chest_pain_type,BP,cholestrol,bloodsugarlevel,ECG_result,Max_heart_rate,Angina,oldpeak,slopepeak,major_vessels,thal,disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,1
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,0
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,1
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,0
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,0


In [5]:
X = df.drop(columns=['disease'])
y = df['disease']
print(X.shape)
print(y.shape)

(270, 13)
(270,)


In [6]:
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify=y)
print(x_train.shape)
print(x_test.shape)

(202, 13)
(68, 13)


### 1. Fit a full grown decision tree

In [19]:
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(x_train,y_train)

y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test) 

print(f'Train score: {accuracy_score(y_train_pred,y_train)}')
print(f'Test  score: {accuracy_score(y_test_pred,y_test)}')

Train score: 1.0
Test  score: 0.7352941176470589


### 2. Pre-prunning - with hyperparameters

In [22]:
# set hyper-paramteres
params = {'max_depth': [2,4,6,8,10,12],
         'min_samples_split': [2,3,4,5],
         'min_samples_leaf': [1,2,5]}

clf = tree.DecisionTreeClassifier()
gcv = GridSearchCV(estimator=clf,param_grid=params)
gcv.fit(x_train,y_train)

model = gcv.best_estimator_
model.fit(x_train,y_train)
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

print(f'Best params: {gcv.best_params_}')
print(f'Train score: {accuracy_score(y_train_pred,y_train)}')
print(f'Test  score: {accuracy_score(y_test_pred,y_test)}')

Best params: {'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 3}
Train score: 0.8811881188118812
Test  score: 0.8382352941176471


### 3.Post pruning 

In [24]:
# cost complexity pruning

clf_p = tree.DecisionTreeClassifier(random_state=0,ccp_alpha=0.020)
clf_p.fit(x_train,y_train)
y_train_pred = clf_p.predict(x_train)
y_test_pred = clf_p.predict(x_test)

print(f'Train score {accuracy_score(y_train_pred,y_train)}')
print(f'Test score {accuracy_score(y_test_pred,y_test)}')

Train score 0.8613861386138614
Test score 0.8676470588235294


Grid search can be run to find the right value of ccp_alpha !