In [None]:
!pip install ucimlrepo

# Week 5 - Model Evaluation Hands On

Adapted from https://www.kaggle.com/code/gauravduttakiit/hyperparameter-tuning-in-decision-trees

In [None]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download dataset 45 from the UCI ML repository.
heart_disease = fetch_ucirepo(id=45)

# Features and targets are exposed as pandas DataFrames.
X, y = heart_disease.data.features, heart_disease.data.targets

In [None]:
# Inspect the feature DataFrame.
X

In [None]:
# Inspect the target DataFrame as loaded.
y

For simplicity, we binarize the label: it becomes 1 (heart disease) if the original value is greater than 0, and 0 (no heart disease) otherwise.

In [None]:
# Collapse the target to binary: values above 0 become 1 (heart disease),
# everything else becomes 0. The vectorized comparison replaces the
# element-wise `DataFrame.applymap`, which is deprecated since pandas 2.1.
y = (y > 0).astype("int64")
y

In [None]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)
X_train.shape, X_test.shape

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline tree limited to depth 3. The seed is fixed, as for the other seeded
# trees in this notebook: scikit-learn permutes the features at every split,
# so ties between equally good splits would otherwise make the fitted tree
# vary from run to run.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X_train, y_train)

In [None]:
from sklearn import tree

# Draw the fitted baseline tree on a large canvas so node labels stay legible.
fig = plt.figure(figsize=(25, 20))
# `plot_tree` documents `feature_names` as a list of str; some scikit-learn
# releases reject a pandas Index, so the column names are converted explicitly.
_ = tree.plot_tree(dt,
                   feature_names=list(X.columns),
                   class_names=['No Disease', "Disease"],
                   filled=True)

## Initial evaluation of the model performance

In [None]:
# Predicted labels from `dt` for the training and the test partition.
y_train_pred, y_test_pred = dt.predict(X_train), dt.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

## Training Accuracy

In [None]:
# Share of training rows predicted correctly, followed by the confusion matrix
# (scikit-learn puts true labels on rows and predicted labels on columns).
print(accuracy_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)

## Testing Accuracy

In [None]:
# Same two measurements on the held-out test rows.
print(accuracy_score(y_test, y_test_pred))
confusion_matrix(y_test, y_test_pred)

# Helper functions for tree plotting and performance evaluation

In [None]:
def get_dt_graph(dt_classifier):
    """Draw a fitted decision tree on a new 25x20 inch figure and return it.

    Args:
        dt_classifier: Fitted tree estimator accepted by ``tree.plot_tree``.

    Returns:
        The matplotlib figure holding the plot, so callers can keep, save or
        close it. (The helper used to return ``None``, which made
        ``gph = get_dt_graph(...)`` bind nothing useful.)
    """
    # Prefer the column names the estimator recorded when it was fitted on a
    # DataFrame; fall back to the notebook-level feature frame otherwise.
    feature_names = getattr(dt_classifier, "feature_names_in_", None)
    if feature_names is None:
        feature_names = X.columns

    fig = plt.figure(figsize=(25, 20))
    # `plot_tree` documents `feature_names` as a list of str; some scikit-learn
    # releases reject a pandas Index, hence the explicit list().
    tree.plot_tree(dt_classifier,
                   feature_names=list(feature_names),
                   class_names=['No Disease', "Disease"],
                   filled=True)
    return fig

In [None]:
def evaluate_model(dt_classifier):
    """Print accuracy and confusion matrix for the train and test splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` and writes the report to stdout.

    Args:
        dt_classifier: Fitted estimator exposing ``predict``.

    Returns:
        None.
    """
    # Predict once per split and reuse the result for both metrics instead of
    # calling `predict` again for every metric.
    train_pred = dt_classifier.predict(X_train)
    test_pred = dt_classifier.predict(X_test)

    print("Train Accuracy :", accuracy_score(y_train, train_pred))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, train_pred))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, test_pred))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, test_pred))

# Training without setting any hyperparameter

In [None]:
# Tree grown with scikit-learn's default hyperparameters; only the seed is fixed.
dt_default = DecisionTreeClassifier(random_state=42)
dt_default.fit(X_train, y_train)

In [None]:
# Plot the tree trained with default hyperparameters.
gph = get_dt_graph(dt_default)

In [None]:
# Train/test accuracy and confusion matrices for the default tree.
evaluate_model(dt_default)

The model is overfitting: its test accuracy is lower than its training accuracy.

# Controlling the Depth

In [None]:
# Limit the tree to depth 3. The seed is fixed like the other seeded trees in
# this notebook so that ties between equally good splits resolve the same way
# on every run.
dt_depth = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_depth.fit(X_train, y_train)

In [None]:
# Plot the depth-limited tree.
gph = get_dt_graph(dt_depth)

In [None]:
# Train/test accuracy and confusion matrices for the depth-limited tree.
evaluate_model(dt_depth)

# Specifying Minimum Samples Before Splitting

In [None]:
# A node is split only if it holds at least 20 samples. The seed is fixed like
# the other seeded trees in this notebook so that ties between equally good
# splits resolve the same way on every run.
dt_min_split = DecisionTreeClassifier(min_samples_split=20, random_state=42)
dt_min_split.fit(X_train, y_train)

In [None]:
# Plot the tree constrained by min_samples_split.
gph = get_dt_graph(dt_min_split)

In [None]:
# Train/test accuracy and confusion matrices for the min_samples_split tree.
evaluate_model(dt_min_split)

# Specifying Minimum Samples in Leaf Node

In [None]:
# Require at least 20 training samples in every leaf.
dt_min_leaf = DecisionTreeClassifier(min_samples_leaf=20, random_state=42)
dt_min_leaf.fit(X_train, y_train)

In [None]:
# Plot the tree constrained by min_samples_leaf.
gph = get_dt_graph(dt_min_leaf)

In [None]:
# Train/test accuracy and confusion matrices for the min_samples_leaf tree.
evaluate_model(dt_min_leaf)

# Changing the Split Criterion (Gini → Entropy)

In [None]:
# Same leaf-size constraint and seed as dt_min_leaf, but splits are scored by
# entropy instead of scikit-learn's default Gini impurity.
dt_min_leaf_entropy = DecisionTreeClassifier(min_samples_leaf=20, random_state=42, criterion="entropy")
dt_min_leaf_entropy.fit(X_train, y_train)

In [None]:
# Plot the entropy-based tree.
gph = get_dt_graph(dt_min_leaf_entropy)

In [None]:
# Train/test accuracy and confusion matrices for the entropy-based tree.
evaluate_model(dt_min_leaf_entropy)

# How To Make it More Seamless
Hyperparameter Tuning

In [None]:
# Unfitted base estimator handed to the grid search below. Note that this
# rebinds `dt`, which earlier in the notebook referred to the fitted depth-3 tree.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# params selection
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

In [None]:
# Exhaustive search over `params` with 4-fold cross-validation, ranked by
# accuracy; n_jobs=-1 lets scikit-learn use every available CPU core.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=params,
    cv=4,
    n_jobs=-1,
    verbose=1,
    scoring="accuracy",
)

In [None]:
# Run the cross-validated search on the training split only; the test split
# stays untouched for the final evaluation.
grid_search.fit(X_train, y_train)

In [None]:
# Show the estimator configured with the best-scoring parameter combination.
grid_search.best_estimator_

In [None]:
# Keep a handle on the winning tree for the evaluation cells that follow.
dt_best = grid_search.best_estimator_

In [None]:
# Train/test accuracy and confusion matrices for the tuned tree.
evaluate_model(dt_best)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 of the tuned tree on the held-out test set.
print(classification_report(y_test, dt_best.predict(X_test)))