In [None]:
import sqlite3
import warnings

import matplotlib.pyplot as plt
import pandas as pd
from category_encoders import OrdinalEncoder
from IPython.display import VimeoVideo
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils.validation import check_is_fitted

warnings.simplefilter(action="ignore", category=FutureWarning)

## A few terms to know
1. Depth = the number of splits we need to make to get from the root to a pure node
2. The purity of a node is measured by the Gini impurity (0 means a perfectly pure node)
3. We need to split the data into a training set and a test set, and then split the training set again into training data and a validation set
4. Parametric model: the form of the model's equation is fixed before training even starts. For a linear model, for example, we already know what the equation relating X and y looks like; training only estimates the parameters of that predefined equation
5. Decision tree is a non-parametric method of estimation

## Why ordinal encoding works for Decision Tree
A decision tree does not care about the distance between two categories, only about which split gives the node more purity

simple decision: 
                 
                 decision tree --- ordinal encoder

                 linear/logistic regression --- onehot encoding


In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Carve a validation set (20%) out of the current training data.
# NOTE: X_train / y_train are rebound here, so re-running this cell on its own
# splits the already-reduced training set a second time.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline pipeline: encode the features with OrdinalEncoder, then fit a
# decision tree. No `max_depth` is passed, so the tree depth is not limited here.
model = make_pipeline(
    OrdinalEncoder(),
    DecisionTreeClassifier(random_state=42),
)
# Train on the training split only
model.fit(X_train, y_train)

In [None]:
# Accuracy on the data the model was fit on vs. the validation split
acc_train = model.score(X_train, y_train)
acc_val = model.score(X_val, y_val)

print("Training Accuracy:", round(acc_train, 2))
print("Validation Accuracy:", round(acc_val, 2))

In [None]:
############## hyperparameter tuning ##############
# A first-pass model usually does not generalize well: with no depth limit the
# tree keeps splitting to make its nodes as pure as possible. We need to trim
# the tree (limit its depth) to avoid that.
tree_depth = model.named_steps["decisiontreeclassifier"].get_depth()
print("Tree Depth:", tree_depth)

# Candidate `max_depth` values to try: 1, 3, 5, ..., 49
depth_hyperparams = range(1, 50, 2)

In [None]:
# Accuracy scores collected per candidate depth, in the same order as
# `depth_hyperparams`
training_acc = []
validation_acc = []

for d in depth_hyperparams:
    # Create model with `max_depth` of `d` (same encoder and seed as the
    # baseline pipeline so only the depth varies between runs)
    test_model = make_pipeline(
        OrdinalEncoder(),
        DecisionTreeClassifier(max_depth=d, random_state=42),
    )
    # Fit model to training data
    test_model.fit(X_train, y_train)
    # Calculate training accuracy score and append to `training_acc`
    training_acc.append(test_model.score(X_train, y_train))
    # Calculate validation accuracy score and append to `validation_acc`
    validation_acc.append(test_model.score(X_val, y_val))

print("Training Accuracy Scores:", training_acc[:3])
print("Validation Accuracy Scores:", validation_acc[:3])

In [None]:
## finding best max_depth parameter
# Plot training and validation accuracy against each candidate `max_depth`
plt.plot(depth_hyperparams, training_acc, label="training")
plt.plot(depth_hyperparams, validation_acc, label="validation")
plt.xlabel("Max Depth")
plt.ylabel("accuracy score")
plt.title("Accuracy vs. Max Depth")
# The `label=` values only appear once a legend is drawn; without it the two
# curves cannot be told apart.
plt.legend();


In [None]:
## select best param val and retrain the model
# max_depth=6 stands in for the depth chosen from the accuracy curves
model = make_pipeline(
    OrdinalEncoder(),
    DecisionTreeClassifier(max_depth=6, random_state=42),
)
# Fit model to training data
model.fit(X_train, y_train)

# Score the retrained model on the test split
test_acc = model.score(X_test, y_test)
print("Test Accuracy:", round(test_acc, 2))

In [None]:
# Large canvas so the node text stays readable
fig, ax = plt.subplots(figsize=(25, 12))
# Draw the fitted tree from the pipeline onto that axis
plot_tree(
    decision_tree=model.named_steps["decisiontreeclassifier"],
    feature_names=list(X_train.columns),
    max_depth=3,  # Only display first 3 levels
    proportion=True,  # Display proportions rather than raw counts
    filled=True,  # Color nodes by class
    rounded=True,  # Rounded node boxes
    fontsize=12,  # Enlarge font
    ax=ax,  # Place in figure axis
);


In [None]:
# Feature names are taken from the training frame's columns; importances are
# read from the fitted tree inside the pipeline.
# NOTE(review): assumes the encoder keeps one output column per input column,
# so names and importances line up -- confirm.
features = list(X_train.columns)
importances = model.named_steps["decisiontreeclassifier"].feature_importances_

print("Features:", features[:3])
print("Importances:", importances[:3])

In [None]:
# Pair each importance with its feature name, sorted in ascending order
feat_imp = pd.Series(data=importances, index=features).sort_values()
feat_imp.head()

In [None]:
# Horizontal bars, one per feature, in the order stored in `feat_imp`
feat_imp.plot.barh()
plt.xlabel("Gini Importance");