In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Decision trees

URLs
- https://scikit-learn.org/0.22/auto_examples/tree/plot_cost_complexity_pruning.html 
- [Post Pruning Decision Trees](https://medium.com/swlh/post-pruning-decision-trees-using-python-b5d4bcda8e23)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Load the Titanic dataset (train dataset only)

In [None]:
# Read the training CSV and separate the predictor columns from the target.
data = pd.read_csv('titanic/train.csv')
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
X, y = data[feature_columns], data['Survived']

# Impute per column type. Numeric gaps keep the original placeholder 0, while
# non-numeric gaps get a text placeholder: filling them with the integer 0
# would mix ints and strings in one column and surface as an opaque "*_0"
# dummy after one-hot encoding.
text_columns = set(X.select_dtypes(exclude='number').columns)
fill_values = {column: ('Unknown' if column in text_columns else 0)
               for column in feature_columns}
# NOTE(review): filling a missing target with 0 would label that row as
# "not survived" - kept from the original; confirm 'Survived' has no gaps.
X, y = X.fillna(fill_values), y.fillna(0)

# One-hot encode the non-numeric columns, then hold out 30% of the rows for
# testing (fixed seed so the split is reproducible).
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

### Overfitting example

In [None]:
# Grow an unpruned tree on the training split, then compare its accuracy on
# the data it was fitted on with its accuracy on the held-out split.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Training accuracy: ', train_accuracy)
print('Test Accuracy: ', test_accuracy)

## Post-pruning
- The DecisionTreeClassifier class in sklearn provides **ccp_alpha** as a parameter for post pruning. 
- The parameter ccp_alpha provides a threshold for effective alphas, i.e. the process of pruning continues until the minimal effective alpha of the pruned tree is greater than ccp_alpha. 
- The DecisionTreeClassifier class also provides a method **cost_complexity_pruning_path** which implements **the pruning process** and returns the effective alphas (and the corresponding impurities of the pruned trees).

In [None]:
# Compute the cost-complexity pruning path on the training split and unpack
# the effective alphas together with their matching impurities.
path = DecisionTreeClassifier(random_state=1).cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
print(path)

# Step plot of impurity against alpha; the final entry of both arrays is
# left out of the figure.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

In [None]:
# Fit one decision tree per effective alpha found on the pruning path, so the
# effect of increasing pruning strength can be compared tree by tree.
clfs = []
for ccp_alpha in ccp_alphas:
    # ccp_alpha has to be non-negative. Clamping at 0 is a no-op for valid
    # values and protects against a marginally negative alpha (floating-point
    # round-off in the path) aborting the whole loop.
    clf = DecisionTreeClassifier(random_state=1, ccp_alpha=max(ccp_alpha, 0.0))
    clf.fit(X_train, y_train)
    clfs.append(clf)

# Report the size of the tree fitted with the largest alpha.
last_tree = clfs[-1].tree_
print(f"Number of nodes in the last tree is: {last_tree.node_count} "
      f"with ccp_alpha: {ccp_alphas[-1]} and a depth of: {last_tree.max_depth}")

#### We remove the last element in **clfs** and **ccp_alphas**, because it is the trivial tree with only one node. Here we show that the number of nodes and tree depth decrease as alpha increases.

In [None]:
# Drop the trivial single-node tree from the end of both sequences. The guard
# makes this cell safe to re-run: an unconditional `clfs = clfs[:-1]` would
# discard one more (non-trivial) tree on every execution.
if clfs and clfs[-1].tree_.node_count == 1:
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]

# plotting: two stacked panels sharing the same alpha axis and line style
fig, ax = plt.subplots(2, 1)
panels = (
    (ax[0], node_counts, "number of nodes", "Number of nodes vs alpha"),
    (ax[1], depth, "depth of tree", "Depth vs alpha"),
)
for axis, values, ylabel, title in panels:
    axis.plot(ccp_alphas, values, marker='o', drawstyle="steps-post")
    axis.set_xlabel("alpha")
    axis.set_ylabel(ylabel)
    axis.set_title(title)
fig.tight_layout()

## Accuracy vs alpha for training and testing sets

In [None]:
# Accuracy of every fitted tree on the training split and on the test split.
train_scores = [tree.score(X_train, y_train) for tree in clfs]
test_scores = [tree.score(X_test, y_test) for tree in clfs]

# plotting: both accuracy curves on one axis, drawn as step functions of alpha
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")

for label, scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, scores, marker='o', label=label, drawstyle="steps-post")

ax.legend()
plt.show()

## Use the best model

In [None]:
# Pick the pruning strength by 5-fold cross-validation on the training split.
# Choosing the tree with the highest *test* score and then reporting that same
# test score would make the final number optimistically biased, because the
# test set would have taken part in model selection.
# cross_val_score fits clones, so the already-fitted trees in clfs are untouched.
cv_scores = [cross_val_score(clf, X_train, y_train, cv=5).mean() for clf in clfs]
index_best_model = np.argmax(cv_scores)
best_model = clfs[index_best_model]

# The test split is now used only once, for the final estimate.
print('Training accuracy of best model: ', best_model.score(X_train, y_train))
print('Test accuracy of best model: ', best_model.score(X_test, y_test))