In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Decision trees

In [None]:
import math
import pandas as pd
import numpy as np
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn
# Default size (width, height) used for every matplotlib figure in this notebook.
plt.rcParams['figure.figsize'] = (15, 10)

# 1. Weather dataset (typical example of decision tree usage)

In [None]:
# Read the weather dataset into a DataFrame and show it.
# Alternative input left disabled by the author: pd.read_csv('data/tenis.csv')
data = pd.read_csv(filepath_or_buffer='data/weather.csv')
data

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree. `random_state` is fixed so that repeated runs
# build the same tree (scikit-learn permutes the features at every split).
cls = DecisionTreeClassifier(criterion='entropy', random_state=0)

# Raw attributes and target of the rows whose index is below 14.
# A reduced attribute set would be ['Outlook', 'Humidity', 'Wind'].
X = data.loc[data.index < 14, ['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = data.loc[data.index < 14, 'Play']

# This fit is expected to fail: the attributes are not encoded as numbers yet.
# The error is caught and printed so that the demonstration stays visible
# without stopping a "Restart Kernel -> Run All" execution of the notebook.
try:
    cls.fit(X, y)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

### The error in the previous cell is intentional :D Scikit-learn can only work with numerical values

In [None]:
# One-hot encode every column except the predicted one ('Play').
# The columns are walked in their DataFrame order rather than through a set:
# iterating a set of strings can yield a different order in each Python process
# (string hashing is randomized), which would shuffle the columns of `encoded`
# between kernel restarts.
encoded = pd.concat(
    [
        pd.get_dummies(data[column], prefix=column)
        for column in data.columns
        if column != 'Play'
    ],
    axis=1,
)
encoded

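### `get_dummies` is not a good encoding method!! It does not remember the categories it has seen, so the same encoding cannot be re-applied reliably to new data. We can try other, better encoding methods:
https://github.com/FIIT-IAU/IAU-course/blob/main/exercises/week-06/IAU_061_outlier_missing_transform.ipynb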

## 1.1 We will train the classifier on the training data

(the last row will be the test data)

In [None]:
# Training part: one-hot encoded attributes and the 'Play' target
# for the rows whose index is below 14.
X = encoded.loc[encoded.index < 14]
y = data['Play'].loc[data.index < 14]

# Re-fit the classifier, this time on purely numerical input.
cls.fit(X, y)

In [None]:
# The row with index 14 was held out of training; predict its class.
test = encoded.loc[encoded.index == 14]
cls.predict(test)

## 1.2 We can visualize the trained tree (classifier)

In [None]:
from sklearn.tree import export_graphviz
from graphviz import Source
from IPython.display import HTML, SVG

# Export the fitted classifier to DOT source; nodes are labelled with the
# one-hot column names and filled with a colour.
# NOTE(review): the class names are hard-coded; confirm that they match the
# sorted order of the labels in `cls.classes_`.
dot_source = export_graphviz(
    cls,
    out_file=None,
    feature_names=encoded.columns,
    class_names=['no', 'yes'],
    filled=True,
)
graph = Source(dot_source)

# Render the graph to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

# This is here only so that the image fits on the screen: the rule rescales svg elements.
style = "<style>svg{width:100% !important;height:70% !important;}</style>"
HTML(style)

In [None]:
# Draw the same fitted tree with matplotlib. Feature and class names are passed
# so that the nodes show readable labels instead of positional `x[i]` indices;
# the class names are taken from the fitted classifier so they cannot get out of
# order. The trailing semicolon hides the list of annotation objects that
# plot_tree returns.
tree.plot_tree(
    cls,
    feature_names=list(encoded.columns),
    class_names=[str(label) for label in cls.classes_],
    filled=True,
);

# 2. Breast cancer dataset (overfitting example)

In [None]:
from sklearn.datasets import load_breast_cancer

# Note: `data` is rebound here; from this cell on it holds the breast cancer
# dataset instead of the weather DataFrame loaded earlier.
data = load_breast_cancer()

# Target: class names and the per-sample labels.
label_names, labels = data['target_names'], data['target']

# Inputs: attribute names and the feature matrix.
feature_names, features = data['feature_names'], data['data']

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=4,
)

# Report the size of both parts.
print('# train data: ', X_train.shape[0])
print('# test data: ', X_test.shape[0])

## 2.1 We train several trees
**where we limit each one's maximum depth to create trees of varying complexity.**

In [None]:
results = []      # one record per tree: depth limit, training error, test error
estimators = []   # the fitted trees, in order of increasing depth limit

# Train one tree per depth limit, from 1 up to the number of attributes;
# the depth limit is used as a stand-in for the complexity of the model.
for i in range(1, X_train.shape[1] + 1):
    # `random_state` is fixed so that every run builds the same trees and the
    # error curve (and the trees picked by index further below) is reproducible.
    clf = DecisionTreeClassifier(max_depth=i, random_state=0)
    clf.fit(X_train, y_train)

    # Error is measured as 1 - accuracy, on the training set and on the test set.
    row = {
        'model_complexity': i,
        'train': 1 - accuracy_score(y_train, clf.predict(X_train)),
        'test': 1 - accuracy_score(y_test, clf.predict(X_test)),
    }

    results.append(row)
    estimators.append(clf)

In [None]:
# Collect the per-depth error records into a table and preview its first five rows.
complexity_df = pd.DataFrame(data=results)
complexity_df.head(n=5)

In [None]:
# Training and test error as a function of the depth limit. The axes are
# labelled so that the figure can be read on its own; the trailing semicolon
# hides the textual repr of the returned value.
ax = complexity_df.plot(x='model_complexity')
ax.set(
    xlabel='model complexity (max_depth)',
    ylabel='error (1 - accuracy)',
    title='Training vs. test error by tree depth',
);

As the complexity of the model increases, the error on the training set eventually stops decreasing and remains the same, while the error on the test set often even increases. This is an indicator that we have overfitted the model: it has memorized the training data rather than learned the underlying relationships, so it generalizes the patterns in the data poorly. When we attempt to make predictions on other data, we encounter a large error caused by variance.

## 2.2 We can try to visualize different trained models

In [None]:
from IPython.display import HTML

# First trained tree (depth limit 1): only one attribute is used for the decision.
dot_source = export_graphviz(
    estimators[0],
    out_file=None,
    feature_names=feature_names,
    class_names=label_names,
    filled=True,
)
graph = Source(dot_source)

# Render the graph to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

# This is here only so that the image fits on the screen: the rule rescales svg elements.
style = "<style>svg{width:100% !important;height:100% !important;}</style>"
HTML(style)

In [None]:
from IPython.display import HTML

# Fifth trained tree (depth limit 5). Per the original note, this is the model
# for which the error on the held-out data started to increase.
dot_source = export_graphviz(
    estimators[4],
    out_file=None,
    feature_names=feature_names,
    class_names=label_names,
    filled=True,
)
graph = Source(dot_source)

# Render the graph to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

# This is here only so that the image fits on the screen: the rule rescales svg elements.
style = "<style>svg{width:50% !important;height:50% !important;}</style>"
HTML(style)

In [None]:
from IPython.display import HTML

# Last trained tree, i.e. the most complex model (largest depth limit).
dot_source = export_graphviz(
    estimators[-1],
    out_file=None,
    feature_names=feature_names,
    class_names=label_names,
    filled=True,
)
graph = Source(dot_source)

# Render the graph to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

# This is here only so that the image fits on the screen: the rule rescales svg elements.
style = "<style>svg{width:45% !important;height:45% !important;}</style>"
HTML(style)