In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Decision trees

In [None]:
import math
import pandas as pd
import numpy as np
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn
# Default size (width, height in inches) for every figure drawn below.
plt.rcParams['figure.figsize'] = (15, 10)

# 1. Weather dataset (typical example of decision tree usage)

In [None]:
# Read the weather table from the relative path below and display it.
# The commented-out line is an alternative CSV left by the author.
# data = pd.read_csv('data/tenis.csv')
data = pd.read_csv(filepath_or_buffer='data/weather.csv')
data

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree. random_state is fixed because scikit-learn
# permutes the features at every split, so equally good splits would
# otherwise be chosen differently from run to run.
cls = DecisionTreeClassifier(criterion='entropy', random_state=0)

# Rows with index < 14 form the training set (the row with index 14 is used
# as the test sample further down).
# X = data.loc[data.index < 14, ['Outlook', 'Humidity', 'Wind']]
X = data.loc[data.index < 14, ['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = data.loc[data.index < 14, 'Play']

# Deliberate demonstration: fitting on the raw columns is expected to fail
# because scikit-learn only accepts numeric features (presumably the raw
# columns hold text categories -- confirm against the CSV). The error is
# caught and shown so that "Restart & Run All" still reaches the next cells.
try:
    cls.fit(X, y)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

### The previous code is written that way intentionally :D — Scikit-learn vie pracovať len s numerickými hodnotami.

In [None]:
# One-hot encoding of every column except the predicted one ('Play').
# The columns are visited in their DataFrame order instead of via a Python
# set: iteration order of a set of strings changes between interpreter runs
# (hash randomisation), which would reshuffle the encoded columns -- and with
# them the fitted tree -- on every kernel restart.
encoded = pd.concat(
    [
        pd.get_dummies(data[column], prefix=column)
        for column in data.columns
        if column != 'Play'
    ],
    axis=1,
)
encoded

### get_dummies is not a good encoding method! We can try other, better encoders:
https://github.com/FIIT-IAU/IAU-course/blob/main/exercises/week-06/IAU_061_outlier_missing_transform.ipynb

## 1.1 Natrénujem klasifikátor na trénovacích dátach

(posledný riadok bude testovacie dáta)

In [None]:
# Training data: all encoded rows with index below 14, together with the
# 'Play' labels of the same rows taken from the original frame.
X = encoded.loc[encoded.index < 14]
y = data['Play'].loc[data.index < 14]

# Re-fit the classifier, this time on purely numeric (one-hot) features.
cls.fit(X=X, y=y)

In [None]:
# Test sample: the encoded row with index 14, which the training filter
# (index < 14) left out. The prediction is shown as the cell output.
test = encoded.loc[encoded.index == 14]
cls.predict(X=test)

## 1.2 Natrénovaný strom (klasifikátor) si môžem vizualizovať

In [None]:
from sklearn.tree import export_graphviz
from graphviz import Source
from IPython.display import SVG
from IPython.display import HTML

# Export the fitted tree as Graphviz DOT text (out_file=None makes
# export_graphviz return the text instead of writing a file) and wrap it in a
# graphviz Source so it can be rendered.
# feature_names is converted from a pandas Index to a plain list because some
# scikit-learn releases validate this parameter strictly as a list.
# NOTE(review): class_names must follow the order of cls.classes_ -- confirm
# that the 'Play' labels sort as no < yes.
graph = Source(export_graphviz(cls,
                               out_file=None,
                               feature_names=list(encoded.columns),
                               class_names=['no', 'yes'],
                               filled=True))

# Render to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

# This style is here only so that the picture fits on the screen.
style = "<style>svg{width:100% !important;height:70% !important;}</style>"
HTML(style)

In [None]:
# Matplotlib rendering of the same fitted tree. Feature and class names are
# passed so the nodes show readable labels instead of X[i] / class indices;
# the trailing semicolon suppresses the list of annotation objects that
# plot_tree returns.
# NOTE(review): class_names must follow the order of cls.classes_ -- confirm
# that the 'Play' labels sort as no < yes.
tree.plot_tree(cls,
               feature_names=list(encoded.columns),
               class_names=['no', 'yes'],
               filled=True);

# 2. Breast cancer dataset (overfitting example)

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset; the returned container exposes its parts
# both as dictionary keys and as attributes.
data = load_breast_cancer()

# Inputs and the names of the individual features.
features = data.data
feature_names = data.feature_names

# Targets and the names of the classes they encode.
labels = data.target
label_names = data.target_names

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split the data: 20 % is held out for testing; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=4,
)

# Report how many samples ended up in each part.
print('# train data: ', len(X_train))
print('# test data: ', len(X_test))

## 2.1 Natrénujem si viacero stromov
**kde každému obmedzím jeho maximálnu hĺbku aby som vytvoril stromy s rôznou zložitosťou.**

In [None]:
# Train one decision tree per maximum depth, from 1 up to the number of
# features; the depth limit stands in for model complexity. For every tree
# the train and test error (1 - accuracy) is recorded in `results` and the
# fitted estimator is kept in `estimators` (same order, index = depth - 1).
results = []
estimators = []
for i in range(1, X_train.shape[1] + 1):
    row = {'model_complexity': i}

    # Create the decision tree with the current depth limit.
    # random_state is fixed: scikit-learn permutes the features at each
    # split, so without a seed equally good splits are chosen differently on
    # every run and the error curve (and any estimators[k] picked from it)
    # would not be reproducible.
    clf = DecisionTreeClassifier(max_depth=i, random_state=4)

    # Fit the model and predict on the training set.
    pred = clf.fit(X_train, y_train).predict(X_train)

    # Error on the training set.
    row['train'] = 1 - accuracy_score(y_train, pred)

    # Prediction on the held-out test set.
    pred = clf.predict(X_test)

    # Error on the test set.
    row['test'] = 1 - accuracy_score(y_test, pred)
    results.append(row)
    estimators.append(clf)

In [None]:
# One row per trained tree: its complexity plus the train and test error.
complexity_df = pd.DataFrame(data=results)
# Preview the first five rows.
complexity_df.head(5)

In [None]:
# Train and test error as a function of model complexity. Axis labels and a
# title are set so the figure can be read on its own; the trailing semicolon
# suppresses the return value of ax.set().
ax = complexity_df.plot(x='model_complexity')
ax.set(
    xlabel='model complexity (max_depth)',
    ylabel='error (1 - accuracy)',
    title='Train vs. test error by tree depth',
);

S rastúcou zložitosťou modelu sa mi nijak nezmenšuje chyba na trénovacej vzorke. Na testovacej tiež nie. Väčšinou sa dokonca zväčšuje. Toto je indikátor toho, že sme ten model preučili. Naučil sa dáta a nie vzťahy za nimi. Model zle zovšeobecňuje / generalizuje vzory v dátach. Ak skúšame predikciu na iných dátach, tak narazíme na veľkú chybu spôsobenú varianciou.

## 2.2 Môžeme si skúsiť vizualizovať rôzne natrénované modely

In [None]:
# Shallowest model: only a single attribute is used for the decision.
# feature_names / class_names are NumPy arrays here; they are converted to
# plain lists because some scikit-learn releases validate these parameters
# strictly as lists.
graph = Source(export_graphviz(estimators[0],
                               out_file=None,
                               feature_names=list(feature_names),
                               class_names=list(label_names),
                               filled=True))

# Render to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

# This style is here only so that the picture fits on the screen.
from IPython.display import HTML
style = "<style>svg{width:100% !important;height:100% !important;}</style>"
HTML(style)

In [None]:
# The model after which the error on the validation sample started to grow
# (index 4, i.e. max_depth 5 -- NOTE(review): hard-coded, re-check against the
# error curve above).
# feature_names / class_names are NumPy arrays here; they are converted to
# plain lists because some scikit-learn releases validate these parameters
# strictly as lists.
graph = Source(export_graphviz(estimators[4],
                               out_file=None,
                               feature_names=list(feature_names),
                               class_names=list(label_names),
                               filled=True))

# Render to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

# This style is here only so that the picture fits on the screen.
from IPython.display import HTML
style = "<style>svg{width:50% !important;height:50% !important;}</style>"
HTML(style)

In [None]:
# The most complex model (last one trained, i.e. the largest depth limit).
# feature_names / class_names are NumPy arrays here; they are converted to
# plain lists because some scikit-learn releases validate these parameters
# strictly as lists.
graph = Source(export_graphviz(estimators[-1],
                               out_file=None,
                               feature_names=list(feature_names),
                               class_names=list(label_names),
                               filled=True))

# Render to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

# This style is here only so that the picture fits on the screen.
from IPython.display import HTML
style = "<style>svg{width:45% !important;height:45% !important;}</style>"
HTML(style)