In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

heartr_disease = pd.read_csv('data/heart-disease.csv')

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)
X = heartr_disease.drop('target', axis=1) 
y = heartr_disease['target'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Ulepszanie modelu klasyfikacji
##### Pierwsze prognozy = prognozy bazowe
##### Pierwszy model = model bazowy

### 1. Z perspektywy danych:
* Czy możemy zebrać więcej danych? (ogólnie rzecz biorąc, im więcej danych, tym lepiej)
* Czy możemy poprawić nasze dane?

### 2. Z perspektywy modelu:
* Czy istnieje lepszy model, którego moglibyśmy użyć?
* Czy możemy ulepszyć obecny model?

### 3. Hiperparametry a parametry:
* Parametry = model znajduje je sam wzorce w danych.
* Hiperparametry = ustawienia modelu, które można dostosować, aby (potencjalnie) poprawić jego zdolność do znajdowania wzorców.

In [2]:
clf = RandomForestClassifier(n_estimators=100)
# pobranie hiperparametrów
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### 4. Trzy sposoby dostosowania hiperparametrów:
* Ręcznie
* Losowo za pomocą RandomSearchCV
* Kompleksowo za pomocą GridSearchCV

In [3]:
# funkcja do wyświetlenia oceny
def evaluate_preds(y_true, y_preds):
    '''
    Przeprowadzenie porównania oceny, y_true (prawdziwe wyjścia) z y_pred (przewidywania) w modelu klasyfikacji.
    '''
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    metric_dict = {'accuracy': round(accuracy, 2),
                  'precision': round(precision,2),
                  'recall': round(recall,2),
                  'f1': round(f1, 2)}
    print(f"Accurracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1: {f1:.2f}")
    return metric_dict

# Ręcznie

Parametry do rególacji w funkcji RandomForestClassifier():
* 'max_depth'
* 'max_features'
* 'min_samples_leaf'
*  'min_samples_split
* 'n_estimators'
* ...

### Trzeba podzielić dane na trzy częsci: training, validation, test.

In [4]:
np.random.seed(42)
heartr_disease_shuffled = heartr_disease.sample(frac=1) # zwraca losowo część danych, frac określa ile ma być tych danych do 0 - 1 (0-100%)
# po tej operacji w heartr_disease_shuffled jest tyle samo danych ale są wymieszanie wierszami

In [5]:
X = heartr_disease_shuffled.drop('target', axis=1) 
y = heartr_disease_shuffled['target'] 

In [6]:
# rozdzielenie danych na trzy części
# końce indeksów danych
train_split = round(0.7 * len(heartr_disease_shuffled)) # 70%
valid_split = round(train_split + 0.15 * len(heartr_disease_shuffled)) # 15%
# dane
X_train, y_train = X[:train_split], y[:train_split]
X_valid, y_valid = X[train_split:valid_split], y[train_split:valid_split]
X_test, y_test = X[valid_split:], y[valid_split:]

### Dopasowanie drzewa i ocena

In [7]:
np.random.seed(42)
clf = RandomForestClassifier(n_estimators=6) 
clf.fit(X_train, y_train)

# prognozy bazowe dla valid
y_preds = clf.predict(X_valid)

# ocena dla valid
baseline_metrics = evaluate_preds(y_valid, y_preds)

Accurracy: 75.56%
Precision: 0.82
Recall: 0.72
F1: 0.77


In [8]:
# n_estimators - ilość drzew kwlaifikacyjnych w modelu
np.random.seed(42)
clf_2 = RandomForestClassifier(n_estimators=100) # piserwsza regulacja
clf_2.fit(X_train, y_train)
y_preds_2 = clf_2.predict(X_valid)
clf_2_metrics = evaluate_preds(y_valid, y_preds_2)

Accurracy: 82.22%
Precision: 0.84
Recall: 0.84
F1: 0.84


In [9]:
# max_depth - Maksymalna głębokość drzewa. Jeśli brak, węzły są rozwijane dopóki wszystkie liście nie będą czyste 
# lub dopóki wszystkie liście nie będą zawierać mniej niż min_samples_split próbek.
np.random.seed(42)
clf_3 = RandomForestClassifier(n_estimators=100, max_depth=10) # piserwsza regulacja
clf_3.fit(X_train, y_train)
y_preds_3 = clf_3.predict(X_valid)
clf_3_metrics = evaluate_preds(y_valid, y_preds_3)

Accurracy: 80.00%
Precision: 0.81
Recall: 0.84
F1: 0.82


#### Reczne ustawianie parametrów oprócz n_estimators trochę mija się z celem, po fit ustawia wszystko najlepiej automatycznie

# Losowo za pomocą RandomSearchCV
##### wyszukuje określoną i losową ilość kombinacj wg grid, parametr n_iter, i wybiera najlepszą

In [14]:
from sklearn.model_selection import RandomizedSearchCV

grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
        'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [2, 4, 6],
        'min_samples_split': [1, 2, 4]}

np.random.seed(42)
clf = RandomForestClassifier(n_jobs=-1) # The number of jobs to run in parallel. -1 -> wszystkie

# Losowe wyszukiwanie na hiper parametrach.
# RandomizedSearchCV implementuje metodę "fit" i metodę "score". 
# Implementuje również "score_samples", "predict", "predict_proba", "decision_function", "transform" i "inverse_transform", 
# jeśli są one zaimplementowane w używanym estymatorze.
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10, # number of models to try, ilość kombinacji na grid
                            cv=5, # cross-validation
                            verbose=2)

# dopasowanie drzewa i ustawienie hiperparametrów
rs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=1, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=1, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=1, n_estimato

35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "D:\__EnvPyTorch\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\__EnvPyTorch\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "D:\__EnvPyTorch\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "D:\__EnvPyTorch\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validati

In [15]:
# najlepsze parametry
rs_clf.best_params_

{'n_estimators': 1200,
 'min_samples_split': 4,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 30}

In [16]:
# ocena danych valid
y_preds_rs = rs_clf.predict(X_valid)
rs_clf_metrics = evaluate_preds(y_valid, y_preds_rs)

Accurracy: 84.44%
Precision: 0.82
Recall: 0.92
F1: 0.87


In [17]:
# ocena danych test
y_preds = rs_clf.predict(X_test)
evaluate_preds(y_test, y_preds)

Accurracy: 82.61%
Precision: 0.87
Recall: 0.80
F1: 0.83


{'accuracy': 0.83, 'precision': 0.87, 'recall': 0.8, 'f1': 0.83}

# Kompleksowo za pomocą GridSearchCV
##### sprawdza wszystkie kombinacje wg grid i wybiera najlepszą

In [19]:
from sklearn.model_selection import GridSearchCV
grid

{'n_estimators': [10, 100, 200, 500, 1000, 1200],
 'max_depth': [None, 5, 10, 20, 30],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2, 4, 6],
 'min_samples_split': [1, 2, 4]}

In [20]:
# zmniejszona liczba parametrów żeby liczyło szybciej
grid_2 = {'n_estimators': [100, 200, 500],
          'max_depth': [None],
          'max_features': ['auto', 'sqrt'],
          'min_samples_leaf': [6],
          'min_samples_split': [1, 2]}


In [27]:
np.random.seed(42)
clf = RandomForestClassifier(n_jobs=-1) # The number of jobs to run in parallel. -1 -> wszystkie
gs_clf = GridSearchCV(estimator=clf,
                      param_grid=grid_2,
                      cv=5,
                      verbose=2)
gs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_

1800 fits failed out of a total of 2700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1350 fits failed with the following error:
Traceback (most recent call last):
  File "D:\__EnvPyTorch\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\__EnvPyTorch\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "D:\__EnvPyTorch\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "D:\__EnvPyTorch\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_va

In [28]:
# najlepsze parametry
gs_clf.best_params_

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 10}

In [29]:
# ocena danych valid
y_preds_gs = gs_clf.predict(X_valid)
gs_clf_metrics = evaluate_preds(y_valid, y_preds_gs)

Accurracy: 73.33%
Precision: 0.76
Recall: 0.76
F1: 0.76


In [30]:
# ocena danych test
y_preds = gs_clf.predict(X_test)
evaluate_preds(y_test, y_preds)

Accurracy: 76.09%
Precision: 0.82
Recall: 0.72
F1: 0.77


{'accuracy': 0.76, 'precision': 0.82, 'recall': 0.72, 'f1': 0.77}