<a href="https://colab.research.google.com/github/G2454/UTFPR-IA-25.2/blob/main/Hyperparameter_DecisionTree_BreastCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Base de Dados: Breast Cancer
* https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
* Classe: Diagnosis (M = malignant, B = benign)


In [None]:
import numpy as np
import pandas as pd

# Modelo machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Validação Cruzada
from sklearn.model_selection import (
    KFold,
    LeaveOneOut,
    StratifiedKFold,
    cross_validate
)

# Métricas
from sklearn.metrics import (recall_score,
                             accuracy_score,
                             precision_score,
                             f1_score)
from sklearn.metrics import classification_report

In [None]:
def carregaBaseDados(nome):
  """Read a CSV source and return its contents as a pandas DataFrame.

  Args:
    nome: path or file-like object, forwarded unchanged to ``pd.read_csv``.

  Returns:
    The DataFrame produced by ``pd.read_csv`` with its default options.
  """
  base = pd.read_csv(nome)
  return base

Pré-processamento

In [None]:
def preProcessamento(dataframe, rem_cols, class_column, normalization_cols):
  """Drop unused columns, encode the class column and standardize features.

  The input frame is left untouched: all changes are applied to a new
  DataFrame, so calling this function does not alter the caller's data.

  Args:
    dataframe: source DataFrame.
    rem_cols: labels of the columns to remove.
    class_column: label of the target column; its values are replaced by the
      integer codes produced by ``LabelEncoder``.
    normalization_cols: labels of the columns to standardize with
      ``StandardScaler``.

  Returns:
    A new DataFrame without ``rem_cols``, with ``class_column`` encoded and
    ``normalization_cols`` standardized.

  Raises:
    KeyError: if any label in ``rem_cols`` is not a column of ``dataframe``.
  """
  # drop() without inplace returns a new frame, so the caller's object is never modified
  processado = dataframe.drop(columns=rem_cols)

  # Replace the class labels with integer codes
  processado[class_column] = LabelEncoder().fit_transform(processado[class_column])

  # NOTE: the scaler is fit on every row passed in. If the frame is split into
  # train/test afterwards, test-set statistics take part in the scaling.
  processado[normalization_cols] = StandardScaler().fit_transform(
      processado[normalization_cols]
  )

  return processado


In [None]:
# Split a dataframe into features (X) and target (y)
def separaClasse(dataframe, classe):
  """Return ``(X, y)`` for the given class column.

  Args:
    dataframe: DataFrame holding both features and the class column.
    classe: label of the class column.

  Returns:
    A tuple ``(X, y)`` where ``X`` is ``dataframe`` without ``classe`` and
    ``y`` is ``dataframe[classe]``.
  """
  alvo = dataframe[classe]
  atributos = dataframe.drop(columns=classe)
  return atributos, alvo


### Amostragem

Holdout

In [None]:
# Holdout approach: split the data into train and test sets (70%/30% by default)
def separaTreinoTeste(X, y, test_size=0.3, random_state=42, stratify=False):
  """Split features and target into train and test partitions.

  Args:
    X: feature matrix.
    y: target values.
    test_size: fraction of the samples placed in the test partition.
    random_state: seed forwarded to ``train_test_split`` so the same split is
      produced on every run; pass ``None`` for a different split each call.
    stratify: when true, the split preserves the class proportions of ``y``.

  Returns:
    ``X_train, X_test, y_train, y_test`` as returned by ``train_test_split``.
  """
  return train_test_split(
      X, y,
      test_size=test_size,
      random_state=random_state,
      stratify=y if stratify else None,
  )

K-fold Cross-validation

In [None]:
def KFCross(model, X, y, n_splits=10, random_state=42):
  """Evaluate a model's accuracy with shuffled K-fold cross-validation.

  Args:
    model: an estimator instance, or (for backward compatibility) a string
      holding a Python expression that builds one, e.g.
      ``'DecisionTreeClassifier()'``.
    X: feature matrix.
    y: target values.
    n_splits: number of folds.
    random_state: seed for the fold shuffling, so the same folds are used on
      every run. It does not seed the estimator itself.

  Returns:
    The dict returned by ``cross_validate``; per-fold accuracies are under
    the ``'test_score'`` key.
  """
  # NOTE(security): eval() executes arbitrary code. It is kept only so existing
  # string-based callers keep working; never pass untrusted text here.
  estimator = eval(model) if isinstance(model, str) else model
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  return cross_validate(estimator, X, y, scoring='accuracy', cv=kf)


Stratified K-fold Cross-validation

In [None]:
def Skf(model, X, y, n_splits=10, random_state=42):
  """Evaluate a model's accuracy with shuffled stratified K-fold cross-validation.

  Args:
    model: an estimator instance, or (for backward compatibility) a string
      holding a Python expression that builds one, e.g.
      ``'DecisionTreeClassifier()'``.
    X: feature matrix.
    y: target values, also used to stratify the folds.
    n_splits: number of folds.
    random_state: seed for the fold shuffling, so the same folds are used on
      every run. It does not seed the estimator itself.

  Returns:
    The dict returned by ``cross_validate``; per-fold accuracies are under
    the ``'test_score'`` key.
  """
  # NOTE(security): eval() executes arbitrary code. It is kept only so existing
  # string-based callers keep working; never pass untrusted text here.
  estimator = eval(model) if isinstance(model, str) else model
  skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  return cross_validate(estimator, X, y, scoring='accuracy', cv=skf)

### Modelo preditivo

In [None]:
# Build and fit the predictive model
def geraModelo(modelo, X, y):
  """Fit an estimator on ``X``/``y`` and return it.

  Args:
    modelo: an estimator instance exposing ``fit(X, y)``, or (for backward
      compatibility) a string holding a Python expression that builds one,
      e.g. ``'DecisionTreeClassifier()'``.
    X: training features.
    y: training targets.

  Returns:
    The fitted estimator. When an instance is passed in, that same object is
    fitted and returned.
  """
  # NOTE(security): eval() executes arbitrary code. It is kept only so existing
  # string-based callers keep working; never pass untrusted text here.
  estimator = eval(modelo) if isinstance(modelo, str) else modelo
  estimator.fit(X, y)
  return estimator


### Métricas

In [None]:
def metricaReport(y_test, y_pred):
  """Print scikit-learn's text classification report.

  Args:
    y_test: true labels.
    y_pred: predicted labels, aligned with ``y_test``.
  """
  relatorio = classification_report(y_test, y_pred)
  print(relatorio)

### Realizando testes com os dados:

In [None]:
# Load the dataset into a DataFrame; 'data.csv' is a relative path, so the file
# must be present in the notebook's working directory
df = carregaBaseDados('data.csv')

In [None]:
# Drop the non-feature columns, encode 'diagnosis' and standardize the 30
# numeric features (grouped below by their _mean / _se / _worst suffix)
df = preProcessamento(
    dataframe=df,
    rem_cols=['id', 'Unnamed: 32'],
    class_column='diagnosis',
    normalization_cols=[
        # *_mean
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
        'smoothness_mean', 'compactness_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        # *_se
        'radius_se', 'texture_se', 'perimeter_se', 'area_se',
        'smoothness_se', 'compactness_se', 'concavity_se',
        'concave points_se', 'symmetry_se', 'fractal_dimension_se',
        # *_worst
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
        'smoothness_worst', 'compactness_worst', 'concavity_worst',
        'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
    ],
)

In [None]:
# Split the preprocessed frame into features (X) and the 'diagnosis' target (y)
X, y = separaClasse(df,'diagnosis')

In [None]:
# Build the holdout train and test partitions
X_train, X_test, y_train, y_test = separaTreinoTeste(X,y)

In [None]:
# Fit a default decision tree on the training partition and evaluate it on the
# held-out test partition
modelo = geraModelo('DecisionTreeClassifier()', X_train, y_train)
y_pred = modelo.predict(X_test)       # predictions, reused by the report cell below
score = modelo.score(X_test, y_test)  # accuracy on the test partition
print(score)

0.9122807017543859


In [None]:
# Evaluate the model with more metrics: per-class precision, recall and F1
metricaReport(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.95      0.92      0.93       114
           1       0.85      0.89      0.87        57

    accuracy                           0.91       171
   macro avg       0.90      0.91      0.90       171
weighted avg       0.91      0.91      0.91       171



Testando validação cruzada

In [None]:
# 10-fold cross-validation on the full dataset; prints the per-fold accuracies
# followed by their mean
cv = KFCross('DecisionTreeClassifier()', X,y)
print(f"{cv['test_score']}\nMedia: {np.mean(cv['test_score'])}")

[0.94736842 0.94736842 0.98245614 0.89473684 0.85964912 0.92982456
 0.9122807  0.92982456 0.9122807  0.92857143]
Media: 0.924436090225564


In [None]:
# Same evaluation with stratified folds; prints the per-fold accuracies
# followed by their mean
cv = Skf('DecisionTreeClassifier()', X,y)
print(f"{cv['test_score']}\nMedia: {np.mean(cv['test_score'])}")

[0.92982456 0.9122807  0.94736842 0.85964912 0.92982456 0.9122807
 0.94736842 0.96491228 0.98245614 0.91071429]
Media: 0.9296679197994988


### Técnicas de ajuste de hiperparâmetros
- Abordagens disponíveis no scikit-learn:
    - GridSearchCV: considera exaustivamente todas as combinações de parâmetros;
    - RandomizedSearchCV: pesquisa aleatória de parâmetros, em que cada configuração é amostrada a partir de uma distribuição de possíveis valores de parâmetro.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import sys

#### Árvore de Decisão

In [None]:
# Base estimator for both searches. The fixed seed makes the tree building
# (notably splitter='random' and the feature sub-sampling options) repeatable,
# so the same hyperparameters yield the same scores on every run.
DT = DecisionTreeClassifier(random_state=42)

# Hyperparameter search space: 3 criteria x 2 splitters x 5 max_features values
# NOTE(review): sys.maxsize requests more features than the dataset has;
# presumably it is meant as "use all features", which 1.0 and None already
# express. Confirm the installed scikit-learn version accepts it.
param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
              'splitter': ['best', 'random'],
              'max_features': [sys.maxsize, 1.0, 'sqrt', 'log2', None]
              }

* GridSearchCV
  - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
  - https://vitalflux.com/grid-search-explained-python-sklearn-examples/

In [None]:
# Exhaustive search over every combination in param_grid, scored with 10-fold
# cross-validation. refit=True retrains the best configuration on all the data
# passed to fit(), which is what makes best_estimator_ available afterwards.
g_search = GridSearchCV(
    estimator=DT,
    param_grid=param_grid,
    cv=10,
    refit=True,
    return_train_score=True,
)

In [None]:
# Run the grid search on the training partition only, then show the winning
# hyperparameter combination
g_search.fit(X_train, y_train)
print(g_search.best_params_)

{'criterion': 'entropy', 'max_features': None, 'splitter': 'random'}


In [None]:
# List the result fields recorded by the grid search (timings, per-split and
# aggregated train/test scores, ranks)
g_search.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_features', 'param_splitter', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'split5_train_score', 'split6_train_score', 'split7_train_score', 'split8_train_score', 'split9_train_score', 'mean_train_score', 'std_train_score'])

In [None]:
# g_results =  pd.DataFrame(g_search.cv_results_)

In [None]:
# # Obtém a média das acurácias (10 folds) referente ao conjunto teste
# g_results.loc[g_search.best_index_,'mean_test_score']

In [None]:
# Position of the best candidate within the cv_results_ arrays
print(g_search.best_index_)

19


In [None]:
# Mean cross-validated score of the best candidate
print(g_search.best_score_)

0.9521153846153847


In [None]:
# Sanity check: the highest mean test score across candidates should match best_score_
max(g_search.cv_results_['mean_test_score'])

0.9521153846153847

In [None]:
# Evaluate the test partition with the best hyperparameter set found.
# best_estimator_ is only available when the search was created with refit=True.
model = g_search.best_estimator_
model.score(X_test,y_test)

0.9415204678362573

- RandomizedSearchCV
  - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized search: samples n_iter=10 candidates from param_grid and scores
# each with 10-fold cross-validation. random_state fixes which candidates are
# sampled, so repeated runs evaluate the same configurations.
r_search = RandomizedSearchCV(estimator = DT, param_distributions = param_grid,
                        n_iter= 10, cv = 10, return_train_score=True, refit=True,
                        random_state=42)

In [None]:
# Run the randomized search on the training partition only, then show the
# winning hyperparameter combination
r_search.fit(X_train, y_train)
print(r_search.best_params_)

{'splitter': 'random', 'max_features': None, 'criterion': 'entropy'}


In [None]:
# Mean cross-validated score of the best sampled candidate
print(r_search.best_score_)

0.9520512820512821


In [None]:
# Sanity check: the highest mean test score across sampled candidates should match best_score_
max(r_search.cv_results_['mean_test_score'])

0.9520512820512821

In [None]:
# Evaluate the test partition with the best hyperparameter set found.
# best_estimator_ is only available when the search was created with refit=True.
model = r_search.best_estimator_
model.score(X_test,y_test)

0.9415204678362573