## IMPORTS


In [None]:
import numpy as np
import pandas as pd

from tabulate import tabulate
from warnings import simplefilter

from sklearn import model_selection, manifold, datasets
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, mean_absolute_error

from sklearn.svm import SVC, SVR
from sklearn.manifold import TSNE
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network  import MLPClassifier, MLPRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor

# Install a global warnings filter that ignores every FutureWarning.
simplefilter(action='ignore', category=FutureWarning)

# Questão 1

In [None]:
# Number of cross-validation folds.
T = 5

# Breast Cancer dataset, loaded as a (features, target) pair.
X, y = load_breast_cancer(return_X_y=True)

In [None]:
# BaggingClassifier: grid search followed by cross-validated evaluation.
name='BaggingClassifier'

# Hyper-parameter grid.
# Float values of `max_samples` / `max_features` are fractions of the
# samples / features drawn for each base estimator, so they must lie in
# (0, 1]. Values above 1.0 make the fit raise
# "ValueError: max_samples must be in (0, n_samples]" and the candidate is
# scored as NaN, so only valid fractions are listed here.
parameters = [{'n_estimators': [5, 10, 50, 100],
               'max_samples': [0.6, 0.8, 1.0],
               'max_features': [0.6, 0.8, 1.0],
               'bootstrap': [True, False],
               'bootstrap_features': [True, False]}]

# Hold-out split: 80% -> (X_train, y_train), 20% -> (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

dt =  DecisionTreeClassifier(criterion='entropy')
# The base estimator is passed positionally: its keyword name differs between
# scikit-learn releases (`base_estimator` in older ones, `estimator` in newer
# ones), while the first positional slot is the same in both.
model = BaggingClassifier(dt, random_state=42)

# Exhaustive search over the grid, scored by accuracy with T-fold CV.
# NOTE(review): the search is fitted on the 20% split (X_val) and the selected
# estimator is then cross-validated on the 80% split (X_train) - confirm this
# is the intended protocol.
gs = GridSearchCV(model, parameters, scoring = 'accuracy', cv=T)
gs.fit(X_val, y_val)

# Full table of per-candidate results, then the winning combination.
df=gs.cv_results_
print(tabulate(df, headers='keys', tablefmt='psql'))
print('Best params:', gs.best_params_)

# Best estimator found by the search, cross-validated on the 80% split.
clf=gs.best_estimator_
result = model_selection.cross_val_score(clf, X_train, y_train, cv=T)

# Mean score and standard deviation across the T folds.
print("\nCross Validation Results %d folds:" % T)
print(f"{name} Mean Accuracy: %.5f" % result.mean())
print("Mean Std: %.5f" % result.std())

# Out-of-fold class prediction for every sample of the full dataset.
y_pred = model_selection.cross_val_predict(clf, X, y, cv=T)

# Out-of-fold class probabilities for every sample of the full dataset.
predicted_proba=model_selection.cross_val_predict(clf, X, y, cv=T, method='predict_proba')

# Weighted precision of the out-of-fold predictions.
precision=precision_score(y, y_pred, average='weighted')
print("Precision = %.3f " % precision)

# Weighted recall of the out-of-fold predictions.
recall=recall_score(y, y_pred, average='weighted')
print("Recall = %.3f " % recall)

# Weighted F1 of the out-of-fold predictions.
f1=f1_score(y, y_pred, average='weighted')
print("F1 = %.3f " % f1)

# Confusion matrix of the out-of-fold predictions.
print("\nMatriz de Confusão:")
matrix = confusion_matrix(y, y_pred)
print(matrix)

1280 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
800 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 269, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 328, in _fit
    raise ValueError("max_samples must be in (0, n_samples]")
ValueError: max_samples must be in (0, n_samples]

-------------------------------------

+-----------------+----------------+-------------------+------------------+-------------------+----------------------------+----------------------+---------------------+----------------------+-----------------------------------------------------------------------------------------------------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+------------------+-------------------+
|   mean_fit_time |   std_fit_time |   mean_score_time |   std_score_time | param_bootstrap   | param_bootstrap_features   |   param_max_features |   param_max_samples |   param_n_estimators | params                                                                                                          |   split0_test_score |   split1_test_score |   split2_test_score |   split3_test_score |   split4_test_score |   mean_test_score |   std_test_score |   rank_test_score |
|-----------------+----------------+------------

In [None]:
# RandomForestClassifier: grid search followed by cross-validated evaluation.
name = "RandomForestClassifier"

# Hold-out split: 80% -> (X_train, y_train), 20% -> (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Candidate hyper-parameter values explored by the grid search.
parameters = [
    {
        'n_estimators': [5, 10, 50],
        'criterion': ['gini', 'entropy'],
        'max_depth': [1, 3, 5],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [5, 10, 15],
        'max_features': ['sqrt', 'log2'],
        'class_weight': ['balanced', 'balanced_subsample'],
    }
]

model = RandomForestClassifier(random_state=42)

# Exhaustive search scored by accuracy with T-fold CV, fitted on the 20% split.
gs = GridSearchCV(model, parameters, scoring='accuracy', cv=T)
gs.fit(X_val, y_val)

# Per-candidate results table, then the winning combination.
df = gs.cv_results_
print(tabulate(df, headers='keys', tablefmt='psql'))
print('Best params:', gs.best_params_)

# Cross-validate the selected estimator on the 80% split.
clf = gs.best_estimator_
result = model_selection.cross_val_score(clf, X_train, y_train, cv=T)

# Mean score and standard deviation across the T folds.
print("\nCross Validation Results %d folds:" % T)
print(f"{name} Mean Accuracy: %.5f" % result.mean())
print("Mean Std: %.5f" % result.std())

# Out-of-fold class predictions and class probabilities over the full dataset.
y_pred = model_selection.cross_val_predict(clf, X, y, cv=T)
predicted_proba = model_selection.cross_val_predict(
    clf, X, y, cv=T, method='predict_proba'
)

# Weighted precision / recall / F1 of the out-of-fold predictions.
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
f1 = f1_score(y, y_pred, average='weighted')
print("Precision = %.3f " % precision)
print("Recall = %.3f " % recall)
print("F1 = %.3f " % f1)

# Confusion matrix of the out-of-fold predictions.
matrix = confusion_matrix(y, y_pred)
print("\nMatriz de Confusão:")
print(matrix)

+-----------------+----------------+-------------------+------------------+----------------------+-------------------+-------------------+----------------------+--------------------------+---------------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+------------------+-------------------+
|   mean_fit_time |   std_fit_time |   mean_score_time |   std_score_time | param_class_weight   | param_criterion   |   param_max_depth | param_max_features   |   param_min_samples_leaf |   param_min_samples_split |   param_n_estimators | params                                                                                                                                                                      |   s

In [None]:
# MLPClassifier: grid search followed by cross-validated evaluation.
name = "MLPClassifier"

# Hyper-parameter grid.
# The valid learning-rate schedules are 'constant', 'invscaling' and
# 'adaptive'; a misspelled value is rejected during hyper-parameter
# validation and every candidate using it is scored as NaN.
parameters = [{'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
               'activation': ['identity','logistic', 'tanh', 'relu'], 
               'solver': ['sgd', 'adam'],
               'alpha': [0.0001, 0.05],
               'learning_rate': ['constant', 'adaptive']}]

model = MLPClassifier(random_state=42)

# Hold-out split: 80% -> (X_train, y_train), 20% -> (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

# Exhaustive search over the grid, scored by accuracy with T-fold CV.
# NOTE(review): the search is fitted on the 20% split (X_val) and the selected
# estimator is then cross-validated on the 80% split (X_train) - confirm this
# is the intended protocol.
gs = GridSearchCV(model, parameters, scoring ='accuracy', cv=T)
gs.fit(X_val, y_val)

# Full table of per-candidate results, then the winning combination.
df=gs.cv_results_
print(tabulate(df, headers='keys', tablefmt='psql'))
print('Best params:', gs.best_params_)

# Best estimator found by the search, cross-validated on the 80% split.
clf=gs.best_estimator_
result = model_selection.cross_val_score(clf, X_train, y_train, cv=T)

# Mean score and standard deviation across the T folds.
print("\nCross Validation Results %d folds:" % T)
print(f"{name} Mean Accuracy: %.5f" % result.mean())
print("Mean Std: %.5f" % result.std())

# Out-of-fold class prediction for every sample of the full dataset.
y_pred = model_selection.cross_val_predict(clf, X, y, cv=T)

# Out-of-fold class probabilities for every sample of the full dataset.
predicted_proba=model_selection.cross_val_predict(clf, X, y, cv=T, method='predict_proba')

# Weighted precision of the out-of-fold predictions.
precision=precision_score(y, y_pred, average='weighted')
print("Precision = %.3f " % precision)

# Weighted recall of the out-of-fold predictions.
recall=recall_score(y, y_pred, average='weighted')
print("Recall = %.3f " % recall)

# Weighted F1 of the out-of-fold predictions.
f1=f1_score(y, y_pred, average='weighted')
print("F1 = %.3f " % f1)

# Confusion matrix of the out-of-fold predictions.
print("\nMatriz de Confusão:")
matrix = confusion_matrix(y, y_pred)
print(matrix)

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
240 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/

+-----------------+----------------+-------------------+------------------+--------------------+---------------+----------------------------+-----------------------+----------------+-----------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+------------------+-------------------+
|   mean_fit_time |   std_fit_time |   mean_score_time |   std_score_time | param_activation   |   param_alpha | param_hidden_layer_sizes   | param_learning_rate   | param_solver   | params                                                                                                                            |   split0_test_score |   split1_test_score |   split2_test_score |   split3_test_score |   split4_test_score |   mean_test_score |   std_test_score |   rank_test_score |
|-----------------+-------------




Cross Validation Results 5 folds:
MLPClassifier Mean Accuracy: 0.93626
Mean Std: 0.04135
Precision = 0.925 
Recall = 0.924 
F1 = 0.924 

Matriz de Confusão:
[[180  32]
 [ 11 346]]


In [None]:
# SVC: grid search followed by cross-validated evaluation.
name = 'SVM'

# Values shared by the kernel-specific sub-grids below.
c_values = [0.1, 0.5, 1, 10, 100, 500, 1000]
gamma_values = [0.1, 0.001, 0.0001, 0.00001]

# One sub-grid per kernel family, so each kernel is only combined with the
# parameters listed for it.
parameters = [
    {
        'kernel': ['poly'],
        'C': c_values,
        'degree': [1, 2, 3, 4, 5],
        'gamma': gamma_values,
    },
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': c_values,
        'gamma': gamma_values,
    },
    {
        'kernel': ['linear'],
        'C': c_values,
    },
]

# Hold-out split: 80% -> (X_train, y_train), 20% -> (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# probability=True is needed for the predict_proba call further down.
model = SVC(gamma='scale', probability=True, random_state=42)

# Exhaustive search scored by accuracy with T-fold CV, fitted on the 20% split.
gs = GridSearchCV(model, parameters, scoring='accuracy', cv=T)
gs.fit(X_val, y_val)

# Per-candidate results table, then the winning combination.
df = gs.cv_results_
print(tabulate(df, headers='keys', tablefmt='psql'))
print('Best params:', gs.best_params_)

# Cross-validate the selected estimator on the 80% split.
clf = gs.best_estimator_
result = model_selection.cross_val_score(clf, X_train, y_train, cv=T)

# Mean score and standard deviation across the T folds.
print("\nCross Validation Results %d folds:" % T)
print(f"{name} Mean Accuracy: %.5f" % result.mean())
print("Mean Std: %.5f" % result.std())

# Out-of-fold class predictions and class probabilities over the full dataset.
y_pred = model_selection.cross_val_predict(clf, X, y, cv=T)
predicted_proba = model_selection.cross_val_predict(
    clf, X, y, cv=T, method='predict_proba'
)

# Weighted precision / recall / F1 of the out-of-fold predictions.
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
f1 = f1_score(y, y_pred, average='weighted')
print("Precision = %.3f " % precision)
print("Recall = %.3f " % recall)
print("F1 = %.3f " % f1)

# Confusion matrix of the out-of-fold predictions.
matrix = confusion_matrix(y, y_pred)
print("\nMatriz de Confusão:")
print(matrix)

  conv(string)
  return format(float(val), floatfmt)


+-----------------+----------------+-------------------+------------------+-----------+----------------+---------------+----------------+-------------------------------------------------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+------------------+-------------------+
|   mean_fit_time |   std_fit_time |   mean_score_time |   std_score_time |   param_C |   param_degree |   param_gamma | param_kernel   | params                                                      |   split0_test_score |   split1_test_score |   split2_test_score |   split3_test_score |   split4_test_score |   mean_test_score |   std_test_score |   rank_test_score |
|-----------------+----------------+-------------------+------------------+-----------+----------------+---------------+----------------+-------------------------------------------------------------+---------------------+---------------------+-------------------

# Questão 2

In [None]:
# Number of cross-validation folds.
T = 5

# Diabetes dataset, loaded as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)

In [None]:
# BaggingRegressor: grid search followed by cross-validated evaluation.
name = 'BaggingRegressor'

# Hyper-parameter grid.
# Float values of `max_samples` / `max_features` are fractions of the
# samples / features drawn for each base estimator, so they must lie in
# (0, 1]. Values above 1.0 make the fit raise
# "ValueError: max_samples must be in (0, n_samples]" and the candidate is
# scored as NaN, so only valid fractions are listed here.
parameters = [{'n_estimators': [5, 10, 50, 100],
               'max_samples': [0.6, 0.8, 1.0],
               'max_features': [0.6, 0.8, 1.0],
               'bootstrap': [True, False],
               'bootstrap_features': [True, False]}]

# Hold-out split: 80% -> (X_train, y_train), 20% -> (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

dtr =  DecisionTreeRegressor(criterion='absolute_error')
# The base estimator is passed positionally: its keyword name differs between
# scikit-learn releases (`base_estimator` in older ones, `estimator` in newer
# ones), while the first positional slot is the same in both.
model = BaggingRegressor(dtr, random_state=42)

# Exhaustive search over the grid, scored by R2 with T-fold CV.
# NOTE(review): the search is fitted on the 20% split (X_val) and the selected
# estimator is then cross-validated on the 80% split (X_train) - confirm this
# is the intended protocol.
gs = GridSearchCV(model, parameters, scoring='r2', cv=T)
gs.fit(X_val, y_val)

# Full table of per-candidate results, then the winning combination.
df=gs.cv_results_
print(tabulate(df, headers='keys', tablefmt='psql'))
print('Best params:', gs.best_params_)

# Best estimator found by the search, cross-validated on the 80% split.
clf=gs.best_estimator_
result = model_selection.cross_val_score(clf, X_train, y_train, cv=T)

# Mean score and standard deviation across the T folds.
print("\nCross Validation Results %d folds:" % T)
print("R2 médio: %.5f" % result.mean())
print("Mean Std: %.5f" % result.std())

# Out-of-fold prediction for every sample of the full dataset.
y_pred = model_selection.cross_val_predict(clf, X, y, cv=T)

# Mean absolute error of the out-of-fold predictions.
mae=mean_absolute_error(y, y_pred)
print(f"{name} Mean Absolute Error (MAE): %.5f" % mae)

1280 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
800 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 269, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 328, in _fit
    raise ValueError("max_samples must be in (0, n_samples]")
ValueError: max_samples must be in (0, n_samples]

-------------------------------------

+-----------------+----------------+-------------------+------------------+-------------------+----------------------------+----------------------+---------------------+----------------------+-----------------------------------------------------------------------------------------------------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+------------------+-------------------+
|   mean_fit_time |   std_fit_time |   mean_score_time |   std_score_time | param_bootstrap   | param_bootstrap_features   |   param_max_features |   param_max_samples |   param_n_estimators | params                                                                                                          |   split0_test_score |   split1_test_score |   split2_test_score |   split3_test_score |   split4_test_score |   mean_test_score |   std_test_score |   rank_test_score |
|-----------------+----------------+------------

In [None]:
# RandomForestRegressor: grid search followed by cross-validated evaluation.
name = 'RandomForestRegressor'

# Hold-out split: 80% -> (X_train, y_train), 20% -> (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Candidate hyper-parameter values explored by the grid search.
parameters = [
    {
        'n_estimators': [5, 10, 50],
        'criterion': ['squared_error', 'absolute_error', 'poisson'],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [5, 10, 15],
        'max_features': ['sqrt', 'log2'],
    }
]

model = RandomForestRegressor(random_state=42)

# Exhaustive search scored by R2 with T-fold CV, fitted on the 20% split.
gs = GridSearchCV(model, parameters, scoring='r2', cv=T)
gs.fit(X_val, y_val)

# Per-candidate results table, then the winning combination.
df = gs.cv_results_
print(tabulate(df, headers='keys', tablefmt='psql'))
print('Best params:', gs.best_params_)

# Cross-validate the selected estimator on the 80% split.
clf = gs.best_estimator_
result = model_selection.cross_val_score(clf, X_train, y_train, cv=T)

# Mean score and standard deviation across the T folds.
print("\nCross Validation Results %d folds:" % T)
print("R2 médio: %.5f" % result.mean())
print("Mean Std: %.5f" % result.std())

# Out-of-fold predictions over the full dataset and their mean absolute error.
y_pred = model_selection.cross_val_predict(clf, X, y, cv=T)
mae = mean_absolute_error(y, y_pred)
print(f"{name} Mean Absolute Error (MAE): %.5f" % mae)

+-----------------+----------------+-------------------+------------------+-------------------+----------------------+--------------------------+---------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+------------------+-------------------+
|   mean_fit_time |   std_fit_time |   mean_score_time |   std_score_time | param_criterion   | param_max_features   |   param_min_samples_leaf |   param_min_samples_split |   param_n_estimators | params                                                                                                                       |   split0_test_score |   split1_test_score |   split2_test_score |   split3_test_score |   split4_test_score |   mean_test_score |   std_test_score |   rank_test_score |
|-------------

In [None]:
# MLPRegressor: grid search followed by cross-validated evaluation.
name = 'MLPRegressor'

# Hyper-parameter grid.
# The valid learning-rate schedules are 'constant', 'invscaling' and
# 'adaptive'; a misspelled value is rejected during hyper-parameter
# validation and every candidate using it is scored as NaN.
parameters = [{'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
               'activation': ['logistic', 'tanh', 'relu'], 
               'solver': ['sgd', 'adam'],
               'alpha': [0.0001, 0.05],
               'learning_rate': ['constant', 'adaptive']}]

# Hold-out split: 80% -> (X_train, y_train), 20% -> (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

model = MLPRegressor(random_state=42)

# Exhaustive search over the grid, scored by R2 with T-fold CV.
# NOTE(review): the search is fitted on the 20% split (X_val) and the selected
# estimator is then cross-validated on the 80% split (X_train) - confirm this
# is the intended protocol.
gs = GridSearchCV(model, parameters, scoring='r2', cv=T)
gs.fit(X_val, y_val)

# Full table of per-candidate results, then the winning combination.
df=gs.cv_results_
print(tabulate(df, headers='keys', tablefmt='psql'))
print('Best params:', gs.best_params_)

# Best estimator found by the search, cross-validated on the 80% split.
clf=gs.best_estimator_
result = model_selection.cross_val_score(clf, X_train, y_train, cv=T)

# Mean score and standard deviation across the T folds.
print("\nCross Validation Results %d folds:" % T)
print("R2 médio: %.5f" % result.mean())
print("Mean Std: %.5f" % result.std())

# Out-of-fold prediction for every sample of the full dataset.
y_pred = model_selection.cross_val_predict(clf, X, y, cv=T)

# Mean absolute error of the out-of-fold predictions.
mae=mean_absolute_error(y, y_pred)
print(f"{name} Mean Absolute Error (MAE): %.5f" % mae)

180 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 752, in fit
    return self._fit(X, y, incremental=False)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 384, in _fit
    self._validate_hyperparameters()
  File "/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 

+-----------------+----------------+-------------------+------------------+--------------------+---------------+----------------------------+-----------------------+----------------+-----------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+------------------+-------------------+
|   mean_fit_time |   std_fit_time |   mean_score_time |   std_score_time | param_activation   |   param_alpha | param_hidden_layer_sizes   | param_learning_rate   | param_solver   | params                                                                                                                            |   split0_test_score |   split1_test_score |   split2_test_score |   split3_test_score |   split4_test_score |   mean_test_score |   std_test_score |   rank_test_score |
|-----------------+-------------

In [None]:
# SVR: grid search followed by cross-validated evaluation.
name = 'SVR'

# Values shared by the kernel-specific sub-grids below.
c_values = [0.1, 0.5, 1, 10, 100, 500, 1000]
gamma_values = [0.1, 0.001, 0.0001, 0.00001]

# One sub-grid per kernel family, so each kernel is only combined with the
# parameters listed for it.
parameters = [
    {
        'kernel': ['poly'],
        'C': c_values,
        'degree': [1, 2, 3, 4, 5],
        'gamma': gamma_values,
    },
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': c_values,
        'gamma': gamma_values,
    },
    {
        'kernel': ['linear'],
        'C': c_values,
    },
]

# Hold-out split: 80% -> (X_train, y_train), 20% -> (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=42
)

model = SVR(gamma='scale')

# Exhaustive search scored by R2 with T-fold CV, fitted on the 20% split.
gs = GridSearchCV(model, parameters, scoring='r2', cv=T)
gs.fit(X_val, y_val)

# Per-candidate results table, then the winning combination.
df = gs.cv_results_
print(tabulate(df, headers='keys', tablefmt='psql'))
print('Best params:', gs.best_params_)

# Cross-validate the selected estimator on the 80% split.
clf = gs.best_estimator_
result = model_selection.cross_val_score(clf, X_train, y_train, cv=T)

# Mean score and standard deviation across the T folds.
print("\nCross Validation Results %d folds:" % T)
print("R2 médio: %.5f" % result.mean())
print("Mean Std: %.5f" % result.std())

# Out-of-fold predictions over the full dataset and their mean absolute error.
y_pred = model_selection.cross_val_predict(clf, X, y, cv=T)
mae = mean_absolute_error(y, y_pred)
print(f"{name} Mean Absolute Error (MAE): %.5f\n" % mae)

+-----------------+----------------+-------------------+------------------+-----------+----------------+---------------+----------------+-------------------------------------------------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+------------------+-------------------+
|   mean_fit_time |   std_fit_time |   mean_score_time |   std_score_time |   param_C |   param_degree |   param_gamma | param_kernel   | params                                                      |   split0_test_score |   split1_test_score |   split2_test_score |   split3_test_score |   split4_test_score |   mean_test_score |   std_test_score |   rank_test_score |
|-----------------+----------------+-------------------+------------------+-----------+----------------+---------------+----------------+-------------------------------------------------------------+---------------------+---------------------+-------------------

  conv(string)
  return format(float(val), floatfmt)
