# 1. Importações

In [None]:
# Attach Google Drive to this Colab runtime so the CSV inputs and the results
# folder used further down are reachable under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import time
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

from sklearn import svm
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings("ignore")


In [None]:
# Folder on the mounted Drive that holds the 50-dimension train/test CSV splits.
path = '/content/drive/My Drive/[2020.1] APRENDIZADO DE MÁQUINA/TRABALHO/02. Arquivos/01. Dados/02. Dados para treino e teste 50dim/'

# Read a single row of x_train.csv just to learn its column names. Every column
# except the last one is loaded as a feature, and the same selection is applied
# to x_test.csv so both frames expose identical features.
# NOTE(review): the last column is deliberately excluded — confirm what it holds.
cols = pd.read_csv(path + 'x_train.csv', nrows=1).columns
feature_cols = cols[:-1]
x_train = pd.read_csv(path + 'x_train.csv', index_col=False, usecols=feature_cols)
x_test = pd.read_csv(path + 'x_test.csv', index_col=False, usecols=feature_cols)

# Targets: only the 'classification' column is read from each label file.
y_train = pd.read_csv(path + 'y_train.csv', index_col=False, usecols=['classification'])
y_test = pd.read_csv(path + 'y_test.csv', index_col=False, usecols=['classification'])

# Sanity checks replacing the former bare `len(...)` expression, whose value was
# never displayed (only the last expression of a cell is shown).
assert len(x_train) == len(y_train), 'x_train and y_train have different row counts'
assert len(x_test) == len(y_test), 'x_test and y_test have different row counts'

# Last expression of the cell: shown as the cell output.
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((1106, 50), (277, 50), (1106, 1), (277, 1))

# 4. XGBoostClassifier
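**Necessário habilitar a GPU para executar** (o modelo usa `tree_method="gpu_hist"`; no Colab: *Ambiente de execução → Alterar o tipo de ambiente de execução → GPU*).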


In [None]:
# Base XGBoost classifier handed to the grid search. It is not a default
# configuration: GPU histogram tree building, a fixed random seed and two
# evaluation metrics are set explicitly.
xgb_settings = {
    "tree_method": "gpu_hist",
    "random_state": 300,
    "eval_metric": ["error", "auc"],
}
model = XGBClassifier(**xgb_settings)

In [None]:
# Hyperparameter search space for the grid search: every combination of the
# values below is evaluated (2 * 3 * 3 * 3 * 4 * 3 * 3 = 1944 candidates).
param_grid = dict(
    learning_rate=[0.1, 0.01],
    colsample_bytree=[0.6, 0.8, 1.0],
    subsample=[0.6, 0.8, 1.0],
    max_depth=[2, 3, 4],
    n_estimators=[100, 200, 300, 400],
    reg_lambda=[1, 1.5, 2],
    gamma=[0, 0.1, 0.3],
)
# Metrics for model evaluation: ROC AUC via scikit-learn's built-in scorer name,
# and accuracy wrapped into a scorer object with make_scorer.
scoring = dict(
    AUC='roc_auc',
    Accuracy=make_scorer(accuracy_score),
)

In [None]:
# Exhaustive search over `param_grid` using all available cores (n_jobs=-1).
# The `scoring` dict defined above was previously never passed in, so AUC was
# not being computed. It is now evaluated for every candidate alongside accuracy.
# With more than one metric, `refit` must name the metric used to pick the best
# candidate; 'Accuracy' keeps model selection on accuracy, so `best_score_` and
# `best_params_` keep the same meaning as before.
gridXGB = GridSearchCV(
    model,
    param_grid,
    scoring=scoring,
    refit='Accuracy',
    verbose=2,
    n_jobs=-1,
)

In [None]:
# fit grid search
%time best_model = gridXGB.fit(x_train,y_train)

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 21.2min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 31.2min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 42.9min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 55.7min
[Parallel(n_jobs=-1)]: Done 3277 tasks      | elapsed: 71.3min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 87.4min
[Parallel(n_jobs=-1)]: Done 4897 tasks      | elapsed: 106.8min
[Parallel(n_jobs=-1)]: Done 5828 tasks      | elapsed: 126.7min
[Parallel(n_jobs=-1)]: Done 6841 tasks      | elapsed: 149.4min
[Parallel(n_jobs=-1)]: Done 7934 tasks      | elapsed: 173.1min
[Parallel(n_jobs=-1)]: Done 9109 tasks    

CPU times: user 47.3 s, sys: 4.57 s, total: 51.9 s
Wall time: 3h 32min 52s


In [None]:
# Report the best cross-validated score and the hyperparameters that achieved it.
best_score_line = f'Best score: {best_model.best_score_}'
best_params_line = f'Best model: {best_model.best_params_}'
print(best_score_line, best_params_line, sep='\n')

Best score: 0.8011047246341363
Best model: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'reg_lambda': 1, 'subsample': 0.6}


In [None]:
# Predict on both splits with the fitted search object.
pred_train = best_model.predict(x_train)
pred_test = best_model.predict(x_test)

# Compute every metric first, then print them in a fixed order.
train_accuracy = accuracy_score(y_train, pred_train)
test_accuracy = accuracy_score(y_test, pred_test)
test_confusion = confusion_matrix(y_test, pred_test)
# output_dict=True returns the report as a nested dict; it is reused when the
# results are exported.
report = classification_report(y_test, pred_test, output_dict=True)

print('Train Accuracy: ', train_accuracy)
print('Test Accuraccy: ', test_accuracy)
print('\nConfusion Matrix:')
print(test_confusion)
print('\nClassification Report:')
print(report)

Train Accuracy:  0.9927667269439421
Test Accuraccy:  0.8194945848375451

Confusion Matrix:
[[106  24]
 [ 26 121]]

Classification Report:
{'0': {'precision': 0.803030303030303, 'recall': 0.8153846153846154, 'f1-score': 0.8091603053435115, 'support': 130}, '1': {'precision': 0.8344827586206897, 'recall': 0.8231292517006803, 'f1-score': 0.8287671232876712, 'support': 147}, 'accuracy': 0.8194945848375451, 'macro avg': {'precision': 0.8187565308254963, 'recall': 0.8192569335426478, 'f1-score': 0.8189637143155913, 'support': 277}, 'weighted avg': {'precision': 0.819721678379714, 'recall': 0.8194945848375451, 'f1-score': 0.8195653675738056, 'support': 277}}


In [None]:
# Gather the weighted-average test metrics, the raw predictions, the true labels
# and the winning hyperparameters into a single-row frame, then persist it as
# JSON in the results folder on Drive.
weighted_avg = report['weighted avg']
resultados = pd.DataFrame({
    'precision': weighted_avg['precision'],
    'recall': weighted_avg['recall'],
    'f1score': weighted_avg['f1-score'],
    # Arrays are wrapped in a list so each one occupies a single cell.
    'y_predict': [pred_test],
    'y_real': [y_test['classification'].values],
    # Hyperparameters are stored as the string form of the dict.
    'hiperparams': str(best_model.best_params_),
})

output_file = '/content/drive/My Drive/[2020.1] APRENDIZADO DE MÁQUINA/TRABALHO/05. Resultados/5.1. Resultados dos modelos/10. XGBooster/xgbooster_50dim.json'
resultados.to_json(output_file)