# 1. Importações

In [1]:
from google.colab import drive

# Mount the user's Google Drive inside the Colab VM so the dataset and the
# results folder used by later cells are reachable as regular file paths.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import time
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

from sklearn import svm
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings("ignore")


In [4]:
# Load the pre-split train/test features and labels from Google Drive.
path = '/content/drive/My Drive/[2020.1] APRENDIZADO DE MÁQUINA/TRABALHO/02. Arquivos/01. Dados/03. Dados para treino e teste 100dim/'

# Read just the first data row to obtain the column names, then load every
# column except the last one as features.
# NOTE(review): presumably the last column of x_train.csv is not a feature — confirm.
cols = pd.read_csv(path + 'x_train.csv', nrows=1).columns
x_train = pd.read_csv(path + 'x_train.csv', index_col=False, usecols=cols[:-1])
x_test = pd.read_csv(path + 'x_test.csv', index_col=False, usecols=cols[:-1])

# Labels: only the 'classification' column is kept from each label file.
y_train = pd.read_csv(path + 'y_train.csv', index_col=False, usecols=['classification'])
y_test = pd.read_csv(path + 'y_test.csv', index_col=False, usecols=['classification'])

# Fail fast if features and labels are misaligned instead of discovering it
# hours into the grid search.
assert len(x_train) == len(y_train), 'x_train and y_train have different row counts'
assert len(x_test) == len(y_test), 'x_test and y_test have different row counts'

# Last expression of the cell: displayed as the cell output.
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((1106, 100), (277, 100), (1106, 1), (277, 1))

# 4. XGBClassifier (XGBoost)
**É necessário habilitar a GPU no ambiente de execução para rodar esta seção, pois o modelo usa `tree_method="gpu_hist"`.**


In [5]:
# Base XGBoost classifier whose remaining hyper-parameters are tuned by the
# grid search below. Only three settings are fixed here: the GPU histogram
# tree builder, the random seed, and the evaluation metrics.
# NOTE(review): "gpu_hist" needs a CUDA-enabled runtime; newer XGBoost releases
# may spell this differently — confirm against the installed version.
xgb_settings = {
    "tree_method": "gpu_hist",
    "random_state": 300,
    "eval_metric": ["error", "auc"],
}
model = XGBClassifier(**xgb_settings)

In [6]:
# Hyper-parameter search space for the grid search.
# 2 * 3 * 3 * 3 * 4 * 3 * 3 = 1944 candidate combinations.
param_grid = dict([
    ("learning_rate", [0.1, 0.01]),
    ("colsample_bytree", [0.6, 0.8, 1.0]),
    ("subsample", [0.6, 0.8, 1.0]),
    ("max_depth", [2, 3, 4]),
    ("n_estimators", [100, 200, 300, 400]),
    ("reg_lambda", [1, 1.5, 2]),
    ("gamma", [0, 0.1, 0.3]),
])
# Named evaluation metrics: ROC AUC through scikit-learn's built-in scorer
# string, and accuracy wrapped into a scorer object with make_scorer.
accuracy_scorer = make_scorer(accuracy_score)
scoring = {'AUC': 'roc_auc', 'Accuracy': accuracy_scorer}

In [7]:
# Exhaustive search over param_grid using GridSearchCV's default
# cross-validation. The `scoring` dict defined above is now actually passed in,
# so both AUC and accuracy are recorded in cv_results_. With several metrics
# GridSearchCV needs to know which one selects the winner: refit='Accuracy'
# keeps the selection (best_score_ / best_params_) on mean CV accuracy, the
# same criterion the classifier's default scorer used before.
# NOTE(review): n_jobs=-1 runs the fits in parallel workers while the estimator
# uses tree_method="gpu_hist" — confirm that sharing one GPU is intended.
gridXGB = GridSearchCV(
    model,
    param_grid,
    scoring=scoring,
    refit='Accuracy',
    verbose=2,
    n_jobs=-1,
)

In [8]:
# fit grid search
%time best_model = gridXGB.fit(x_train,y_train)

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 29.8min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 43.9min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 60.2min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 78.2min
[Parallel(n_jobs=-1)]: Done 3277 tasks      | elapsed: 100.1min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 122.9min
[Parallel(n_jobs=-1)]: Done 4897 tasks      | elapsed: 150.1min
[Parallel(n_jobs=-1)]: Done 5828 tasks      | elapsed: 178.2min
[Parallel(n_jobs=-1)]: Done 6841 tasks      | elapsed: 210.2min
[Parallel(n_jobs=-1)]: Done 7934 tasks      | elapsed: 243.7min
[Parallel(n_jobs=-1)]: Done 9109 tasks  

CPU times: user 1min 11s, sys: 7.11 s, total: 1min 19s
Wall time: 4h 59min 31s


In [12]:
# Report the winning configuration: its mean cross-validated score and the
# hyper-parameter combination that produced it.
print(
    f'Best score: {best_model.best_score_}',
    f'Best model: {best_model.best_params_}',
    sep='\n',
)

Best score: 0.8137540255187312
Best model: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 400, 'reg_lambda': 1.5, 'subsample': 0.8}


In [13]:
# Predict on both splits with the fitted search object.
pred_train = best_model.predict(x_train)
pred_test = best_model.predict(x_test)

# Accuracy on the data the model was tuned on versus the held-out split.
train_accuracy = accuracy_score(y_train, pred_train)
test_accuracy = accuracy_score(y_test, pred_test)
print('Train Accuracy: ', train_accuracy)
print('Test Accuraccy: ', test_accuracy)

# Confusion matrix for the held-out split (true labels first, predictions second).
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, pred_test))

# Per-class and averaged precision/recall/F1 as a dict; `report` is read
# again when the results are exported.
report = classification_report(y_test, pred_test, output_dict=True)
print('\nClassification Report:')
print(report)

Train Accuracy:  0.9936708860759493
Test Accuraccy:  0.8339350180505415

Confusion Matrix:
[[109  21]
 [ 25 122]]

Classification Report:
{'0': {'precision': 0.8134328358208955, 'recall': 0.8384615384615385, 'f1-score': 0.8257575757575758, 'support': 130}, '1': {'precision': 0.8531468531468531, 'recall': 0.8299319727891157, 'f1-score': 0.8413793103448276, 'support': 147}, 'accuracy': 0.8339350180505415, 'macro avg': {'precision': 0.8332898444838743, 'recall': 0.8341967556253271, 'f1-score': 0.8335684430512017, 'support': 277}, 'weighted avg': {'precision': 0.8345085056653568, 'recall': 0.8339350180505415, 'f1-score': 0.8340478103580308, 'support': 277}}


In [14]:
# Gather the weighted-average test metrics, the raw predictions, the true
# labels and the selected hyper-parameters into a one-row DataFrame, then
# persist it as JSON on Google Drive.
weighted_avg = report['weighted avg']
output_path = '/content/drive/My Drive/[2020.1] APRENDIZADO DE MÁQUINA/TRABALHO/05. Resultados/5.1. Resultados dos modelos/10. XGBooster/xgbooster_100dim.json'

resultados = pd.DataFrame({
    'precision': weighted_avg['precision'],
    'recall': weighted_avg['recall'],
    'f1score': weighted_avg['f1-score'],
    # Wrapped in a list so each array is stored as a single cell.
    'y_predict': [pred_test],
    'y_real': [y_test['classification'].values],
    'hiperparams': str(best_model.best_params_)
})
resultados.to_json(output_path)