# Modeling - XGBoost

In [1]:
import sys
import os
sys.path.append('../../src')


import numpy as np
import pickle
from xgboost import XGBClassifier
from modeling import train_model, save_model
from tuning import find_best_model
from utils import  generate_combinations, generate_dataset_split, save_combination
from itertools import combinations


## Parameters for tuning

In [2]:
# Candidate hyperparameter grid for the 3-class XGBoost classifier.
#
# Float-valued grids are built with np.linspace (explicit first value, last value
# and count) rather than np.arange with a fractional step: arange derives its
# length from ceil((stop - start) / step), so floating-point rounding can
# silently append the supposedly excluded stop value to the grid.
#
# NOTE(review): no visible cell reads this dict -- find_best_model is called with
# a model key and the data only -- confirm whether this grid is still needed.
parameters = {
    'learning_rate': np.linspace(0.001, 0.096, 20),    # 0.001, 0.006, ..., 0.096
    'max_depth': np.arange(2, 8),                      # 2, 3, ..., 7
    'n_estimators': np.arange(50, 150, 10),            # 50, 60, ..., 140
    'subsample': np.linspace(0.3, 0.8, 6),             # 0.3, 0.4, ..., 0.8
    'colsample_bytree': np.linspace(0.6, 0.95, 8),     # 0.60, 0.65, ..., 0.95
    'gamma': np.linspace(0.1, 4.9, 49),                # 0.1, 0.2, ..., 4.9
    'early_stopping_rounds': np.arange(5, 15, 5),      # 5, 10
    'objective': ['multi:softprob'],
    'num_class': [3],
    'eval_metric': ['auc']
}

## Modeling

In [3]:


# Pole letter -> city name; insertion order (A..F) is the order used everywhere below.
cities_siglas = dict(
    A="Porto Alegre",
    B="Marabá",
    C="Brasília",
    D="Belo Horizonte",
    E="Juazeiro do Norte",
    F="Recife",
)

# Pole letters, in order.
polos_sigla = list("ABCDEF")

# City names looked up from the pole letters (same order as `polos_sigla`).
polos = [cities_siglas[letter] for letter in polos_sigla]

# City names taken directly from the mapping; holds the same names in the same order as `polos`.
cities = list(cities_siglas.values())

In [4]:
# Leave-one-city-out: entry k of `splits` is built from every city except polos[k].
# The remaining five cities are handed to generate_combinations together with the
# sizes 3 and 2 (presumably the training / validation group sizes -- confirm in utils).
splits = [
    generate_combinations(polos[:held_out] + polos[held_out + 1:], 3, 2)
    for held_out in range(len(polos_sigla))
]

In [5]:
# For each held-out city: record its combinations, tune + fit a multiclass XGBoost
# model on a train/validation combination, then pickle the fitted model(s).
for i, city_splits in enumerate(splits):
    city = cities[i]  # splits[i] was built with polos[i] left out; `cities` lists the same names in the same order
    models = []
    save_combination(f'split3_2/{city}', "multiclass", city_splits)

    # NOTE(review): these are multiclass models but the path segment says "binary".
    # Kept unchanged because other code may load from this location -- confirm the
    # intended directory (risk: overwriting models saved by a binary notebook).
    model_dir = f"../../data/models/binary/split3_2/{city}"
    # Create the target folder before the expensive tuning run so that a missing
    # directory cannot cause a finished model to be lost at save time.
    os.makedirs(model_dir, exist_ok=True)

    for idx, combination in enumerate(city_splits):
        print(f"Combinação {idx}: {combination} training")
        # combination[0] feeds the training set, combination[1] the validation set.
        X_train, y_train = generate_dataset_split(combination[0], "multiclass")
        X_val, y_val = generate_dataset_split(combination[1], "multiclass")
        # `best_params.params` is unpacked as XGBClassifier keyword arguments
        # (presumably an Optuna result object -- confirm in tuning.py).
        best_params = find_best_model("multi_xgb", X_train, y_train, X_val, y_val, trials=25)
        model = XGBClassifier(**best_params.params, random_state=42).fit(
            X_train, y_train, eval_set=[(X_val, y_val)], verbose=False
        )
        models.append(model)
        # NOTE(review): unconditional break -- only the first combination of each
        # city is ever trained. Looks like a quick-run shortcut; behaviour is kept,
        # confirm whether all combinations should be trained before removing it.
        break

    # Persist every fitted model; the context manager guarantees the file handle
    # is flushed and closed (the bare open(...) passed to pickle.dump never was).
    for j, fitted_model in enumerate(models):
        with open(f"{model_dir}/xgb_{j}.sav", 'wb') as model_file:
            pickle.dump(fitted_model, model_file)

Combinação 0: [['Marabá', 'Brasília', 'Belo Horizonte'], ['Juazeiro do Norte', 'Recife']] training


[I 2024-03-20 04:52:30,845] A new study created in memory with name: no-name-a6c1058d-c9fc-4731-9fad-d0e82d73595a


  0%|          | 0/60 [00:00<?, ?it/s]

[W 2024-03-20 04:53:14,953] Trial 0 failed with parameters: {'n_estimators': 111, 'learning_rate': 0.07417125371053147, 'max_depth': 7, 'subsample': 0.437272370972978, 'colsample_bytree': 0.9539238820639565, 'min_child_weight': 2, 'gamma': 0.9166495644380596} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/work/caio.rhoden/test/xai-nui-classification/notebooks/Modeling/../../src/tuning.py", line 143, in <lambda>
    study.optimize(lambda trial: _objective_multi_xgb(trial, x_train, y_train, x_val, y_val), n_trials=trials, show_progress_bar=True)
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/work/caio.rhoden/test/xai-nui-classification/notebooks/Modeling/../../src/tuning.py", line 45, in _objective_multi_xgb
    xgb_clf

KeyboardInterrupt: 

In [None]:
# Leave-one-city-out again, this time passing the sizes 4 and 1 to
# generate_combinations (presumably 4 training cities / 1 validation city -- confirm in utils).
# Entry k of `splits` is built from every city except polos[k]; this rebinds `splits`.
splits = [
    generate_combinations(polos[:held_out] + polos[held_out + 1:], 4, 1)
    for held_out in range(len(polos_sigla))
]

In [None]:
# For each held-out city: record its combinations, tune + fit a multiclass XGBoost
# model on a train/validation combination, then pickle the fitted model(s).
for i, city_splits in enumerate(splits):
    city = cities[i]  # splits[i] was built with polos[i] left out; `cities` lists the same names in the same order
    models = []
    save_combination(f'split4_1/{city}', "multiclass", city_splits)

    # NOTE(review): these are multiclass models but the path segment says "binary".
    # Kept unchanged because other code may load from this location -- confirm the
    # intended directory (risk: overwriting models saved by a binary notebook).
    model_dir = f"../../data/models/binary/split4_1/{city}"
    # Create the target folder before the expensive tuning run so that a missing
    # directory cannot cause a finished model to be lost at save time.
    os.makedirs(model_dir, exist_ok=True)

    for idx, combination in enumerate(city_splits):
        print(f"Combinação {idx}: {combination} training")
        # combination[0] feeds the training set, combination[1] the validation set.
        X_train, y_train = generate_dataset_split(combination[0], "multiclass")
        X_val, y_val = generate_dataset_split(combination[1], "multiclass")
        # `best_params.params` is unpacked as XGBClassifier keyword arguments
        # (presumably an Optuna result object -- confirm in tuning.py).
        best_params = find_best_model("multi_xgb", X_train, y_train, X_val, y_val, trials=60)
        model = XGBClassifier(**best_params.params, random_state=42).fit(
            X_train, y_train, eval_set=[(X_val, y_val)], verbose=False
        )
        models.append(model)
        # NOTE(review): unconditional break -- only the first combination of each
        # city is ever trained. Looks like a quick-run shortcut; behaviour is kept,
        # confirm whether all combinations should be trained before removing it.
        break

    # Persist every fitted model; the context manager guarantees the file handle
    # is flushed and closed (the bare open(...) passed to pickle.dump never was).
    for j, fitted_model in enumerate(models):
        with open(f"{model_dir}/xgb_{j}.sav", 'wb') as model_file:
            pickle.dump(fitted_model, model_file)