# Modeling - XGBoost

In [6]:
import sys
import os
sys.path.append('../../src')


import numpy as np
import pickle
from xgboost import XGBClassifier
from modeling import train_model, save_model
from tuning import find_best_model
from utils import  generate_combinations, generate_dataset_split, save_combination
from itertools import combinations


  from .autonotebook import tqdm as notebook_tqdm


## Parameters for tuning

In [7]:
# Candidate values for each XGBoost hyper-parameter, keyed by the
# XGBClassifier argument name.
# NOTE(review): no visible cell reads this dict; find_best_model presumably
# carries its own search space -- confirm before relying on these grids.
parameters = dict(
    # Numeric grids built with np.arange (start, stop, step).
    learning_rate=np.arange(0.001, 0.1, 0.005),
    max_depth=np.arange(2, 8),
    n_estimators=np.arange(50, 150, 10),
    subsample=np.arange(0.3, 0.9, 0.1),
    colsample_bytree=np.arange(0.6, 1.0, 0.05),
    gamma=np.arange(0.1, 5, 0.1),
    early_stopping_rounds=np.arange(5, 15, 5),
    # Fixed, single-choice settings.
    # NOTE(review): a 3-class softprob objective alongside the "binary"
    # dataset label used in the training cells -- confirm this is intended.
    objective=['multi:softprob'],
    num_class=[3],
    eval_metric=['auc'],
)

## Modeling

In [8]:


# Single-letter site codes and the city each one stands for, kept as two
# parallel lists in the same order (A..F).
polos_sigla = ['A', 'B', 'C', 'D', 'E', 'F']
cities = [
    "Porto Alegre",
    "Marabá",
    "Brasília",
    "Belo Horizonte",
    "Juazeiro do Norte",
    "Recife",
]

# Code -> city lookup, built by pairing the two lists positionally.
cities_siglas = dict(zip(polos_sigla, cities))

# City names resolved from the codes; a separate list with the same content
# and order as `cities`.
polos = list(map(cities_siglas.__getitem__, polos_sigla))

In [9]:
# One entry per site: leave that site out and hand the remaining sites to
# generate_combinations with the arguments (3, 2).
# NOTE(review): 3 and 2 are presumably the train/validation group sizes --
# confirm against utils.generate_combinations.
splits = [
    generate_combinations(polos[:held_out] + polos[held_out + 1:], 3, 2)
    for held_out in range(len(polos_sigla))
]

In [None]:

# For each held-out city (index i): record its combinations, tune and fit an
# XGBoost classifier, then pickle the fitted model(s) under
# ../../data/models/binary/split3_2/<city>/xgb_<j>.sav.
for i in range(len(splits)):
    models = []
    save_combination(f'split3_2/{cities[i]}', "binary", splits[i])
    for idx, combination in enumerate(splits[i]):
        print(f"Combinação {idx}: {combination} training")
        # combination[0] feeds the training set, combination[1] the validation set.
        X_train, y_train = generate_dataset_split(combination[0], "binary")
        X_val, y_val = generate_dataset_split(combination[1], "binary")
        best_params = find_best_model("xgb", X_train, y_train, X_val, y_val, trials=25)
        # `device` must be a string such as "cuda"; the previous list value
        # (["cuda"]) is not a valid XGBoost device specification.
        model = XGBClassifier(
            **best_params.params, random_state=42, device="cuda"
        ).fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        models.append(model)
        # NOTE(review): this break stops after the first combination, so only
        # one model per city is trained and saved -- confirm it is intentional
        # and not a debugging leftover.
        break

    # Make sure the target directory exists before writing into it.
    model_dir = f"../../data/models/binary/split3_2/{cities[i]}"
    os.makedirs(model_dir, exist_ok=True)
    for j, fitted in enumerate(models):
        # Context manager closes the file handle even if pickling fails.
        with open(f"{model_dir}/xgb_{j}.sav", 'wb') as model_file:
            pickle.dump(fitted, model_file)
        
    

Combinação 0: [['Marabá', 'Brasília', 'Belo Horizonte', 'Juazeiro do Norte'], ['Recife']] training


[I 2024-03-12 17:32:27,876] A new study created in memory with name: no-name-d5fc43a4-179a-4d73-b52b-e33763f288d9
  0%|          | 0/30 [00:00<?, ?it/s]

: 

In [11]:
# Rebuild `splits` for the second experiment: one entry per site, leaving
# that site out and calling generate_combinations with the arguments (4, 1).
# NOTE(review): 4 and 1 are presumably the train/validation group sizes --
# confirm against utils.generate_combinations.
splits = [
    generate_combinations(polos[:held_out] + polos[held_out + 1:], 4, 1)
    for held_out in range(len(polos_sigla))
]

In [None]:

# For each held-out city (index i): record its combinations, tune and fit an
# XGBoost classifier, then pickle the fitted model(s) under
# ../../data/models/binary/split4_1/<city>/xgb_<j>.sav.
for i in range(len(splits)):
    models = []
    save_combination(f'split4_1/{cities[i]}', "binary", splits[i])
    for idx, combination in enumerate(splits[i]):
        print(f"Combinação {idx}: {combination} training")
        # combination[0] feeds the training set, combination[1] the validation set.
        X_train, y_train = generate_dataset_split(combination[0], "binary")
        X_val, y_val = generate_dataset_split(combination[1], "binary")
        best_params = find_best_model("xgb", X_train, y_train, X_val, y_val, trials=30)
        # NOTE(review): no `device` is passed here, so XGBoost's default
        # device is used -- confirm whether GPU training is wanted.
        model = XGBClassifier(
            **best_params.params, random_state=42
        ).fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        models.append(model)
        # NOTE(review): this break stops after the first combination, so only
        # one model per city is trained and saved -- confirm it is intentional
        # and not a debugging leftover.
        break

    # Make sure the target directory exists before writing into it.
    model_dir = f"../../data/models/binary/split4_1/{cities[i]}"
    os.makedirs(model_dir, exist_ok=True)
    for j, fitted in enumerate(models):
        # Context manager closes the file handle even if pickling fails.
        with open(f"{model_dir}/xgb_{j}.sav", 'wb') as model_file:
            pickle.dump(fitted, model_file)