# Modeling - XGBoost

In [1]:
import sys
import os
sys.path.append('../../src')


import numpy as np
import pickle
from xgboost import XGBClassifier
from modeling import train_model, save_model
from tuning import find_best_model
from utils import  generate_combinations, generate_dataset_split, save_combination
from itertools import combinations


## Parameters for tuning

In [2]:
# Candidate values per hyperparameter name.
# NOTE(review): no cell visible in this notebook reads `parameters`; the tuning
# further down goes through `find_best_model` -- confirm this grid is still needed.
# NOTE(review): np.arange with a float step may or may not include a value right
# at the stop bound because of rounding; check the generated arrays if the exact
# upper end matters.
parameters = dict(
    learning_rate=np.arange(0.001, 0.1, 0.005),
    max_depth=np.arange(2, 8),
    n_estimators=np.arange(50, 150, 10),
    subsample=np.arange(0.3, 0.9, 0.1),
    colsample_bytree=np.arange(0.6, 1.0, 0.05),
    gamma=np.arange(0.1, 5, 0.1),
    early_stopping_rounds=np.arange(5, 15, 5),
    # The remaining entries are single-candidate lists (fixed settings).
    objective=['multi:softprob'],
    num_class=[3],
    eval_metric=['auc'],
)

## Modeling

In [3]:


# Single-letter code -> city name. Insertion order matters: the lists below are
# derived from it and later cells index `cities`, `polos` and `splits` by position.
cities_siglas = dict([
    ("A", "Porto Alegre"),
    ("B", "Marabá"),
    ("C", "Brasília"),
    ("D", "Belo Horizonte"),
    ("E", "Juazeiro do Norte"),
    ("F", "Recife"),
])

# Codes in declaration order: ['A', 'B', 'C', 'D', 'E', 'F'].
polos_sigla = list(cities_siglas)

# City name for each code, in the same order as `polos_sigla`.
polos = [cities_siglas[code] for code in polos_sigla]

# Independent list holding the same city names in the same order as `polos`.
cities = list(polos)

In [4]:
# One entry per city, in `polos` order: entry i is built from every city except
# the one at position i.
# NOTE(review): 3 and 2 look like the sizes of the two city groups in each
# combination (the logged output below shows a 3-city group and a 2-city
# group) -- confirm against `generate_combinations`.
splits = [
    generate_combinations(polos[:held_out] + polos[held_out + 1:], 3, 2)
    for held_out in range(len(polos_sigla))
]

In [None]:
# Tune, train and pickle XGBoost models for the "split3_2" configuration.
# `splits[i]` holds the combinations for `cities[i]`; in each combination,
# element 0 is used for training and element 1 for validation.
task = "binary"
split_name = "split3_2"

for i, city_splits in enumerate(splits):
    city = cities[i]

    # Make sure the output directory exists *before* the long tuning run, so a
    # path problem fails immediately instead of after training has finished.
    model_dir = f"../../data/models/{task}/{split_name}/{city}"
    os.makedirs(model_dir, exist_ok=True)

    save_combination(f"{split_name}/{city}", task, city_splits)

    models = []
    for idx, combination in enumerate(city_splits):
        print(f"Combinação {idx}: {combination} training")
        X_train, y_train = generate_dataset_split(combination[0], task)
        X_val, y_val = generate_dataset_split(combination[1], task)

        # Hyperparameter search (25 trials); the result's `.params` is passed
        # straight to the classifier as keyword arguments.
        best_params = find_best_model("xgb", X_train, y_train, X_val, y_val, trials=25)
        model = XGBClassifier(**best_params.params, random_state=42).fit(
            X_train, y_train, eval_set=[(X_val, y_val)], verbose=False
        )
        models.append(model)
        # NOTE(review): this stops after the first combination, so only
        # xgb_0.sav is produced per city -- confirm this is intentional and not
        # a leftover from a quick test run.
        break

    # Use a context manager so every file handle is flushed and closed.
    for j, model in enumerate(models):
        with open(f"{model_dir}/xgb_{j}.sav", "wb") as model_file:
            pickle.dump(model, model_file)

Combinação 0: [['Marabá', 'Brasília', 'Belo Horizonte'], ['Juazeiro do Norte', 'Recife']] training


[I 2024-03-20 04:48:07,721] A new study created in memory with name: no-name-bf45a938-64f4-4b14-945c-68accf9a7bda


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2024-03-20 04:50:07,865] Trial 0 finished with value: 0.7362601999840908 and parameters: {'n_estimators': 115, 'learning_rate': 0.019573198528162947, 'max_depth': 7, 'subsample': 0.37451055976457087, 'colsample_bytree': 0.8263774722049284, 'min_child_weight': 5, 'gamma': 3.270081185327834}. Best is trial 0 with value: 0.7362601999840908.
[I 2024-03-20 04:52:29,584] Trial 1 finished with value: 0.694430764599113 and parameters: {'n_estimators': 143, 'learning_rate': 0.001807203712647238, 'max_depth': 7, 'subsample': 0.586874125581083, 'colsample_bytree': 0.7485561230968918, 'min_child_weight': 13, 'gamma': 1.102801187907732}. Best is trial 0 with value: 0.7362601999840908.
[I 2024-03-20 04:53:31,934] Trial 2 finished with value: 0.7324987525888625 and parameters: {'n_estimators': 101, 'learning_rate': 0.015507559466614338, 'max_depth': 2, 'subsample': 0.8593242853559373, 'colsample_bytree': 0.9928970422175601, 'min_child_weight': 20, 'gamma': 4.52334844647297}. Best is trial 0 with v

In [None]:
# One entry per city, in `polos` order: entry i is built from every city except
# the one at position i. This rebinds `splits`, replacing any earlier value.
# NOTE(review): 4 and 1 are presumably the sizes of the two city groups in each
# combination -- confirm against `generate_combinations`.
splits = [
    generate_combinations(polos[:held_out] + polos[held_out + 1:], 4, 1)
    for held_out in range(len(polos_sigla))
]

In [None]:

# Tune, train and pickle XGBoost models for the "split4_1" configuration.
# `splits[i]` holds the combinations for `cities[i]`; in each combination,
# element 0 is used for training and element 1 for validation.
task = "binary"
split_name = "split4_1"

for i, city_splits in enumerate(splits):
    city = cities[i]

    # Make sure the output directory exists *before* the long tuning run, so a
    # path problem fails immediately instead of after training has finished.
    model_dir = f"../../data/models/{task}/{split_name}/{city}"
    os.makedirs(model_dir, exist_ok=True)

    save_combination(f"{split_name}/{city}", task, city_splits)

    models = []
    for idx, combination in enumerate(city_splits):
        print(f"Combinação {idx}: {combination} training")
        X_train, y_train = generate_dataset_split(combination[0], task)
        X_val, y_val = generate_dataset_split(combination[1], task)

        # Hyperparameter search (25 trials); the result's `.params` is passed
        # straight to the classifier as keyword arguments.
        best_params = find_best_model("xgb", X_train, y_train, X_val, y_val, trials=25)
        model = XGBClassifier(**best_params.params, random_state=42).fit(
            X_train, y_train, eval_set=[(X_val, y_val)], verbose=False
        )
        models.append(model)
        # NOTE(review): this stops after the first combination, so only
        # xgb_0.sav is produced per city -- confirm this is intentional and not
        # a leftover from a quick test run.
        break

    # Use a context manager so every file handle is flushed and closed.
    for j, model in enumerate(models):
        with open(f"{model_dir}/xgb_{j}.sav", "wb") as model_file:
            pickle.dump(model, model_file)

In [None]:
# One entry per city, in `polos` order: entry i is built from every city except
# the one at position i. This rebinds `splits`, replacing any earlier value.
# NOTE(review): this cell repeats an earlier 3/2 split cell of this notebook;
# 3 and 2 look like the sizes of the two city groups -- confirm against
# `generate_combinations`.
splits = [
    generate_combinations(polos[:held_out] + polos[held_out + 1:], 3, 2)
    for held_out in range(len(polos_sigla))
]

In [None]:

# Tune, train and pickle XGBoost models for the "split3_2" configuration.
# `splits[i]` holds the combinations for `cities[i]`; in each combination,
# element 0 is used for training and element 1 for validation.
# NOTE(review): this cell writes to the same paths as the earlier split3_2
# training cell in this notebook and will overwrite its files -- confirm the
# repetition is intended.
task = "binary"
split_name = "split3_2"

for i, city_splits in enumerate(splits):
    city = cities[i]

    # Make sure the output directory exists *before* the long tuning run, so a
    # path problem fails immediately instead of after training has finished.
    model_dir = f"../../data/models/{task}/{split_name}/{city}"
    os.makedirs(model_dir, exist_ok=True)

    save_combination(f"{split_name}/{city}", task, city_splits)

    models = []
    for idx, combination in enumerate(city_splits):
        print(f"Combinação {idx}: {combination} training")
        X_train, y_train = generate_dataset_split(combination[0], task)
        X_val, y_val = generate_dataset_split(combination[1], task)

        # Hyperparameter search (25 trials); the result's `.params` is passed
        # straight to the classifier as keyword arguments.
        best_params = find_best_model("xgb", X_train, y_train, X_val, y_val, trials=25)
        model = XGBClassifier(**best_params.params, random_state=42).fit(
            X_train, y_train, eval_set=[(X_val, y_val)], verbose=False
        )
        models.append(model)
        # NOTE(review): this stops after the first combination, so only
        # xgb_0.sav is produced per city -- confirm this is intentional and not
        # a leftover from a quick test run.
        break

    # Use a context manager so every file handle is flushed and closed.
    for j, model in enumerate(models):
        with open(f"{model_dir}/xgb_{j}.sav", "wb") as model_file:
            pickle.dump(model, model_file)
        
    

In [None]:
# One entry per city, in `polos` order: entry i is built from every city except
# the one at position i. This rebinds `splits`, replacing any earlier value.
# NOTE(review): this cell repeats an earlier 4/1 split cell of this notebook;
# 4 and 1 are presumably the sizes of the two city groups -- confirm against
# `generate_combinations`.
splits = [
    generate_combinations(polos[:held_out] + polos[held_out + 1:], 4, 1)
    for held_out in range(len(polos_sigla))
]

In [None]:

# Tune, train and pickle XGBoost models for the "split4_1" configuration.
# `splits[i]` holds the combinations for `cities[i]`; in each combination,
# element 0 is used for training and element 1 for validation.
# NOTE(review): this cell writes to the same paths as the earlier split4_1
# training cell in this notebook and will overwrite its files -- confirm the
# repetition is intended.
task = "binary"
split_name = "split4_1"

for i, city_splits in enumerate(splits):
    city = cities[i]

    # Make sure the output directory exists *before* the long tuning run, so a
    # path problem fails immediately instead of after training has finished.
    model_dir = f"../../data/models/{task}/{split_name}/{city}"
    os.makedirs(model_dir, exist_ok=True)

    save_combination(f"{split_name}/{city}", task, city_splits)

    models = []
    for idx, combination in enumerate(city_splits):
        print(f"Combinação {idx}: {combination} training")
        X_train, y_train = generate_dataset_split(combination[0], task)
        X_val, y_val = generate_dataset_split(combination[1], task)

        # Hyperparameter search (25 trials); the result's `.params` is passed
        # straight to the classifier as keyword arguments.
        best_params = find_best_model("xgb", X_train, y_train, X_val, y_val, trials=25)
        model = XGBClassifier(**best_params.params, random_state=42).fit(
            X_train, y_train, eval_set=[(X_val, y_val)], verbose=False
        )
        models.append(model)
        # NOTE(review): this stops after the first combination, so only
        # xgb_0.sav is produced per city -- confirm this is intentional and not
        # a leftover from a quick test run.
        break

    # Use a context manager so every file handle is flushed and closed.
    for j, model in enumerate(models):
        with open(f"{model_dir}/xgb_{j}.sav", "wb") as model_file:
            pickle.dump(model, model_file)