# Modeling - XGBoost

In [1]:
import sys
import os
sys.path.append('../../src')


import numpy as np
import pickle
from xgboost import XGBClassifier
from modeling import train_model, save_model
from tuning import random_search_tuning
from utils import  generate_combinations, generate_dataset_split, save_combination
from itertools import combinations


## Parameters for tuning

In [2]:
parameters = {
    'objective': ['binary:logistic'],
    'eval_metric': ['aucpr'],
    'scale_pos_weight': np.arange(0, 30, 5),
    'learning_rate': np.arange(0.001, 0.1, 0.005),
    'max_depth': np.arange(2, 8),
    'n_estimators': np.arange(50, 150, 10),
    'subsample': np.arange(0.3, 0.9, 0.1),
    'colsample_bytree': np.arange(0.6, 1.0, 0.05),
    'gamma': np.arange(0.1, 5, 0.1),
    'early_stopping_rounds': np.arange(5, 15, 5)
}

## Modeling

In [3]:


cities_siglas = {
    "A": "Porto Alegre",
    "B": "Marabá",
    "C": "Brasília",
    "D": "Belo Horizonte",
    "E": "Juazeiro do Norte",
    "F": "Recife"
}

cities = ["Porto Alegre", "Marabá", "Brasília", "Belo Horizonte", "Juazeiro do Norte", "Recife"]

polos_sigla = ['A', 'B', 'C', 'D', 'E', 'F']
polos = [cities_siglas[s] for s in polos_sigla]

In [4]:
splits = []
for i in range(0, len(polos_sigla)):
    splits.append(generate_combinations(polos[:i] + polos[i+1:], 3, 2))

In [12]:

for i in range(0, len(splits)):
    models = []
    save_combination(f'split3_2/{cities[i]}', "multiclass", splits[i])
    for idx, combination in enumerate(splits[i], start=0):
        print(f"Combinação {idx}: {combination} training")
        X_train, y_train = generate_dataset_split(combination[0], "multiclass")
        X_val, y_val = generate_dataset_split(combination[1], "multiclass")
        clf = XGBClassifier()
        model = random_search_tuning(clf, parameters)
        model = train_model(model, X_train, y_train, [(X_val, y_val)])
        print(model.best_estimator_)
        models.append(model)
    for j in range(0, len(models)):
        pickle.dump(models[j], open(f"../../data/models/multiclass/split3_2/{cities[i]}/xgb_{j}.sav", 'wb'))
        
    

Combinação 0: [['Marabá', 'Brasília', 'Belo Horizonte'], ['Juazeiro do Norte', 'Recife']] training


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [2]:
generate_dataset_split(["Recife"], "multiclass")

(Empty DataFrame
 Columns: [Declividade, Curvatura, APP30m, UCIntegral, AltaTensao, Vias50m, Dutovias, IndiceForma, DomSIden, DomSIlu, DomSPav, DomSCal, DomSFio, DomSBue, DomSArb, DomSEne, DomSAgua, DomSMed, DomSEsg, DomSRedeEsg, DomCLixAc, DomSColLix, DomSColLixDir, DomApto, DomCasa, DomVila, DomImpr, DomAdeq, DomAdeqSN, DomAdeqCN, DomSemiAdeq, DomInadeq, DomPosseOutro, DomSBan, DomNBanDom, DomNBanHab, AguaRede, AguaNascente, AguaCisterna, AguaOutra, EsgotoRede, EsgotoSeptica, EsgotoRudimentar, EsgotoVala, EsgotoRio, EsgotoOutro, LixoLimpeza, LixoQueimado, LixoAterrado, LixoJogado, LixoRio, LixoOutro, LixoCacamba, Ren0SM, RenMeioSM, Ren1a2SM, Ren3SM, RenPopDependente, RenPopAtiva, RenResp3SM, RenRespMedia, NDenDom, NDenPop, NMoradores, NPes10Alf, NRespAlf, NRespFem, NRespIdade, NResp30, NResp30NAlf]
 Index: []
 
 [0 rows x 70 columns],
 Series([], Name: y, dtype: float64),
 'c:\\Users\\Acer\\OneDrive\\Documentos\\GitHub\\xai-nui-classification\\data\\model_input\\multiclass\\X_Recife.