# Modelo XGBoost

### Carregamento de bibliotecas

In [2]:
# OS e Manipulação de dados
import os
import sys
import pandas as pd
import numpy as np

# Add the path of the custom functions: the directory two levels above the
# current working directory is made importable.
currentdir = os.getcwd()
abs_path = os.path.abspath(os.path.join(currentdir, '../../'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if abs_path not in sys.path:
    sys.path.append(abs_path)

# Tratamento dos dados
from utils import tratamento_de_dados

# Modelagem
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import dagshub
import mlflow

# Initialise DagsHub for experiment tracking with MLflow
dagshub.init(
    repo_owner='aurelioguilherme',
    repo_name='AmbienteDeDesenvolvimento',
    mlflow=True,
)


### Leitura e tratamento dos dados

In [4]:
# Load the raw dataset
file_path = os.path.join("../../Data", 'teste_indicium_precificacao.csv')
df = pd.read_csv(file_path)

# Columns passed through the numeric scaler
features_numericas = [
    'numero_de_reviews',
    'reviews_por_mes',
    'calculado_host_listings_count',
]

# Columns passed through the one-hot encoder
features_categoricas = [
    'room_type',
    'bairro_group',
    'minimo_noites_categorico',
    'disponibilidade_365_categorico',
    'ultima_review_semestre',
    'valor_preenchido',
]

# Apply the project's data-treatment step; it returns X and y
# (presumably the feature frame and the target — verify in tratamento_de_dados).
# NOTE(review): this runs on the full dataset before the split — confirm it
# does not learn statistics from rows that end up in the test set.
cleaner_data = tratamento_de_dados.TransformData(df, features_categoricas, features_numericas)
X, y = cleaner_data.fit_transform()

# 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Scale the numeric features with RobustScaler
numeric_step = ('num', RobustScaler(), features_numericas)
# One-hot encode the categorical features.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category
# unseen during fit is encoded as all zeros, the same as the dropped first
# category — confirm this is acceptable.
categorical_step = ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), features_categoricas)

transformer = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training rows only, then reuse the fitted transformer on the test rows
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

### Treinamento do modelo base

In [5]:
# Baseline model: XGBoost regressor created with no explicit hyperparameters
model = xgb.XGBRegressor()
model.fit(X=X_train, y=y_train)

In [6]:
# Predict on the held-out test split
y_pred = model.predict(X_test)

In [7]:
# Evaluation metrics on the test split, laid out as two parallel columns
# (metric name / value) for tabular display. MSE is computed once and reused
# for the RMSE.
mse = mean_squared_error(y_test, y_pred)
metricas = {
    'Métrica': ['R²', 'MAE', 'MSE', 'RMSE'],
    'Valor': [
        r2_score(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        mse,
        np.sqrt(mse),
    ],
}

In [9]:
# Render the metrics dictionary as a table (one row per metric)
pd.DataFrame(metricas)

Unnamed: 0,Métrica,Valor
0,R²,-0.095115
1,MAE,72.319155
2,MSE,56355.345496
3,RMSE,237.392808


# MLFlow

In [17]:
# Select the MLflow experiment under which the following runs are logged
mlflow.set_experiment(experiment_name='XGBoost Regressor')

<Experiment: artifact_location='mlflow-artifacts:/9511dcd754984a20b33fa6719d5e4545', creation_time=1738047923644, experiment_id='7', last_update_time=1738047923644, lifecycle_stage='active', name='XGBoost Regressor', tags={}>

In [12]:
# Train the baseline model inside an MLflow run and record its setup,
# test-split metrics, and the fitted model/transformer artifacts.
with mlflow.start_run(run_name='Execução Modelo Base'):
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)

    # Describe the experiment setup: feature lists and preprocessing choices
    run_params = {
        "model": "XGBoost Regressor",
        "numerical_features": features_numericas,
        "categorical_features": features_categoricas,
        "scaler": "RobustScaler",
        "encoder": "OneHotEncoder",
    }
    mlflow.log_params(run_params)

    y_pred = model.predict(X_test)

    # MSE is computed once and reused for the RMSE
    mse = mean_squared_error(y_test, y_pred)
    metricas = {
        'r2': r2_score(y_test, y_pred),
        'mean_absolute_error': mean_absolute_error(y_test, y_pred),
        'mean_squared_error': mse,
        'root_mean_squared_error': np.sqrt(mse),
    }
    mlflow.log_metrics(metricas)

    # Log both the model and the fitted preprocessing transformer
    mlflow.xgboost.log_model(model, "XGBoostRegressor-PrecificacaoNY")
    mlflow.sklearn.log_model(transformer, "transformer")



🏃 View run Execução Modelo Base at: https://dagshub.com/aurelioguilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/7/runs/ae3facef97504a2eb85ca0a3c40c9c2b
🧪 View experiment at: https://dagshub.com/aurelioguilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/7


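# 2ª Experimentação

- Abordagem com variáveis numéricas: `disponibilidade_365` e `minimo_noites` entram como valores numéricos (em vez das versões categorizadas), `latitude` e `longitude` são adicionadas às features e `ultima_review_semestre` deixa de ser usada.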


In [13]:
# Load the raw dataset
file_path = os.path.join("../../Data", 'teste_indicium_precificacao.csv')
df = pd.read_csv(file_path)

# Model features: columns passed through the numeric scaler
features_numericas = [
    'numero_de_reviews',
    'reviews_por_mes',
    'calculado_host_listings_count',
    'disponibilidade_365',
    'latitude',
    'longitude',
    'minimo_noites',
]

# Model features: columns passed through the one-hot encoder
features_categoricas = [
    'room_type',
    'bairro_group',
    'valor_preenchido',
]

# Apply the data-treatment pipeline; it returns X and y
# (presumably the feature frame and the target — verify in tratamento_de_dados).
# NOTE(review): this runs on the full dataset before the split — confirm it
# does not learn statistics from rows that end up in the test set.
cleaner_data = tratamento_de_dados.TransformData(df, features_categoricas, features_numericas)
X, y = cleaner_data.fit_transform()

# Split the data: 70/30 hold-out with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Scale the numeric features with RobustScaler
numeric_step = ('num', RobustScaler(), features_numericas)
# One-hot encode the categorical features.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category
# unseen during fit is encoded as all zeros, the same as the dropped first
# category — confirm this is acceptable.
categorical_step = ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), features_categoricas)

transformer = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training rows only, then reuse the fitted transformer on the test rows
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [14]:
# XGBoost regressor created with no explicit hyperparameters, trained on the
# second experiment's feature set
model = xgb.XGBRegressor()
model.fit(X=X_train, y=y_train)

In [16]:
# Predict on the held-out test split
y_pred = model.predict(X_test)

# Evaluation metrics as two parallel columns (metric name / value);
# MSE is computed once and reused for the RMSE.
mse = mean_squared_error(y_test, y_pred)
metricas = {
    'Métrica': ['R²', 'MAE', 'MSE', 'RMSE'],
    'Valor': [
        r2_score(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        mse,
        np.sqrt(mse),
    ],
}

# Render the metrics as a table
pd.DataFrame(metricas)


Unnamed: 0,Métrica,Valor
0,R²,0.030136
1,MAE,67.769217
2,MSE,49909.85886
3,RMSE,223.405145


In [18]:
# Train the second experiment's model inside an MLflow run and record its
# setup, test-split metrics, and the fitted model/transformer artifacts.
with mlflow.start_run(run_name='Execução Modelo com Features Numéricas'):
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)

    # Describe the experiment setup: feature lists and preprocessing choices
    run_params = {
        "model": "XGBoost Regressor",
        "numerical_features": features_numericas,
        "categorical_features": features_categoricas,
        "scaler": "RobustScaler",
        "encoder": "OneHotEncoder",
    }
    mlflow.log_params(run_params)

    y_pred = model.predict(X_test)

    # MSE is computed once and reused for the RMSE
    mse = mean_squared_error(y_test, y_pred)
    metricas = {
        'r2': r2_score(y_test, y_pred),
        'mean_absolute_error': mean_absolute_error(y_test, y_pred),
        'mean_squared_error': mse,
        'root_mean_squared_error': np.sqrt(mse),
    }
    mlflow.log_metrics(metricas)

    # Log both the model and the fitted preprocessing transformer
    mlflow.xgboost.log_model(model, "XGBoostRegressor-PrecificacaoNY")
    mlflow.sklearn.log_model(transformer, "transformer")



🏃 View run Execução Modelo com Features Numéricas at: https://dagshub.com/aurelioguilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/7/runs/41dea0537ef74a6298fa501cbf8af1e0
🧪 View experiment at: https://dagshub.com/aurelioguilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/7
