# Gradient Boosting Trees

#### Importación de librerías ⬇️

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.tree import export_text, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import OrdinalEncoder
from imblearn.combine import SMOTEENN

# Seed shared by the estimators, the KFold splitter and train_test_split.
random_state = 42
# Fraction of rows held out as the test set by train_test_split.
test_size = 0.2

### Carga y Preparación de Datos

In [2]:
# Load the dataset from a CSV file (path is relative to the notebook's working directory)
df = pd.read_csv('../data/los_data_gt_01.csv')

In [3]:
column_dtypes = df.dtypes

# Columns whose dtype is "object" are cast to pandas' "category" dtype,
# one column at a time; every other column is left untouched.
object_columns = column_dtypes[column_dtypes == 'object'].index
for col_name in object_columns:
    df[col_name] = df[col_name].astype('category')

In [4]:
# Columns that are replaced by their ordinal codes.
encoded_columns = ['causa_atencion', 'municipio']

encoder = OrdinalEncoder()

# Both columns are handed over together as a single 2-D array,
# so no reshaping is required.
data = df[encoded_columns].to_numpy()

# Learn one category -> code mapping per column and apply it in the same step.
encoded_data = encoder.fit_transform(data)

# Overwrite the original columns with the encoded values.
# NOTE(review): after this assignment the two columns hold the encoder's
# numeric output, so estimators built with categorical_features="from_dtype"
# will presumably treat them as numeric features -- confirm this is intended.
df[encoded_columns] = encoded_data

### Funciones de Utilidad

In [5]:
def estMetrics(est, X, y):
    """Print MSE, MAE and R^2 of an estimator's predictions.

    Parameters
    ----------
    est : object exposing ``predict(X)``.
    X : features passed to ``est.predict``.
    y : true target values the predictions are compared against.

    Returns
    -------
    None. The three metrics are printed with four decimals.
    """
    predicted = est.predict(X)

    # Evaluate the three metrics in the order they are reported.
    mse, mae, r2 = (
        metric(y, predicted)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared (R^2): {r2:.4f}")

In [6]:
def CreateAndTestRegressor(X_train, X_test, y_train, y_test):
    """Fit a baseline HistGradientBoostingRegressor and print its metrics.

    The model is built with ``categorical_features="from_dtype"`` and the
    notebook-level ``random_state``; no other hyperparameter is set.

    Parameters
    ----------
    X_train, y_train : data the regressor is fitted on.
    X_test, y_test : held-out data used for the first metrics report.

    Returns
    -------
    The fitted regressor.
    """
    regressor = HistGradientBoostingRegressor(
        random_state=random_state,
        categorical_features="from_dtype",
    ).fit(X_train, y_train)

    # Test metrics are reported first, then train metrics.
    for header, features, target in (
        ("Performance Metrics on Test Set:", X_test, y_test),
        ("\nPerformance Metrics on Train Set:", X_train, y_train),
    ):
        print(header)
        estMetrics(regressor, features, target)

    return regressor

In [7]:
def TuneAndTestRegressor(X_train, X_test, y_train, y_test):
    """Grid-search a HistGradientBoostingRegressor and print its metrics.

    Every combination of ``max_depth``, ``learning_rate``, ``max_iter`` and
    ``loss`` listed below is evaluated with a shuffled 4-fold cross-validation
    on the training data. The best parameters are printed, followed by the
    metrics of the fitted search object on the test and train sets.

    Parameters
    ----------
    X_train, y_train : data used for the cross-validated search.
    X_test, y_test : held-out data used for the first metrics report.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # Seeded so the folds are identical between runs.
    folds = KFold(n_splits=4, shuffle=True, random_state=random_state)

    # NOTE(review): the 'gamma' and 'poisson' losses place constraints on the
    # sign of the target -- confirm y_train satisfies them.
    search_space = {
        'max_depth': [5, 10, 15],           # maximum depth of each tree
        'learning_rate': [0.01, 0.1, 0.2],  # shrinks the contribution of each tree
        'max_iter': [100, 200, 300],        # maximum number of boosting iterations
        'loss': ['squared_error', 'absolute_error', 'gamma', 'poisson'],
    }

    base_model = HistGradientBoostingRegressor(
        random_state=random_state,
        categorical_features="from_dtype",
    )

    est = GridSearchCV(
        estimator=base_model,
        param_grid=search_space,
        cv=folds,
        return_train_score=True,
    )
    est.fit(X_train, y_train)

    # Trailing blank line separates the parameters from the metric reports.
    print("Best Parameters For Regressor:", est.best_params_, end="\n\n")

    for header, features, target in (
        ("Performance Metrics on Test Set:", X_test, y_test),
        ("\nPerformance Metrics on Train Set:", X_train, y_train),
    ):
        print(header)
        estMetrics(est, features, target)

    return est

## Primera Iteración de Modelo de Regresión

### Separación del Conjunto de Datos

In [8]:
# Feature-only copy of the data; the loaded DataFrame `df` is left intact.
df_cp = df.drop(columns='dias_estancia')

# X holds every remaining column, y the target column 'dias_estancia'.
X = df_cp
y = df['dias_estancia'].copy()

print(X.shape, y.shape, sep="\n")

(2228328, 10)
(2228328,)


In [9]:
# Hold out `test_size` of the rows for testing; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=random_state,
    test_size=test_size,
)

print(X_train.shape, y_train.shape, sep="\n")

(1782662, 10)
(1782662,)


### Entrenamiento del Modelo y Métricas Iniciales

Como regla general, los algoritmos de ML basados en árboles no requieren normalización ni escalado de los datos numéricos, pero sí requieren codificar las variables categóricas. La decisión de qué tipo de codificación usar se basó en la documentación de Gradient Boosting Trees de scikit-learn, según la cual la codificación ordinal nativa de HistGradientBoosting es superior en cuanto a error medio y tiempo de entrenamiento en comparación con otros tipos de codificación como One Hot. (https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html)

In [10]:
# Fit the baseline regressor on the full-data split and print its test/train metrics.
est = CreateAndTestRegressor(X_train, X_test, y_train, y_test)

Performance Metrics on Test Set:
Mean Squared Error (MSE): 5.8924
Mean Absolute Error (MAE): 0.9843
R-squared (R^2): 0.1067

Performance Metrics on Train Set:
Mean Squared Error (MSE): 5.7957
Mean Absolute Error (MAE): 0.9807
R-squared (R^2): 0.1145


### Ajuste de Hiperparámetros

In [13]:
# Run the grid search on the same split; `est` is rebound to the fitted GridSearchCV object.
est = TuneAndTestRegressor(X_train, X_test, y_train, y_test)

Best Parameters For Regressor: {'learning_rate': 0.2, 'loss': 'poisson', 'max_depth': 5, 'max_iter': 300}

Performance Metrics on Test Set:
Mean Squared Error (MSE): 5.8282
Mean Absolute Error (MAE): 0.9640
R-squared (R^2): 0.1164

Performance Metrics on Train Set:
Mean Squared Error (MSE): 5.6599
Mean Absolute Error (MAE): 0.9557
R-squared (R^2): 0.1352


## Segunda Iteración de Modelo de Regresión

### Preparación y Separación del Conjunto de Datos

In [15]:
# Second iteration: work on a copy of `df` without the 'municipio' and
# 'region' columns, restricted to rows of the 'Guatemala' department. Once the
# filter is applied 'departamento' is dropped as well.
df_cp = df.drop(columns=['municipio', 'region'])
df_cp = df_cp[df_cp['departamento'] == 'Guatemala'].drop(columns='departamento')

# Frequency of each 'dias_estancia' value, counted before the cap below.
freqs = df_cp['dias_estancia'].value_counts()

# Keep rows with at most 30 days. The mask is built from df_cp itself so it
# matches df_cp's rows exactly instead of relying on index alignment with the
# unfiltered `df`.
df_cp = df_cp.loc[df_cp['dias_estancia'] <= 30]

# Drop the rows whose 'dias_estancia' value occurs exactly once.
uniques = freqs[freqs == 1].index
df_cp = df_cp[~df_cp['dias_estancia'].isin(uniques)]

In [16]:
# Separate the target variable 'dias_estancia' into y and the rest of the DataFrame into X
y = df_cp.pop('dias_estancia')
X = df_cp

print(X.shape)
print(y.shape)

(1190137, 7)
(1190137,)


In [17]:
# Hold out `test_size` of the filtered rows for testing; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=random_state,
    test_size=test_size,
)

print(X_train.shape, y_train.shape, sep="\n")

(952109, 7)
(952109,)


### Entrenamiento del Modelo y Métricas Iniciales

In [18]:
# Fit the baseline regressor on the second-iteration split and print its test/train metrics.
est = CreateAndTestRegressor(X_train, X_test, y_train, y_test)

Performance Metrics on Test Set:
Mean Squared Error (MSE): 4.4596
Mean Absolute Error (MAE): 0.9525
R-squared (R^2): 0.1296

Performance Metrics on Train Set:
Mean Squared Error (MSE): 4.5442
Mean Absolute Error (MAE): 0.9572
R-squared (R^2): 0.1362


### Ajuste de Hiperparámetros

In [19]:
# Run the grid search on the second-iteration split; `est` is rebound to the fitted GridSearchCV object.
est = TuneAndTestRegressor(X_train, X_test, y_train, y_test)

Best Parameters For Regressor: {'learning_rate': 0.1, 'loss': 'poisson', 'max_depth': 10, 'max_iter': 300}

Performance Metrics on Test Set:
Mean Squared Error (MSE): 4.4266
Mean Absolute Error (MAE): 0.9385
R-squared (R^2): 0.1360

Performance Metrics on Train Set:
Mean Squared Error (MSE): 4.4760
Mean Absolute Error (MAE): 0.9401
R-squared (R^2): 0.1492
