# Gradient Boost com XGBoost

## Importando bibliotecas essenciais e dados

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset_train = pd.read_csv("data/train_processed.csv")
dataset_test = pd.read_csv("data/test_processed.csv")

In [3]:
dataset_train.head(5)

Unnamed: 0,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,...,Arrival Delay in Minutes,Jovem,Adulto,Idoso,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,0.086632,0.6,0.8,0.6,0.2,1.0,0.6,1.0,1.0,0.8,...,0.5625,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.041195,0.6,0.4,0.6,0.6,0.2,0.6,0.2,0.2,0.2,...,0.1875,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.224354,0.4,0.4,0.4,0.4,1.0,1.0,1.0,1.0,0.8,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.107229,0.4,1.0,1.0,1.0,0.4,0.4,0.4,0.4,0.4,...,0.28125,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.036955,0.6,0.6,0.6,0.6,0.8,1.0,1.0,0.6,0.6,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [4]:
dataset_test.head(5)

Unnamed: 0,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,...,Arrival Delay in Minutes,Jovem,Adulto,Idoso,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,0.02605,1.0,0.8,0.6,0.8,0.6,0.8,0.6,1.0,1.0,...,1.375,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.57189,0.2,0.2,0.6,0.2,1.0,0.8,1.0,0.8,0.8,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.032512,0.4,0.0,0.4,0.8,0.4,0.4,0.4,0.4,0.8,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3,0.675687,0.0,0.0,0.0,0.4,0.6,0.8,0.8,0.2,0.2,...,0.1875,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.232431,0.4,0.6,0.8,0.6,0.8,0.2,0.4,0.4,0.4,...,0.625,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [5]:
from sklearn.model_selection import train_test_split

X_train = dataset_train.drop("satisfaction_satisfied",axis=1)
y_train = dataset_train["satisfaction_satisfied"]

X_train, X_Val, y_train, y_Val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

X_test = dataset_test.drop("satisfaction_satisfied",axis=1)
y_test = dataset_test["satisfaction_satisfied"]

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

param_grid = {
    'loss': ['log_loss', 'exponential'],  # função de perda
    'learning_rate': [0.01, 0.1, 0.2],  # taxa de aprendizado
    'n_estimators': [100, 200, 300],  # número de árvores
    'subsample': [0.8, 1.0],  # fração de amostras para treinar cada árvore
    'criterion': ['friedman_mse', "mse", "mae" ],  # critério para melhorar a qualidade do split
    'min_samples_leaf': [1, 2, 4],  # número mínimo de amostras por folha
    'max_depth': [3, 5, 10]  # profundidade máxima das árvores
}

clf = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Melhores parâmetros:", grid_search.best_params_)
print("Melhor pontuação de validação cruzada (acurácia):", grid_search.best_score_)

# Testar no conjunto de teste
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia no conjunto de teste:", test_accuracy)

Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END criterion=friedman_mse, learning_rate=0.01, loss=log_loss, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=0.8; total time=  10.2s
[CV] END criterion=friedman_mse, learning_rate=0.01, loss=log_loss, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=0.8; total time=   7.4s
[CV] END criterion=friedman_mse, learning_rate=0.01, loss=log_loss, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=0.8; total time=   7.7s
[CV] END criterion=friedman_mse, learning_rate=0.01, loss=log_loss, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=1.0; total time=   8.4s
[CV] END criterion=friedman_mse, learning_rate=0.01, loss=log_loss, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=1.0; total time=   8.1s
[CV] END criterion=friedman_mse, learning_rate=0.01, loss=log_loss, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=1.0; total time=  12.3s
[CV] END criterion

: 

In [None]:
%pip install xgboost

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: C:\Users\rubin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Definindo o modelo
xgb = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

# Definindo a grade de parâmetros
param_grid = {
    # o xgboost não possui uma equivalência para o criterion, Min_samples_leaf, 
    'eta': [0.01, 0.1, 0.2], # equivalente à Learning Rate do GradientBoost
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 2, 4],
    'max_depth': [3, 5, 7]
}

# Configurando GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1)
grid_search.fit(X_train, y_train)

# Mostrando os melhores parâmetros e a melhor pontuação
print("Melhores parâmetros encontrados:", grid_search.best_params_)
print("Melhor pontuação de validação cruzada (acurácia):", grid_search.best_score_)


Fitting 3 folds for each of 162 candidates, totalling 486 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Melhores parâmetros encontrados: {'eta': 0.1, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 200, 'subsample': 0.8}
Melhor pontuação de validação cruzada (acurácia): 0.9607586490093419
