# Gradient Boosting Final

## Preprocess Data

In [1]:
import numpy as np
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath('../'))

from Models.GradientBoostingRegressor import GradientBoostingRegressor
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing, feature_elimination

In [2]:
# Load the training data from CSV, using its 'Id' column as the row index.
train = pd.read_csv(
    '../Data/train.csv',
    index_col='Id',
)

In [3]:
# Remove unnecessary features based on exploratory data analysis part 1.
# Both helpers come from Utils.Utils; the result of initial_preprocessing is
# passed straight into feature_elimination.
# NOTE: this rebinds `train`, so re-running the cell feeds already-processed
# data back through both helpers — reload the CSV first if a re-run is needed.
train = feature_elimination(initial_preprocessing(train))

In [4]:
# Columns left out of the feature matrix; "utility_agent1" is the target.
non_feature_cols = [
    "num_wins_agent1",
    "num_draws_agent1",
    "num_losses_agent1",
    "utility_agent1",
]

# `columns=` already addresses the column axis, so no `axis` argument is needed.
X = train.drop(columns=non_feature_cols)
y = train["utility_agent1"]

In [5]:
# Split features and target into training and validation parts with the
# project's own train_test_split (Utils.Utils).
# test_size=0.2 is presumably the held-out fraction and random_state=42 the
# shuffle seed — TODO confirm against the helper's implementation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Project preprocessor configured with normalization and one-hot encoding on,
# standardization off.
preprocessor = Preprocessor(
    one_hot_encode=True,
    normalize=True,
    standardize=False,
)

# Fit on the training split only, then reuse the fitted preprocessor on the
# validation split.
X_train_p = preprocessor.fit_transform(X_train)
X_valid_p = preprocessor.transform(X_valid)

# Targets as plain NumPy arrays.
y_train_p = y_train.to_numpy()
y_valid_p = y_valid.to_numpy()

## 1- Gradient Boosting

Randomized cross-validation (CV) search result for Gradient Boosting:

Best Params: [50, np.float64(0.01), np.int64(5), np.float64(0.020000000000000004)]

Best Score: 0.539268

In [None]:
# Final Gradient Boosting model, configured from the randomized CV search
# result recorded above.
# min_samples_split is passed as an absolute count: 2% of the training rows.
# NOTE(review): the recorded best params list 50 (presumably n_estimators),
# but only 10 estimators are fitted here — confirm whether this reduction is
# intentional (e.g. to shorten run time).
gb_model = GradientBoostingRegressor(
    learning_rate=0.01,
    n_estimators=10,
    max_depth=5,
    min_samples_split=int(0.02 * X_train_p.shape[0]),
)

gb_model.fit(X_train_p, y_train_p)

train_pred = gb_model.predict(X_train_p)
# Predictions for the validation split (the variable keeps its original name).
test_pred = gb_model.predict(X_valid_p)

# Labels corrected: the model is Gradient Boosting and the metric is RMSE.
print("Gradient Boosting: ")
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))