# Gradient Boosting Regression Baseline

## Preprocess Data

In [1]:
import numpy as np
import pandas as pd

import sys
import os
# Put the parent directory on the import path; presumably this is what lets the
# project-local `Models` and `Utils` packages imported below be resolved.
sys.path.append(os.path.abspath('../'))

from Models.GradientBoostingRegressor import GradientBoostingRegressor
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing

In [2]:
# Load the training CSV, using its `Id` column as the DataFrame index.
train_csv_path = '../Data/train.csv'
train = pd.read_csv(train_csv_path, index_col='Id')

KeyboardInterrupt: 

In [12]:
# Remove unnecessary features based on exploratory data analysis part 1.
# NOTE(review): `train` is overwritten with the helper's output, so re-running
# this cell feeds already-processed data back into `initial_preprocessing`.
# Reload the CSV first unless the helper is known to be idempotent — confirm.
train = initial_preprocessing(train)

## 1- Gradient Boosting

In [13]:
# Separate the feature matrix from the regression target.
# The win/draw/loss counts are removed along with the target
# (presumably because they would leak it — confirm against the EDA).
target_column = "utility_agent1"
excluded_columns = ["num_wins_agent1", "num_draws_agent1", "num_losses_agent1", target_column]

X = train.drop(columns=excluded_columns)
y = train[target_column]

In [14]:
# Hold out part of the data for validation (test_size=0.2); the split is
# seeded with random_state=42, presumably so it is repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Maximum number of training rows kept for fitting (keeps the fit tractable).
# Set to None to train on every row: slicing with `[:None]` keeps everything.
MAX_TRAIN_ROWS = 50000

preprocessor = Preprocessor(normalize=True, standardize=False, one_hot_encode=True)

# Fit the preprocessing on the training split only, then reuse the fitted
# preprocessor on the validation split.
X_train_p = preprocessor.fit_transform(X_train)
X_valid_p = preprocessor.transform(X_valid)

# Clip the training set to its first MAX_TRAIN_ROWS rows. `.iloc` makes the
# target slice explicitly positional so it always lines up row-for-row with
# the feature slice, regardless of what labels the `Id` index carries.
X_train_p = X_train_p[:MAX_TRAIN_ROWS]
y_train_p = y_train.iloc[:MAX_TRAIN_ROWS]

# Convert the targets to plain NumPy arrays for the model and the metric.
y_train_p = y_train_p.to_numpy()
y_valid_p = y_valid.to_numpy()

In [16]:
# Hyperparameters for the baseline gradient boosting model.
gb_params = dict(
    learning_rate=0.1,
    n_estimators=100,
    min_samples_split=1000,
    max_depth=7,
)
gb_model = GradientBoostingRegressor(**gb_params)

In [17]:
# Fit the model on the preprocessed training features and their targets.
gb_model.fit(X_train_p, y_train_p)

In [18]:
# Predict on the training rows used for fitting and on the held-out validation rows.
train_pred = gb_model.predict(X_train_p)
# NOTE: despite its name, `test_pred` holds validation-set predictions; the
# name is kept so any later cells that reference it keep working.
test_pred = gb_model.predict(X_valid_p)

# The metric computed is root_mean_squared_error, so the labels say RMSE
# (they previously said "mean squared error", which misdescribed the numbers).
print("Gradient Boosting Regression: ")
print("Train root mean squared error: ", root_mean_squared_error(y_train_p, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid_p, test_pred))

Gradient Boosting Regression: 
Train mean squared error:  0.42737335599550974
Validation mean squared error:  0.43644783769078244
