# Decision Tree Regression Baseline

## Preprocess Data

In [1]:
import numpy as np
import pandas as pd

import sys
import os
# Put the parent directory on the import path so that the Models and Utils
# packages imported below can be resolved. The '../' is resolved against the
# kernel's current working directory, so this presumably expects the notebook
# to be launched from its own folder — confirm.
sys.path.append(os.path.abspath('../'))

from Models.DecisionTreeRegressor import DecisionTreeRegressor
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing

In [2]:
# Load the training table from disk; the 'Id' column becomes the row index.
train = pd.read_csv(
    '../Data/train.csv',
    index_col='Id',
)

In [3]:
# Remove unnecessary features based on exploratory data analysis part 1.
# NOTE: `train` is overwritten with the result, so re-running this cell feeds the
# already-processed frame back into initial_preprocessing. Re-run the read_csv
# cell first when a clean re-execution is needed.
train = initial_preprocessing(train)

## 1- Decision Tree Regression

In [4]:
# Build the feature matrix by dropping the target ("utility_agent1") together
# with the three per-agent result counts (presumably outcome statistics that
# would leak the target — confirm against the data description).
# `columns=` already selects the column axis, so no separate `axis=1` is passed.
X = train.drop(
    columns=[
        "num_wins_agent1",
        "num_draws_agent1",
        "num_losses_agent1",
        "utility_agent1",
    ]
)
# Regression target.
y = train["utility_agent1"]

In [5]:
# Split features and target into training and validation parts.
# test_size=0.2 is presumably the validation fraction and random_state=42 the
# shuffling seed of the project's own train_test_split — confirm in Utils.Utils.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Targets need no fitting: just convert both Series to NumPy arrays.
y_train_p, y_valid_p = y_train.to_numpy(), y_valid.to_numpy()

# Feature pipeline with normalization and one-hot encoding switched on and
# standardization switched off (flag semantics live in Utils.Preprocessor).
preprocessor = Preprocessor(
    normalize=True,
    standardize=False,
    one_hot_encode=True,
)

# Fit on the training split only, then reuse the fitted preprocessor on the
# validation split so both go through the same transformation.
X_train_p = preprocessor.fit_transform(X_train)
X_valid_p = preprocessor.transform(X_valid)

In [7]:
# Fit a decision tree regressor on the preprocessed training split and report
# its error on both splits.
# NOTE(review): the variable is called `lr_model` although it holds a
# DecisionTreeRegressor; the name is kept so any other cell that refers to it
# keeps working.
lr_model = DecisionTreeRegressor(min_samples_split=5000, max_depth=10)

lr_model.fit(X_train_p, y_train_p)

# `test_pred` holds the predictions for the validation split (X_valid_p).
train_pred = lr_model.predict(X_train_p)
test_pred = lr_model.predict(X_valid_p)

# The reported metric comes from root_mean_squared_error, so it is labelled as
# RMSE rather than as mean squared error.
print("Decision Tree Regression: ")
print("Train root mean squared error: ", root_mean_squared_error(y_train_p, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid_p, test_pred))

Decision Tree Regression: 
Train mean squared error:  0.5003802678696453
Validation mean squared error:  0.5000932387613501
