# Linear Regression Baseline

## Preprocess Data

In [7]:
import numpy as np
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath('../'))

from Models.LinearRegression import LinearRegression
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing
from Utils.CrossValidation import KFoldCrossValidation
from Utils.Pipeline import Pipeline

In [8]:
# Load the training table from the project's Data directory; the 'Id' column
# becomes the DataFrame index instead of a regular feature column.
train = pd.read_csv(
    filepath_or_buffer='../Data/train.csv',
    index_col='Id',
)

In [9]:
# Remove unnecessary features based on exploratory data analysis part 1.
# NOTE(review): `train` is rebound to the helper's result, so re-running this
# cell passes already-processed data back into initial_preprocessing — confirm
# the helper tolerates that, or restart the kernel before re-running.
train = initial_preprocessing(train)

In [10]:
# Split the frame into the feature matrix X and the regression target y.
# `columns=` on its own selects the column axis, so no `axis` argument is
# passed to drop().
# NOTE(review): the num_*_agent1 columns are presumably outcome counts that
# would leak the target if kept as features — confirm against the EDA notebook.
X = train.drop(columns=["num_wins_agent1", "num_draws_agent1", "num_losses_agent1", "utility_agent1"])
y = train["utility_agent1"]

In [11]:
# Split features/target into training and validation parts using the project's
# own train_test_split (presumably a 20% hold-out with a fixed seed, going by
# test_size=0.2 / random_state=42 — confirm in Utils.Utils).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Clip the training part to its first MAX_TRAIN_ROWS rows. A single shared
# constant guarantees X_train and y_train are always cut at the same length.
MAX_TRAIN_ROWS = 50000
X_train = X_train[:MAX_TRAIN_ROWS]
y_train = y_train[:MAX_TRAIN_ROWS]

## 1- Linear Regression

In [12]:
# Baseline model: LinearRegression with fit_method="ols" and no l1/l2 argument,
# wrapped with the preprocessor in a Pipeline and scored by k=5 cross-validation
# on X_train / y_train.
preprocessor = Preprocessor(
    normalize=True,
    standardize=False,
    one_hot_encode=True,
)
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
pipeline = Pipeline(preprocessor, lr_model)
cross_validator = KFoldCrossValidation(pipeline, X_train, y_train, k=5)

print("Linear Regression 5 Fold Cross Validation: ")
# The score is requested after the header line is printed, so anything the
# validator itself writes appears below the header.
mean_rmse = cross_validator.mean_score()
print("Cross Validation mean root mean squared error: ", mean_rmse)

Linear Regression: 
Cross Validation mean root mean squared error:  0.5226248339668308


## 2- Lasso Regression

In [None]:
# Lasso variant: LinearRegression with fit_method="gd" and an l1=0.01 penalty,
# wrapped with the preprocessor in a Pipeline and scored by k=5 cross-validation
# on X_train / y_train.
# NOTE(review): epochs=10 with learning_rate=0.01 may stop well before
# convergence, which could explain a weaker score than the OLS fits — confirm
# before comparing models.
preprocessor = Preprocessor(
    normalize=True,
    standardize=False,
    one_hot_encode=True,
)
lr_model = LinearRegression(
    fit_method="gd",
    loss_function="rmse",
    l1=0.01,
    learning_rate=0.01,
    epochs=10,
    min_step_size=0.001,
    gradient_descent='batch',
)
pipeline = Pipeline(preprocessor, lr_model)
cross_validator = KFoldCrossValidation(pipeline, X_train, y_train, k=5)

print("Lasso Linear Regression 5 Fold Cross Validation: ")
# The score is requested after the header line is printed, so anything the
# validator itself writes appears below the header.
mean_rmse = cross_validator.mean_score()
print("Cross Validation mean root mean squared error: ", mean_rmse)

Lasso Linear Regression: 
Cross Validation mean root mean squared error:  0.6207939859987704


## 3- Ridge Regression

In [None]:
# Ridge variant: LinearRegression with fit_method="ols" and an l2=0.01 penalty,
# wrapped with the preprocessor in a Pipeline and scored by k=5 cross-validation
# on X_train / y_train.
preprocessor = Preprocessor(
    normalize=True,
    standardize=False,
    one_hot_encode=True,
)
lr_model = LinearRegression(fit_method="ols", loss_function="rmse", l2=0.01)
pipeline = Pipeline(preprocessor, lr_model)
cross_validator = KFoldCrossValidation(pipeline, X_train, y_train, k=5)

print("Ridge Linear Regression 5 Fold Cross Validation: ")
# The score is requested after the header line is printed, so anything the
# validator itself writes appears below the header.
mean_rmse = cross_validator.mean_score()
print("Cross Validation mean root mean squared error: ", mean_rmse)

Ridge Linear Regression: 
Cross Validation mean root mean squared error:  0.5231208921777183
