# Feature Elimination

## 1- Initial Preprocessing

In [1]:
import numpy as np
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath('../'))

from Models.LinearRegression import LinearRegression
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing
from Utils.FeatureEliminators.VarianceEliminator import VarianceEliminator
from Utils.FeatureEliminators.CorrelationEliminator import CorrelationEliminator
from Utils.FeatureEliminators.LassoEliminator import LassoEliminator

In [2]:
# Load the training set, using the 'Id' column as the row index.
train_csv_path = '../Data/train.csv'
train = pd.read_csv(train_csv_path, index_col='Id')

In [3]:
# Remove unnecessary features based on exploratory data analysis part 1.
# `initial_preprocessing` is the project helper imported from Utils.Utils.
# Its result replaces `train`, so re-running this cell feeds it data that has
# already been processed once.
train = initial_preprocessing(train)

In [4]:
# Separate the regression target from the feature columns.
target_column = "utility_agent1"
# Agent-1 outcome-count columns that are excluded from the features together
# with the target itself.
outcome_columns = ["num_wins_agent1", "num_draws_agent1", "num_losses_agent1"]

# `columns=` already selects the column axis, so no `axis` argument is needed.
X = train.drop(columns=outcome_columns + [target_column])
y = train[target_column]

In [5]:
# Hold out part of the data for validation (test_size=0.2); the fixed
# random_state keeps the split the same between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

In [6]:
# Preprocess the data: the preprocessor is fitted on the training split only
# and then applied, unchanged, to the validation split.
preprocessor = Preprocessor(normalize=True, one_hot_encode=True)

X_train = preprocessor.fit_transform(X_train)
# Read the output column names once and reuse them for both frames so the
# training and validation columns are guaranteed to line up.
feature_names = preprocessor.get_column_names()
X_train = pd.DataFrame(X_train, columns=feature_names)

X_valid = pd.DataFrame(preprocessor.transform(X_valid), columns=feature_names)

# Drop the original 'Id'-based index from the targets (presumably so they align
# positionally with the rebuilt feature frames -- confirm against Preprocessor).
# Reassigning instead of using inplace=True avoids mutating Series objects that
# may still be shared with `y`.
y_train = y_train.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)

## Reminder: Linear Regression Baseline
Linear Regression: \
Train mean squared error:  0.5175135945021986 \
Validation mean squared error:  0.51911678407925


## Method 1: Variance Thresholding

#### Method 1 Version 1

In [7]:
# Variance thresholding, version 1: threshold=0.01.
variance_eliminator = VarianceEliminator(X_train, y_train, threshold=0.01)

# Positional indices of the retained columns, plus the eliminator's feature mask.
selected_features = variance_eliminator.get_feature_indices()
variance_1_mask = variance_eliminator.get_feature_mask()

# Apply the same column selection to the training and validation frames.
X_train_var_1, X_test_var_1 = (
    frame.iloc[:, selected_features] for frame in (X_train, X_valid)
)

In [8]:
# Convert the selected-feature frames to NumPy arrays for the model.
# np.asarray (rather than .to_numpy()) keeps this cell re-runnable: on a second
# run the names already hold arrays, which have no .to_numpy() method.
X_train_var_1 = np.asarray(X_train_var_1)
X_test_var_1 = np.asarray(X_test_var_1)

# Fit ordinary least squares on the reduced training set.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_var_1, y_train)

train_pred = lr_model.predict(X_train_var_1)
test_pred = lr_model.predict(X_test_var_1)

# The reported metric is computed with root_mean_squared_error, so the labels
# say RMSE rather than MSE.
print("Linear Regression: ")
print("Number of used features: ", len(selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  336
Train mean squared error:  0.5306881182466742
Validation mean squared error:  0.5315929002975792


#### Method 1 Version 2

In [9]:
# Variance thresholding, version 2: a stricter threshold of 0.1.
variance_eliminator = VarianceEliminator(X_train, y_train, threshold=0.1)

# Positional indices of the retained columns, plus the eliminator's feature mask.
variance_2_selected_features = variance_eliminator.get_feature_indices()
variance_2_mask = variance_eliminator.get_feature_mask()

# Apply the same column selection to the training and validation frames.
X_train_var_2, X_test_var_2 = (
    frame.iloc[:, variance_2_selected_features] for frame in (X_train, X_valid)
)

In [10]:
# Convert the selected-feature frames to NumPy arrays for the model.
# np.asarray (rather than .to_numpy()) keeps this cell re-runnable: on a second
# run the names already hold arrays, which have no .to_numpy() method.
X_train_var_2 = np.asarray(X_train_var_2)
X_test_var_2 = np.asarray(X_test_var_2)

# Fit ordinary least squares on the reduced training set.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_var_2, y_train)

train_pred = lr_model.predict(X_train_var_2)
test_pred = lr_model.predict(X_test_var_2)

# The reported metric is computed with root_mean_squared_error, so the labels
# say RMSE rather than MSE.
print("Linear Regression: ")
print("Number of used features: ", len(variance_2_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  135
Train mean squared error:  0.5995919481377527
Validation mean squared error:  0.6004933618410249


## Method 2: Correlation Thresholding

#### Method 2 Version 1

In [11]:
# Correlation thresholding, version 1: correlation_threshold=0.01.
correlation_eliminator_1 = CorrelationEliminator(X_train, y_train, correlation_threshold=0.01)

corr_1_selected_features = correlation_eliminator_1.get_feature_indices()
# The mask must come from this correlation eliminator, not from the
# `variance_eliminator` object left over from Method 1.
# NOTE(review): assumes CorrelationEliminator exposes get_feature_mask() like
# VarianceEliminator does -- confirm.
corr_1_mask = correlation_eliminator_1.get_feature_mask()

# Apply the same column selection to the training and validation frames.
X_train_corr_1 = X_train.iloc[:, corr_1_selected_features]
X_test_corr_1 = X_valid.iloc[:, corr_1_selected_features]

In [12]:
# Convert the selected-feature frames to NumPy arrays for the model.
# np.asarray (rather than .to_numpy()) keeps this cell re-runnable: on a second
# run the names already hold arrays, which have no .to_numpy() method.
X_train_corr_1 = np.asarray(X_train_corr_1)
X_test_corr_1 = np.asarray(X_test_corr_1)

# Fit ordinary least squares on the reduced training set.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_corr_1, y_train)

train_pred = lr_model.predict(X_train_corr_1)
test_pred = lr_model.predict(X_test_corr_1)

# The reported metric is computed with root_mean_squared_error, so the labels
# say RMSE rather than MSE.
print("Linear Regression: ")
print("Number of used features: ", len(corr_1_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  269
Train mean squared error:  0.5324470589299449
Validation mean squared error:  0.5340079095469382


#### Method 2 Version 2

In [13]:
# Correlation thresholding, version 2: a stricter correlation_threshold of 0.03.
correlation_eliminator_2 = CorrelationEliminator(X_train, y_train, correlation_threshold=0.03)

corr_2_selected_features = correlation_eliminator_2.get_feature_indices()
# The mask must come from this correlation eliminator, not from the
# `variance_eliminator` object left over from Method 1.
# NOTE(review): assumes CorrelationEliminator exposes get_feature_mask() like
# VarianceEliminator does -- confirm.
corr_2_mask = correlation_eliminator_2.get_feature_mask()

# Apply the same column selection to the training and validation frames.
X_train_corr_2 = X_train.iloc[:, corr_2_selected_features]
X_test_corr_2 = X_valid.iloc[:, corr_2_selected_features]

In [14]:
# Convert the selected-feature frames to NumPy arrays for the model.
# np.asarray (rather than .to_numpy()) keeps this cell re-runnable: on a second
# run the names already hold arrays, which have no .to_numpy() method.
X_train_corr_2 = np.asarray(X_train_corr_2)
X_test_corr_2 = np.asarray(X_test_corr_2)

# Fit ordinary least squares on the reduced training set.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_corr_2, y_train)

train_pred = lr_model.predict(X_train_corr_2)
test_pred = lr_model.predict(X_test_corr_2)

# The reported metric is computed with root_mean_squared_error, so the labels
# say RMSE rather than MSE.
print("Linear Regression: ")
print("Number of used features: ", len(corr_2_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  76
Train mean squared error:  0.5435803901515012
Validation mean squared error:  0.5459862384342256


## Method 3: Lasso Eliminator

In [27]:
# Lasso-based elimination with l1=1 and threshold=1e-1.
lasso_eliminator = LassoEliminator(X_train, y_train, l1=1, threshold=1e-1)

lasso_selected_features = lasso_eliminator.get_feature_indices()
# The mask must come from the lasso eliminator, not from the
# `variance_eliminator` object left over from Method 1.
# NOTE(review): assumes LassoEliminator exposes get_feature_mask() like
# VarianceEliminator does -- confirm.
lasso_mask = lasso_eliminator.get_feature_mask()

# Apply the same column selection to the training and validation frames.
X_train_lasso = X_train.iloc[:, lasso_selected_features]
X_test_lasso = X_valid.iloc[:, lasso_selected_features]

[-2.85036299e-02 -3.74651419e-02 -5.15764031e-02 -1.72190805e-01
 -2.40563418e-01 -1.38952880e-02 -1.93461113e-01 -1.19955814e-01
 -8.06209364e-02 -1.97545871e-01 -9.07825729e-02 -2.50226382e-01
 -1.06739467e-01 -8.37862165e-02 -4.60427213e-02 -6.19663740e-02
 -1.50473780e-01 -8.47370890e-02 -4.34359747e-02 -3.95762857e-02
 -7.77021546e-02 -4.91179241e-02 -2.00315561e-01 -9.32744169e-02
 -1.19097719e-01 -1.33614361e-01 -9.41745801e-02 -7.37853222e-02
 -5.79564366e-02 -2.81352860e-02 -1.71525566e-01 -7.41492203e-03
 -1.20348092e-01 -5.14040282e-02 -2.45440640e-03 -4.53417243e-05
 -1.36530249e-01 -1.13911056e-01 -1.65215537e-01 -1.39544671e-01
 -4.27981109e-02 -1.56968018e-01 -8.27118809e-02 -1.24678468e-01
 -1.24100653e-01 -1.32376008e-01 -9.66637144e-02 -2.33290195e-01
 -1.63326105e-02 -6.71271760e-02 -6.51948788e-02 -1.19597969e-01
 -1.90442437e-01 -3.23848321e-02 -6.74616565e-02 -1.38042307e-01
 -1.07623891e-01 -1.62338466e-01 -8.71742091e-02 -1.69414275e-01
 -1.05679898e-01 -1.15488

In [28]:
# Convert the selected-feature frames to NumPy arrays for the model.
# np.asarray (rather than .to_numpy()) keeps this cell re-runnable: on a second
# run the names already hold arrays, which have no .to_numpy() method.
X_train_lasso = np.asarray(X_train_lasso)
X_test_lasso = np.asarray(X_test_lasso)

# Fit ordinary least squares on the reduced training set.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_lasso, y_train)

train_pred = lr_model.predict(X_train_lasso)
test_pred = lr_model.predict(X_test_lasso)

# The reported metric is computed with root_mean_squared_error, so the labels
# say RMSE rather than MSE.
print("Linear Regression: ")
print("Number of used features: ", len(lasso_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  346
Train mean squared error:  0.5340242905222083
Validation mean squared error:  0.5358824169618749
