# Feature Elimination

## 1- Initial Preprocessing

In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Make the parent directory importable so the local packages below resolve.
sys.path.append(os.path.abspath('../'))

from Models.LinearRegression import LinearRegression
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing
from Utils.FeatureEliminators.VarianceEliminator import VarianceEliminator
from Utils.FeatureEliminators.CorrelationEliminator import CorrelationEliminator
from Utils.FeatureEliminators.LassoEliminator import LassoEliminator
from Utils.FeatureEliminators.MutualInformationEliminator import MutualInformationEliminator


In [2]:
# Load the training set; the 'Id' column is used as the DataFrame index.
train = pd.read_csv(
    '../Data/train.csv',
    index_col='Id',
)

In [3]:
# Remove unnecessary features based on exploratory data analysis part 1.
# NOTE(review): this rebinds `train`, so re-running the cell feeds already
# preprocessed data back into initial_preprocessing — confirm that helper is
# safe to apply twice, or re-run the read cell first.
train = initial_preprocessing(train)

In [4]:
# Separate the feature matrix from the regression target. The target
# ("utility_agent1") and the three agent-1 win/draw/loss count columns are
# removed from the features.
# `columns=` already selects the column axis, so no `axis` argument is needed.
X = train.drop(
    columns=[
        "num_wins_agent1",
        "num_draws_agent1",
        "num_losses_agent1",
        "utility_agent1",
    ]
)
y = train["utility_agent1"]

In [5]:
# Split the data into training and validation sets.
# test_size=0.2 is presumably the validation share, and the fixed random_state
# presumably makes the split repeatable — confirm against Utils.train_test_split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Preprocess the data: the preprocessor (normalisation and one-hot encoding
# enabled) is fitted on the training split only and then applied, already
# fitted, to the validation split.
# NOTE(review): X_train / X_valid are rebound to their transformed versions, so
# re-running this cell would fit on already-transformed data — re-run the split
# cell first.
preprocessor = Preprocessor(normalize=True, one_hot_encode=True)

X_train = preprocessor.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=preprocessor.get_column_names())

X_valid = preprocessor.transform(X_valid)
X_valid = pd.DataFrame(X_valid, columns=preprocessor.get_column_names())

# Give the targets a fresh 0..n-1 index. Rebinding (rather than
# `inplace=True`) avoids mutating the Series objects handed back by
# train_test_split, which may still be tied to the original `train` frame.
y_train = y_train.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)

# Reminder: Linear Regression Baseline
Linear Regression: \
Train root mean squared error:  0.5175135945021986 \
Validation root mean squared error:  0.51911678407925


## Method 1: Variance Thresholding

#### Method 1 Version 1

In [7]:
# Variance thresholding, version 1 (threshold=0.01).
variance_eliminator = VarianceEliminator(X_train, y_train, threshold=0.01)

# Positional indices of the retained columns, plus the eliminator's feature mask.
selected_features = variance_eliminator.get_feature_indices()
variance_1_mask = variance_eliminator.get_feature_mask()

# Apply the same column selection to the training and validation frames.
X_train_var_1, X_test_var_1 = (
    split.iloc[:, selected_features] for split in (X_train, X_valid)
)

In [8]:
# Convert the selected feature frames to NumPy arrays. np.asarray returns an
# existing ndarray unchanged, so this cell can be re-run without raising
# (calling .to_numpy() on the already-converted array would fail).
X_train_var_1 = np.asarray(X_train_var_1)
X_test_var_1 = np.asarray(X_test_var_1)

# Fit an OLS linear regression on the variance-filtered (v1) features.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_var_1, y_train)

train_pred = lr_model.predict(X_train_var_1)
test_pred = lr_model.predict(X_test_var_1)

# The reported metric is root_mean_squared_error, so it is labelled as RMSE.
print("Linear Regression: ")
print("Number of used features: ", len(selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  336
Train mean squared error:  0.5306881182466742
Validation mean squared error:  0.5315929002975792


#### Method 1 Version 2

In [9]:
# Variance thresholding, version 2 (threshold=0.1).
# Note: this rebinds `variance_eliminator`, replacing the version-1 instance.
variance_eliminator = VarianceEliminator(X_train, y_train, threshold=0.1)

# Positional indices of the retained columns, plus the eliminator's feature mask.
variance_2_selected_features = variance_eliminator.get_feature_indices()
variance_2_mask = variance_eliminator.get_feature_mask()

# Apply the same column selection to the training and validation frames.
X_train_var_2, X_test_var_2 = (
    split.iloc[:, variance_2_selected_features] for split in (X_train, X_valid)
)

In [10]:
# Convert the selected feature frames to NumPy arrays. np.asarray returns an
# existing ndarray unchanged, so this cell can be re-run without raising
# (calling .to_numpy() on the already-converted array would fail).
X_train_var_2 = np.asarray(X_train_var_2)
X_test_var_2 = np.asarray(X_test_var_2)

# Fit an OLS linear regression on the variance-filtered (v2) features.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_var_2, y_train)

train_pred = lr_model.predict(X_train_var_2)
test_pred = lr_model.predict(X_test_var_2)

# The reported metric is root_mean_squared_error, so it is labelled as RMSE.
print("Linear Regression: ")
print("Number of used features: ", len(variance_2_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  135
Train mean squared error:  0.5995919481377527
Validation mean squared error:  0.6004933618410249


## Method 2: Correlation Thresholding

#### Method 2 Version 1

In [11]:
# Correlation thresholding, version 1 (correlation_threshold=0.01).
correlation_eliminator_1 = CorrelationEliminator(X_train, y_train, correlation_threshold=0.01)

corr_1_selected_features = correlation_eliminator_1.get_feature_indices()
# The mask must come from this correlation eliminator — not from the
# `variance_eliminator` left over from Method 1 — so that it describes the same
# selection as corr_1_selected_features.
corr_1_mask = correlation_eliminator_1.get_feature_mask()

# Apply the same positional column selection to both splits.
X_train_corr_1 = X_train.iloc[:, corr_1_selected_features]
X_test_corr_1 = X_valid.iloc[:, corr_1_selected_features]

In [12]:
# Convert the selected feature frames to NumPy arrays. np.asarray returns an
# existing ndarray unchanged, so this cell can be re-run without raising
# (calling .to_numpy() on the already-converted array would fail).
X_train_corr_1 = np.asarray(X_train_corr_1)
X_test_corr_1 = np.asarray(X_test_corr_1)

# Fit an OLS linear regression on the correlation-filtered (v1) features.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_corr_1, y_train)

train_pred = lr_model.predict(X_train_corr_1)
test_pred = lr_model.predict(X_test_corr_1)

# The reported metric is root_mean_squared_error, so it is labelled as RMSE.
print("Linear Regression: ")
print("Number of used features: ", len(corr_1_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  269
Train mean squared error:  0.5324470589299449
Validation mean squared error:  0.5340079095469382


#### Method 2 Version 2

In [13]:
# Correlation thresholding, version 2 (correlation_threshold=0.03).
correlation_eliminator_2 = CorrelationEliminator(X_train, y_train, correlation_threshold=0.03)

corr_2_selected_features = correlation_eliminator_2.get_feature_indices()
# The mask must come from this correlation eliminator — not from the
# `variance_eliminator` left over from Method 1 — so that it describes the same
# selection as corr_2_selected_features.
corr_2_mask = correlation_eliminator_2.get_feature_mask()

# Apply the same positional column selection to both splits.
X_train_corr_2 = X_train.iloc[:, corr_2_selected_features]
X_test_corr_2 = X_valid.iloc[:, corr_2_selected_features]

In [14]:
# Convert the selected feature frames to NumPy arrays. np.asarray returns an
# existing ndarray unchanged, so this cell can be re-run without raising
# (calling .to_numpy() on the already-converted array would fail).
X_train_corr_2 = np.asarray(X_train_corr_2)
X_test_corr_2 = np.asarray(X_test_corr_2)

# Fit an OLS linear regression on the correlation-filtered (v2) features.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_corr_2, y_train)

train_pred = lr_model.predict(X_train_corr_2)
test_pred = lr_model.predict(X_test_corr_2)

# The reported metric is root_mean_squared_error, so it is labelled as RMSE.
print("Linear Regression: ")
print("Number of used features: ", len(corr_2_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  76
Train mean squared error:  0.5435803901515012
Validation mean squared error:  0.5459862384342256


## Method 3: Lasso Eliminator

In [15]:
# Lasso-based elimination (l1=1, threshold=1e-1).
lasso_eliminator = LassoEliminator(X_train, y_train, l1=1, threshold=1e-1)

lasso_selected_features = lasso_eliminator.get_feature_indices()
# The mask must come from the lasso eliminator — not from the
# `variance_eliminator` left over from Method 1 — so that it describes the same
# selection as lasso_selected_features.
lasso_mask = lasso_eliminator.get_feature_mask()

# Apply the same positional column selection to both splits.
X_train_lasso = X_train.iloc[:, lasso_selected_features]
X_test_lasso = X_valid.iloc[:, lasso_selected_features]

[ 0.13552179  0.02149179  0.11688832  0.16142764  0.14623113  0.01796757
  0.1113351   0.11391161  0.1923999   0.04881386  0.06050701  0.1018681
  0.13681556  0.11160901  0.07684827  0.11741258  0.01044977  0.11186196
  0.11483097  0.15622347  0.08205623  0.14635023  0.05739204  0.08710357
  0.07569954  0.09478641  0.1297759   0.18853078  0.1891266   0.14133835
  0.08463799  0.17882477  0.10489186  0.12998745  0.13694414  0.0698415
  0.00619916  0.15315166  0.02944366  0.13845117  0.07925201  0.09499829
  0.15445651  0.16561626  0.06081209  0.09665804  0.15203817  0.08992225
  0.15694487  0.12784079  0.15107906  0.12806633  0.08822729  0.10078291
  0.12364622  0.10764498  0.16030872  0.15049549  0.10799672  0.0659031
  0.08504493  0.08652476  0.08674783  0.10277922  0.08287242  0.10177741
  0.12764416  0.15273395  0.18068018  0.03388803  0.04657592  0.09750169
  0.11119865  0.16289446  0.1266719   0.06039922  0.20080504  0.11491901
  0.11263063  0.07832645  0.18133908  0.0340257   0.09

In [16]:
# Convert the selected feature frames to NumPy arrays. np.asarray returns an
# existing ndarray unchanged, so this cell can be re-run without raising
# (calling .to_numpy() on the already-converted array would fail).
X_train_lasso = np.asarray(X_train_lasso)
X_test_lasso = np.asarray(X_test_lasso)

# Fit an OLS linear regression on the lasso-selected features.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_lasso, y_train)

train_pred = lr_model.predict(X_train_lasso)
test_pred = lr_model.predict(X_test_lasso)

# The reported metric is root_mean_squared_error, so it is labelled as RMSE.
print("Linear Regression: ")
print("Number of used features: ", len(lasso_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  290
Train mean squared error:  0.5961747512502205
Validation mean squared error:  0.5960908241963463


## Method 4: Mutual information eliminator

In [8]:
# Mutual-information elimination (threshold=0.01).
# MutualInformationEliminator is imported in the notebook's import cell with
# the other eliminators, so a fresh "Restart & Run All" exposes every
# dependency up front.
mutual_info_eliminator = MutualInformationEliminator(X_train, y_train, threshold=0.01)

# Positional indices of the retained columns, plus the eliminator's feature mask.
mutual_info_selected_features = mutual_info_eliminator.get_feature_indices()
mutual_info_mask = mutual_info_eliminator.get_feature_mask()

In [10]:
# Select the mutual-information features and convert them to NumPy arrays, as
# the other evaluation cells do. Both arrays are rebuilt from X_train / X_valid
# each time, so the cell is safe to re-run.
X_train_mutual_info = X_train.iloc[:, mutual_info_selected_features].to_numpy()
X_test_mutual_info = X_valid.iloc[:, mutual_info_selected_features].to_numpy()

# Fit an OLS linear regression on the mutual-information-selected features.
lr_model = LinearRegression(fit_method="ols", loss_function="rmse")
lr_model.fit(X_train_mutual_info, y_train)

train_pred = lr_model.predict(X_train_mutual_info)
test_pred = lr_model.predict(X_test_mutual_info)

# The reported metric is root_mean_squared_error, so it is labelled as RMSE.
print("Linear Regression: ")
print("Number of used features: ", len(mutual_info_selected_features))
print("Train root mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation root mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  50
Train mean squared error:  0.555158807269044
Validation mean squared error:  0.5569853990675582
