# Feature Elimination

## 1- Initial Preprocessing

In [1]:
import numpy as np
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath('../'))

from Models.LinearRegression import LinearRegression
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing
from Utils.FeatureEliminators.VarianceEliminator import VarianceEliminator
from Utils.FeatureEliminators.CorrelationEliminator import CorrelationEliminator
from Utils.FeatureEliminators.LassoEliminator import LassoEliminator
from Utils.FeatureEliminators.MutualInformationEliminator import MutualInformationEliminator

In [2]:
# Read the data
train = pd.read_csv('../Data/train.csv', index_col='Id')

In [3]:
# Remove unnecessary features based on exploratory data analysis part 1.
train = initial_preprocessing(train)

In [4]:
X = train.drop(columns=["num_wins_agent1", "num_draws_agent1", "num_losses_agent1", "utility_agent1"], axis=1)
y = train["utility_agent1"]

In [5]:
# Split the data into training and testing sets
X_train, X_valid, y_train, y_valid= train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Preprocess the data
preprocessor = Preprocessor(normalize=False, one_hot_encode=True)

X_train = preprocessor.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=preprocessor.get_column_names())

X_valid = preprocessor.transform(X_valid)
X_valid = pd.DataFrame(X_valid, columns=preprocessor.get_column_names())

y_train.reset_index(drop=True, inplace=True)
y_valid.reset_index(drop=True, inplace=True)

# Reminder: Linear Regression Baseline
Linear Regression: \
Train mean squared error:  0.5175135945021986 \
Validation mean squared error:  0.51911678407925


# Eliminate High Correlation Features

In [7]:
# Already computed the correlation matrix before using np. 
correlation_matrix = X_train.corr()

In [8]:
# Identify features to keep based on the algorithm
selected_features = set(X_train.columns)

for col in correlation_matrix.columns:
    if col in selected_features:
        # Find features highly correlated with the current feature
        correlated_features = correlation_matrix.index[
            correlation_matrix[col].abs() > 0.8
        ].tolist()

        if len(correlated_features) > 1:
            # Compare their absolute correlation with the target variable
            best_feature = max(correlated_features, key=lambda feature: abs(X_train[feature].corr(y_train)))
            
            # Remove all except the best feature from the set
            selected_features -= set(correlated_features) - {best_feature}

# Convert selected_features to a list for indexing
selected_features = list(selected_features)
print(len(selected_features))

386


In [9]:
# keep only the selected features
X_train = X_train[selected_features]
X_valid = X_valid[selected_features]

## Method 1: Variance Thresholding

#### Method 1 version 1

In [10]:
variance_eliminator = VarianceEliminator(X_train, y_train, threshold=0.01)

selected_features = variance_eliminator.get_feature_indices()
variance_1_mask = variance_eliminator.get_feature_mask()

X_train_var_1 = X_train.iloc[:, selected_features]
X_test_var_1 = X_valid.iloc[:, selected_features]

In [11]:
# to numpy array
X_train_var_1 = X_train_var_1.to_numpy()
X_test_var_1 = X_test_var_1.to_numpy()

lr_model = LinearRegression(fit_method="ols", loss_function="rmse")

lr_model.fit(X_train_var_1, y_train)

train_pred = lr_model.predict(X_train_var_1)
test_pred = lr_model.predict(X_test_var_1)

print("Linear Regression: ")
print("Number of used featues: ", len(selected_features))
print("Train mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  246
Train mean squared error:  0.6242887537202013
Validation mean squared error:  0.6243533341402387


#### Method 1 version 2

In [12]:
variance_eliminator = VarianceEliminator(X_train, y_train, threshold=0.03)

variance_2_selected_features = variance_eliminator.get_feature_indices()
variance_2_mask = variance_eliminator.get_feature_mask()

X_train_var_2 = X_train.iloc[:, variance_2_selected_features]
X_test_var_2 = X_valid.iloc[:, variance_2_selected_features]

In [13]:
# to numpy array
X_train_var_2 = X_train_var_2.to_numpy()
X_test_var_2 = X_test_var_2.to_numpy()

lr_model = LinearRegression(fit_method="ols", loss_function="rmse")

lr_model.fit(X_train_var_2, y_train)

train_pred = lr_model.predict(X_train_var_2)
test_pred = lr_model.predict(X_test_var_2)

print("Linear Regression: ")
print("Number of used featues: ", len(variance_2_selected_features))
print("Train mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  200
Train mean squared error:  0.6242887537202013
Validation mean squared error:  0.6243533341402387


## Method 2: Correlation Thresholding

#### Method 2 Version 1

In [14]:
correlation_eliminator_1 = CorrelationEliminator(X_train, y_train, correlation_threshold=0.01)

corr_1_selected_features = correlation_eliminator_1.get_feature_indices()
corr_1_mask = correlation_eliminator_1.get_feature_mask()

X_train_corr_1 = X_train.iloc[:, corr_1_selected_features]
X_test_corr_1 = X_valid.iloc[:, corr_1_selected_features]

In [15]:
# to numpy array
X_train_corr_1 = X_train_corr_1.to_numpy()
X_test_corr_1 = X_test_corr_1.to_numpy()

lr_model = LinearRegression(fit_method="ols", loss_function="rmse")

lr_model.fit(X_train_corr_1, y_train)

train_pred = lr_model.predict(X_train_corr_1)
test_pred = lr_model.predict(X_test_corr_1)

print("Linear Regression: ")
print("Number of used featues: ", len(corr_1_selected_features))
print("Train mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  192
Train mean squared error:  0.5367566287134902
Validation mean squared error:  0.538758615168941


#### Method 2 Version 2

In [16]:
correlation_eliminator_2 = CorrelationEliminator(X_train, y_train, correlation_threshold=0.03)

corr_2_selected_features = correlation_eliminator_2.get_feature_indices()
corr_2_mask = variance_eliminator.get_feature_mask()

X_train_corr_2 = X_train.iloc[:, corr_2_selected_features]
X_test_corr_2 = X_valid.iloc[:, corr_2_selected_features]

In [17]:
# to numpy array
X_train_corr_2 = X_train_corr_2.to_numpy()
X_test_corr_2 = X_test_corr_2.to_numpy()

lr_model = LinearRegression(fit_method="ols", loss_function="rmse")

lr_model.fit(X_train_corr_2, y_train)

train_pred = lr_model.predict(X_train_corr_2)
test_pred = lr_model.predict(X_test_corr_2)

print("Linear Regression: ")
print("Number of used featues: ", len(corr_2_selected_features))
print("Train mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  59
Train mean squared error:  0.5445587904045704
Validation mean squared error:  0.5469917126963312


## Method 3: Lasso Eliminator

In [18]:
lasso_eliminator = LassoEliminator(X_train, y_train, l1=0.1, threshold=0.02)

lasso_selected_features = lasso_eliminator.get_feature_indices()
lasso_mask = variance_eliminator.get_feature_mask()

X_train_lasso = X_train.iloc[:, lasso_selected_features]
X_test_lasso = X_valid.iloc[:, lasso_selected_features]

[ 1.69805389e-01  2.03949171e-02  3.96465439e+00  2.90878348e-01
  3.27306547e-02  4.59094201e+00  3.01376723e+00  6.58437046e+00
  4.96482188e+00  7.17253834e+00  8.51916751e+00  4.98510124e+00
  2.78335847e-02  6.65722053e-03  2.04861943e+00  1.00872846e-01
  4.26635937e+01  5.00537957e-03  1.97903324e+01  1.49331964e-02
  3.56773624e-02  4.40287334e-02  2.42826327e+00  2.75116279e-01
  8.94380098e+03  3.52659293e+01  5.61549966e-02  8.50019143e-02
  1.15474676e+00  2.06801190e-01  2.80266349e+00  5.21190577e-02
  9.25653157e-03  7.81407345e-03  3.20719160e-02  7.45465314e-01
  2.46264432e-02  1.31089920e-01  2.43169438e-02  3.01967421e+00
  4.20489483e-01  2.47055319e+00  3.15967750e+00  1.19322714e-02
  3.70582541e+00  1.26564783e-01  5.87704224e-01  9.86894890e-01
  3.04424869e-01  2.50865392e+00  1.88374382e+00  1.93602816e-01
  1.55095764e-01  2.05889346e-02  1.48107955e-02  6.97740662e-01
  4.76131396e-02  1.99352955e-02  9.11313114e-02  5.80534494e+02
  2.46143608e+00  6.12389

In [19]:
# to numpy array
X_train_lasso = X_train_lasso.to_numpy()
X_test_lasso = X_test_lasso.to_numpy()

lr_model = LinearRegression(fit_method="ols", loss_function="rmse")

lr_model.fit(X_train_lasso, y_train)

train_pred = lr_model.predict(X_train_lasso)
test_pred = lr_model.predict(X_test_lasso)

print("Linear Regression: ")
print("Number of used featues: ", len(lasso_selected_features))
print("Train mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  316
Train mean squared error:  0.6242887537202013
Validation mean squared error:  0.6243533341402387


## Method 4: Mutual information eliminator

In [20]:
mutual_info_eliminator = MutualInformationEliminator(X_train, y_train, threshold=0.01)

mutual_info_selected_features = mutual_info_eliminator.get_feature_indices()
mutual_info_mask = mutual_info_eliminator.get_feature_mask()

In [21]:
# to numpy array
X_train_mutual_info = X_train.iloc[:, mutual_info_selected_features]
X_test_mutual_info = X_valid.iloc[:, mutual_info_selected_features]

lr_model = LinearRegression(fit_method="ols", loss_function="rmse")

lr_model.fit(X_train_mutual_info, y_train)

train_pred = lr_model.predict(X_train_mutual_info)
test_pred = lr_model.predict(X_test_mutual_info)

print("Linear Regression: ")
print("Number of used featues: ", len(mutual_info_selected_features))
print("Train mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  26
Train mean squared error:  0.5562532752314526
Validation mean squared error:  0.5582367407746373


## Combined Feature Elimination

In [30]:
# majority voting for feature selection
# mutual_info_mask have 2 vote
# lasso_mask have 1 vote
# corr_2_mask have 1 vote
# variance_2_mask have 1 vote
# vote > 2 -> selected

selected_features = []
for i in range(len(X_train.columns)):
    vote = 0
    if mutual_info_mask[i]:
        vote += 3
    if lasso_mask[i]:
        vote += 1
    if corr_2_mask[i]:
        vote += 1
    if variance_2_mask[i]:
        vote += 1
    if vote > 2:
        selected_features.append(i)

selected_features = np.array(selected_features)
selected_features.shape

  if lasso_mask[i]:
  if corr_2_mask[i]:
  if variance_2_mask[i]:


(202,)

In [31]:
# to numpy array
X_train_final= X_train.iloc[:, selected_features]
X_test_final = X_valid.iloc[:, selected_features]

lr_model = LinearRegression(fit_method="ols", loss_function="rmse")

lr_model.fit(X_train_final, y_train)

train_pred = lr_model.predict(X_train_final)
test_pred = lr_model.predict(X_test_final)

print("Linear Regression: ")
print("Number of used featues: ", len(selected_features))
print("Train mean squared error: ", root_mean_squared_error(y_train, train_pred))
print("Validation mean squared error: ", root_mean_squared_error(y_valid, test_pred))

Linear Regression: 
Number of used featues:  202
Train mean squared error:  0.6242887537202013
Validation mean squared error:  0.6243533341402387
