In [185]:
from sklearn.linear_model import lasso_path
from sklearn.linear_model import LinearRegression
import pandas as pd
import GA.GA as GA
import numpy as np

In [186]:
# Load your data with the appropriate delimiter (e.g., comma)
file_path = 'data/baseball.dat'
try:
    data = pd.read_csv(file_path, delim_whitespace=True)
except Exception as e:
    print("Error loading file with comma as delimiter:", e)

target_variable = 'salary'

In [187]:
variables = [col for col in data.columns if col != target_variable]
X = data[variables]
y = data[target_variable]

# Normalize data due to using LASSO
X_norm = X / X.std(axis=0)
y_norm = y / y.std(axis=0)

In [188]:
eps = 0.001
print("Computing regularization path using the lasso...")
alphas_lasso, coefs_lasso, _ = lasso_path(X_norm, y_norm, eps=eps)

Computing regularization path using the lasso...


In [233]:
# Note: now that we found variables selection with LASSO, the linear regression runs on the unormalized data to give accurate residuals.

# Find the linear reg model with the best AIC score:
lin_model_sel_var_AIC = np.zeros(coefs_lasso.shape[1])

for i in range(coefs_lasso.shape[1]):
    selected_variables = np.array(variable_list)[coefs_lasso[:,i]!=0]
    
    # Case: more than zero variables are selected
    if(len(selected_variables) != 0):
        lin_model_sel_var = LinearRegression().fit(X[selected_variables], y)
        
        prediction = lin_model_sel_var.predict(X[selected_variables])
        num_of_variables = len(selected_variables)
        lin_model_sel_var_AIC[i] = GA.AIC(y, prediction, num_of_variables)
    else:
        lin_model_sel_var_AIC[i] = np.inf

# Run the best model config again:
model_no_best_AIC = np.argmin(lin_model_sel_var_AIC)
selected_variables_best_AIC = np.array(variable_list)[coefs_lasso[:,model_no_best_AIC]!=0]
lin_model_sel_var_best_AIC = LinearRegression().fit(X[selected_variables], y)

print('Selected Variables for the Model with the best AIC:')
print(selected_variables_best_AIC)
print(len(selected_variables_best_AIC))

print('\nCoefficients:')
print(lin_model_sel_var_best_AIC.coef_)

print('\nAIC for the above model:')
print(np.min(lin_model_sel_var_AIC))


Selected Variables for the Model with the best AIC:
['obp' 'runs' 'homeruns' 'rbis' 'sos' 'freeagent' 'arbitration' 'hrsperso'
 'soserrors' 'sbsruns']
10

Coefficients:
[-5.10482056e+02  6.84859621e+00  1.72649442e+00 -1.82908407e+01
  2.75154345e+01  1.51104214e+01  7.79161502e+00 -1.20940203e+01
 -9.31388156e+00  1.27349272e+03  8.30232082e+02 -3.50884397e+01
 -3.29297428e+02 -2.44716855e+01 -1.04801270e-01  3.38279426e+01
  9.30128817e-03]

AIC for the above model:
4423.143405869015


In [234]:
# Extra Effort: we use a greedy algorithm starting with all factors and iteratively kicking out the factors with the smallest p-value.

import statsmodels.api as sm

X_new = sm.add_constant(X)
greedy_aic = np.zeros(X_new.shape[1]-1)
greedy_var_to_keep = []

for i in range(X_new.shape[1]-1):
    fitted_model = sm.OLS(y, X_new).fit()

    # Find highest p_val
    var_max_pval = np.argmax(fitted_model.pvalues)
    column_names_list = X_new.columns.tolist()
    greedy_var_to_keep.append(column_names_list)

    # Get Model AIC score
    greedy_aic[i] = GA.AIC(y,
                           fitted_model.predict(X_new),
                           len(column_names_list))

    # Optinize greedily: kick out variables with the highest p_val
    X_new = X_new.drop(column_names_list[var_max_pval], axis=1)

print('Selected Variables for the Greedy Model with the best AIC:')
print(greedy_var_to_keep[np.argmin(greedy_aic)])
print(len(greedy_var_to_keep[np.argmin(greedy_aic)]))

print('\nAIC for the best greedy model:')
print(np.min(greedy_aic))


Selected Variables for the Greedy Model with the best AIC:
['runs', 'hits', 'rbis', 'walks', 'sos', 'sbs', 'freeagent', 'arbitration', 'hitsperso', 'hrsperso', 'rbisperso', 'walksperso', 'hrspererror', 'soserrors', 'sbsobp']
15

AIC for the best greedy model:
4418.290953764176
