In [10]:
import pandas as pd
# Read the Hitters table from its CSV file, then build a cleaned copy that
# keeps only the rows with no missing value in any column. The raw frame
# stays available as `data`.
hitters_csv_path = "/Users/ethanschultz/Documents/GSB 544/Week-7/Hitters.csv"
data = pd.read_csv(hitters_csv_path)
data_clean = data.dropna(how="any")

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np


In [12]:
# Separate predictors from the response: every column except "Salary" is a
# feature, and "Salary" is the regression target.
X = data_clean.drop(columns="Salary")
y = data_clean["Salary"]

# Hold out a test set (train_test_split's default test share is 25%).
# random_state pins the shuffle so the split -- and every coefficient fitted
# on X_train below -- is identical after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [27]:

# Step 1: Identify numeric and categorical columns
numeric_features = ["AtBat", "Hits", "HmRun", "Runs", "RBI", "Walks", "Years", 
                    "CAtBat", "CHits", "CHmRun", "CRuns", "CRBI", "CWalks", 
                    "PutOuts", "Assists", "Errors"]
categorical_features = ["League", "Division", "NewLeague"]

# Step 2: Set up preprocessor with ColumnTransformer
# Numeric columns are standardized and categorical columns are one-hot
# encoded; any column not listed above is passed through untouched.
# NOTE(review): OneHotEncoder() keeps one dummy per category level, so the
# dummies of each categorical column sum to 1 and are collinear with the
# intercept. Consider OneHotEncoder(drop='if_binary') if individually
# interpretable dummy coefficients are needed.
# This object is reused by the Ridge / Lasso / Elastic Net pipelines below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

# Step 3: Create pipeline with preprocessor and regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear regression', LinearRegression())
])

# Step 4: Fit model to the training split (X_train / y_train come from the
# train_test_split cell above)
pipeline.fit(X_train, y_train)

# Print model intercept
print("Intercept:", pipeline.named_steps['linear regression'].intercept_)

# Get feature names after transformation.
# Read them from the preprocessor step of the *fitted pipeline* (as the
# Ridge / Lasso / Elastic Net cells do) instead of the free-standing
# `preprocessor` variable, so the names are guaranteed to come from the same
# fitted transformer that produced the columns the coefficients refer to.
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Create a DataFrame to display coefficients with their feature names
coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': pipeline.named_steps['linear regression'].coef_
})
print(coefficients_df)

# Step 5: Cross-validation for MSE estimation
# cross_val_score fits clones of the pipeline on each of the 5 folds, so the
# fit above is not reused here. The scorer returns *negated* MSE, hence the
# sign flip below.
# NOTE(review): the folds are drawn from the full X / y (including the rows
# held out in X_test), while the coefficients above come from X_train only.
mse_scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=5)
mean_mse = -np.mean(mse_scores)
print("Estimated MSE:", mean_mse)

Intercept: 543.0863384769973
             Feature  Coefficient
0         num__AtBat  -252.248587
1          num__Hits   278.259420
2         num__HmRun    89.959563
3          num__Runs   -33.121864
4           num__RBI   -57.122311
5         num__Walks   121.843818
6         num__Years    39.290831
7        num__CAtBat  -709.402255
8         num__CHits   265.874154
9        num__CHmRun  -105.094672
10        num__CRuns   466.340191
11         num__CRBI   409.194392
12       num__CWalks  -200.043156
13      num__PutOuts    93.019199
14      num__Assists    69.009659
15       num__Errors   -11.749752
16     cat__League_A   -14.580296
17     cat__League_N    14.580296
18   cat__Division_E    47.464125
19   cat__Division_W   -47.464125
20  cat__NewLeague_A     9.762672
21  cat__NewLeague_N    -9.762672
Estimated MSE: 121136.31031816886
Intercept: 543.0863384769973
             Feature  Coefficient
0         num__AtBat  -252.248587
1          num__Hits   278.259420
2         num__HmRun    

In [20]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Ridge regression: same preprocessing as the linear model, followed by an
# L2-penalised regressor whose alpha is chosen by grid search.
ridge_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("ridge", Ridge()),
    ]
)

# Candidate penalty strengths to try.
param_grid = {"ridge__alpha": [0.1, 1.0, 10, 100, 1000]}

# 5-fold grid search scored by negated MSE, run on the full X / y.
ridge_cv = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
ridge_cv.fit(X, y)

# Keep the winning pipeline; best_score_ is a negated MSE, so flip its sign
# before reporting it.
ridge_best_model = ridge_cv.best_estimator_
ridge_mse = -ridge_cv.best_score_
print("Best alpha for Ridge:", ridge_cv.best_params_)
print("Estimated MSE for Ridge:", ridge_mse)

# Pair each transformed feature name with the coefficient the best model
# assigned to it.
feature_names = ridge_best_model.named_steps["preprocessor"].get_feature_names_out()
coefficients_df = pd.DataFrame(
    {
        "Feature": feature_names,
        "Coefficient": ridge_best_model.named_steps["ridge"].coef_,
    }
)
coefficients_df

Best alpha for Ridge: {'ridge__alpha': 1.0}
Estimated MSE for Ridge: 119144.43267691608
Best alpha for Ridge: {'ridge__alpha': 1.0}
Estimated MSE for Ridge: 119144.43267691608


Unnamed: 0,Feature,Coefficient
0,num__AtBat,-270.686441
1,num__Hits,296.64505
2,num__HmRun,18.100592
3,num__Runs,-29.339406
4,num__RBI,-9.113295
5,num__Walks,124.407173
6,num__Years,-38.667748
7,num__CAtBat,-225.406548
8,num__CHits,126.659607
9,num__CHmRun,39.070924


In [29]:
from sklearn.linear_model import Lasso

# Step 1: Lasso pipeline -- shared preprocessing followed by an L1-penalised
# regressor. max_iter is set to 10000 on the Lasso estimator.
lasso_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("lasso", Lasso(max_iter=10000)),
    ]
)

# Candidate penalty strengths to try.
param_grid = {"lasso__alpha": [0.001, 0.01, 0.1, 1.0, 10]}

# 5-fold grid search scored by negated MSE, run on the full X / y.
lasso_cv = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
lasso_cv.fit(X, y)

# Keep the winning pipeline; best_score_ is a negated MSE, so flip its sign
# before reporting it.
lasso_best_model = lasso_cv.best_estimator_
lasso_mse = -lasso_cv.best_score_
print("Best alpha for Lasso:", lasso_cv.best_params_)
print("Estimated MSE for Lasso:", lasso_mse)

# Step 2: Interpret the best model -- pair each transformed feature name with
# its fitted coefficient.
feature_names = lasso_best_model.named_steps["preprocessor"].get_feature_names_out()
coefficients_df = pd.DataFrame(
    {
        "Feature": feature_names,
        "Coefficient": lasso_best_model.named_steps["lasso"].coef_,
    }
)
print(coefficients_df)

Best alpha for Lasso: {'lasso__alpha': 1.0}
Estimated MSE for Lasso: 119758.22781528854
             Feature   Coefficient
0         num__AtBat -2.823696e+02
1          num__Hits  3.043583e+02
2         num__HmRun  1.112715e+01
3          num__Runs -2.496605e+01
4           num__RBI -0.000000e+00
5         num__Walks  1.206948e+02
6         num__Years -3.494751e+01
7        num__CAtBat -1.626441e+02
8         num__CHits  0.000000e+00
9        num__CHmRun  1.422286e+01
10        num__CRuns  3.755650e+02
11         num__CRBI  1.926164e+02
12       num__CWalks -1.896431e+02
13      num__PutOuts  7.876026e+01
14      num__Assists  4.199666e+01
15       num__Errors -1.847942e+01
16     cat__League_A -3.582608e+01
17     cat__League_N  8.673601e-15
18   cat__Division_E  1.144130e+02
19   cat__Division_W -2.055233e-11
20  cat__NewLeague_A  0.000000e+00
21  cat__NewLeague_N -0.000000e+00
Best alpha for Lasso: {'lasso__alpha': 1.0}
Estimated MSE for Lasso: 119758.22781528854
             Featur

In [30]:
from sklearn.linear_model import ElasticNet

# Elastic Net pipeline -- shared preprocessing followed by a regressor that
# mixes L1 and L2 penalties. max_iter is set to 10000 on the estimator.
elastic_net_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("elasticnet", ElasticNet(max_iter=10000)),
    ]
)

# Grid over the overall penalty strength (alpha) and the L1 share of the
# penalty (l1_ratio); every combination is evaluated.
param_grid = {
    "elasticnet__alpha": [0.001, 0.01, 0.1, 1.0, 10],
    "elasticnet__l1_ratio": [0.2, 0.5, 0.8],
}

# 5-fold grid search scored by negated MSE, run on the full X / y.
elastic_net_cv = GridSearchCV(
    estimator=elastic_net_pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
elastic_net_cv.fit(X, y)

# Keep the winning pipeline; best_score_ is a negated MSE, so flip its sign
# before reporting it.
elastic_net_best_model = elastic_net_cv.best_estimator_
elastic_net_mse = -elastic_net_cv.best_score_
print("Best alpha and l1_ratio for Elastic Net:", elastic_net_cv.best_params_)
print("Estimated MSE for Elastic Net:", elastic_net_mse)

# Step 2: Interpret the best model -- pair each transformed feature name with
# its fitted coefficient.
feature_names = elastic_net_best_model.named_steps["preprocessor"].get_feature_names_out()
coefficients_df = pd.DataFrame(
    {
        "Feature": feature_names,
        "Coefficient": elastic_net_best_model.named_steps["elasticnet"].coef_,
    }
)

coefficients_df


Best alpha and l1_ratio for Elastic Net: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.8}
Estimated MSE for Elastic Net: 118994.44721070088
Best alpha and l1_ratio for Elastic Net: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.8}
Estimated MSE for Elastic Net: 118994.44721070088


Unnamed: 0,Feature,Coefficient
0,num__AtBat,-184.994814
1,num__Hits,200.550645
2,num__HmRun,-3.276012
3,num__Runs,9.357995
4,num__RBI,9.378484
5,num__Walks,97.55504
6,num__Years,-52.179396
7,num__CAtBat,-56.856299
8,num__CHits,110.750979
9,num__CHmRun,59.255198
