In [1]:
#libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
#data
baseball = pd.read_csv('data/Hitters.csv')

In [3]:
#Although it isn’t listed as a specific question, don’t forget to clean your data at the beginning. 
#How will you handle missing data?
#Are there any variables that need adjusting?

#clean data
#find number of NA's per column
na_counts = baseball.isna().sum()
print(na_counts)

#only NA's are salary
#for now will choose to remove the observations where salary is NA
baseball = baseball.dropna()

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64
AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64


**Part I: Different Model Specs**

**A. Regression without regularization**

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary linear regression

2. Fit this pipeline to the full dataset, and interpret a few of the most important coefficients.

3. Use cross-validation to estimate the MSE you would expect if you used this pipeline to predict 1989 salaries.

**#1**

In [4]:

#Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary linear regression

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_linear = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

In [5]:
# Fit and transform the data
X_1 = ct.fit_transform(X)

# Retrieve feature names
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Create a DataFrame with the transformed data
X_1_df = pd.DataFrame(X_1, columns=all_feature_names)
X_1_df.head()

#all columns are there

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors
1,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,,


**#2**

In [6]:
# Fit the pipeline to the full dataset
lr_pipeline_linear.fit(X, y)

# Get feature names from the ColumnTransformer
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Get coefficients from the linear regression model
coefficients = lr_pipeline_linear.named_steps["linear_regression"].coef_

# Create a DataFrame to display feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Coefficient": coefficients
})

# Display the top coefficients
coeff_df


Unnamed: 0,Feature,Coefficient
0,League_A,-31.299712
1,League_N,31.299712
2,Division_E,58.424623
3,Division_W,-58.424623
4,NewLeague_A,12.381163
5,NewLeague_N,-12.381163
6,AtBat,-291.094556
7,Hits,337.830479
8,HmRun,37.853837
9,Runs,-60.572479


Some of the most important coefficients in this case, we see that we have all the categorical options for all the dummy variables we converted. So in order to accurately estimate that we would find the total amount between the two groups.

For instance for league_N and league_A: A player's salary is estimated to make 31.299712+31.299712=62.599424 thousand MORE if they play in the A league versus the N league. This would work the same for the NewLeague dummy variable as well as the Division dummy variable.

The highest coefficients we see in our models are CRuns: 480, Hits: 337 and CAtBat: -391

CRuns: For every 1 standard deviation increase in number of home runs during a player's career, there is an estimated 480 (thousand) increase in that players salary

Hits: For every 1 standard deviation increase in the player's number of hits in 1986, there is an estimated 337 (thousand) increase in that players salary

CAtBat: For every 1 standard deviation increase in the player's number of times at bar during their career, there is an estimated -391 (thousand) DECREASE in that players salary

**#3**

In [7]:
#Use cross-validation to estimate the MSE you would expect if you used this pipeline to predict 1989 salaries.

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.3456645862518122  mse:  120656.57125044877
r2:  0.3456645862518122  mse:  120656.57125044877


**B. Ridge regression**

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

2. Use cross-validation to tune the lambda hyperparameter.

3. Fit the pipeline with your chosen to the full dataset, and interpret a few of the most important coefficients.

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

**#1**

In [8]:
#Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("ridge_regression", Ridge(alpha = 1))]
).set_output(transform="pandas")

**#2**

In [9]:
#Use cross-validation to tune the hyperparameter.

#tune lambdas to test multiple vals
lambdas = {"ridge_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_1, lambdas, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False)

Unnamed: 0,ridge_regression__alpha,scores
5,100.0,0.385012
4,10.0,0.368328
3,1.0,0.355767
2,0.1,0.347675
1,0.01,0.344084
0,0.001,0.343556


**#3**

In [10]:
#Fit the pipeline with your chosen to the full dataset, and interpret a few of the most important coefficients.

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_ridge = Pipeline(
  [("preprocessing", ct),
  #changed alpha = 100
  ("ridge_regression", Ridge(alpha = 100))]
).set_output(transform="pandas")

In [11]:
#output all variables coeff.
# Fit the pipeline to the full dataset
lr_pipeline_ridge.fit(X, y)

# Get feature names from the ColumnTransformer
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Get coefficients from the linear regression model
coefficients = lr_pipeline_ridge.named_steps["ridge_regression"].coef_

# Create a DataFrame to display feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Coefficient": coefficients
})

# Display the top coefficients
coeff_df

Unnamed: 0,Feature,Coefficient
0,League_A,-11.051842
1,League_N,11.051842
2,Division_E,38.023222
3,Division_W,-38.023222
4,NewLeague_A,-4.09159
5,NewLeague_N,4.09159
6,AtBat,-0.56737
7,Hits,49.612386
8,HmRun,-1.464159
9,Runs,29.343263


The main changes we see are that a lot of the very large coefficients of variables have changed (specifically CRuns, Hits, and CAtBat). Additionally, the coefficient for CAtBat is now positive!

CRuns: For every 1 standard deviation increase in number of home runs during a player's career, there is an estimated 44.53 (thousand) increase in that players salary

Hits: For every 1 standard deviation increase in the player's number of hits in 1986, there is an estimated 49.61 (thousand) increase in that players salary

CAtBat: For every 1 standard deviation increase in the player's number of times at bar during their career, there is an estimated 24.70 (thousand) INCREASE in that players salary)

Overall most of the coefficients of our predictors are lower.

Some of the other most important coefficients now include PutOuts and CRBI

**#4**

In [12]:
#Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_ridge, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_ridge, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.3850122123151716  mse:  120716.43558937623
r2:  0.3850122123151716  mse:  120716.43558937623


**C. Lasso Regression**

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

2. Use cross-validation to tune the alpha hyperparameter.

3. Fit the pipeline with your chosen to the full dataset, and interpret a few of the most important coefficients.

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

**#1**

In [13]:
#Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary lasso regression

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("lasso_regression", Lasso(alpha = 1))]
).set_output(transform="pandas")

**#2**

In [14]:
#Use cross-validation to tune the hyperparameter.

#tune lambdas to test multiple vals
lambdas = {"lasso_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_1, lambdas, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,lasso_regression__alpha,scores
4,10.0,0.369523
3,1.0,0.354238
2,0.1,0.346041
1,0.01,0.344394
0,0.001,0.344223
5,100.0,0.298382


**#3**

In [15]:
#Fit the pipeline with your chosen to the full dataset, and interpret a few of the most important coefficients.

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_lasso = Pipeline(
  [("preprocessing", ct),
  #changed alpha = 10
  ("lasso_regression", Lasso(alpha = 10))]
).set_output(transform="pandas")

In [16]:
#output all variables coeff.
# Fit the pipeline to the full dataset
lr_pipeline_lasso.fit(X, y)

# Get feature names from the ColumnTransformer
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Get coefficients from the linear regression model
coefficients = lr_pipeline_lasso.named_steps["lasso_regression"].coef_

# Create a DataFrame to display feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Coefficient": coefficients
})

# Display the top coefficients
coeff_df

Unnamed: 0,Feature,Coefficient
0,League_A,-0.0
1,League_N,0.0
2,Division_E,95.4132
3,Division_W,-3.327948e-12
4,NewLeague_A,-0.0
5,NewLeague_N,0.0
6,AtBat,-0.0
7,Hits,88.74163
8,HmRun,0.0
9,Runs,0.0


**NEED TO REWRITE EXPLANATATION FOR THIS PART**

**#4**

In [17]:
#Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_lasso, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_lasso, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.3695225743376409  mse:  121828.14133338635
r2:  0.3695225743376409  mse:  121828.14133338635


**D. Elastic Net**

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary elastic net regression

2. Use cross-validation to tune the lambda and alpha hyperparameters.

3. Fit the pipeline with your chosen hyperparameters to the full dataset, and interpret a few of the most important coefficients.

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

**#1**

In [18]:
#Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary elastic net regression

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("elastic_regression", ElasticNet(alpha = .01, l1_ratio = .5))]
).set_output(transform="pandas")

**#2**

In [19]:
#Use cross-validation to tune the lambda and alpha hyperparameters.

#Use cross-validation to tune the hyperparameter.

#tune lambdas/alphas to test multiple vals
values = {"elastic_regression__alpha": [.001, .01, .1, 1, 10, 100],
          "elastic_regression__l1_ratio": [.1, .2, .3, .4, .5, .6, .7, .8, .9]}

gscv = GridSearchCV(lr_pipeline_1, values, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False).head()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,elastic_regression__alpha,elastic_regression__l1_ratio,scores
28,1.0,0.2,0.386643
29,1.0,0.3,0.386557
27,1.0,0.1,0.386511
30,1.0,0.4,0.386203
31,1.0,0.5,0.385538


**#3**

In [20]:
#Fit the pipeline with your chosen hyperparameters to the full dataset, and interpret a few of the most important coefficients.

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_elastic = Pipeline(
  [("preprocessing", ct),
  #can now put our custom alpha and l1_ratio that we found in the previous step
  ("elastic_regression", ElasticNet(alpha = 1, l1_ratio = .2))]
).set_output(transform="pandas")

In [21]:
#output all variables coeff.
# Fit the pipeline to the full dataset
lr_pipeline_elastic.fit(X, y)

# Get feature names from the ColumnTransformer
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Get coefficients from the linear regression model
coefficients = lr_pipeline_elastic.named_steps["elastic_regression"].coef_

# Create a DataFrame to display feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Coefficient": coefficients
})

# Display the top coefficients
coeff_df

Unnamed: 0,Feature,Coefficient
0,League_A,-7.248569
1,League_N,7.248595
2,Division_E,26.103984
3,Division_W,-26.103971
4,NewLeague_A,-4.116523
5,NewLeague_N,4.116546
6,AtBat,12.179419
7,Hits,37.450668
8,HmRun,5.609999
9,Runs,27.011028


The elastic regression with parameters lambda = 1 and l1_ratio = .2 definitely has similiar coefficients to the previous steps (that is because the elastic regression uses a combination of two calculations to find its own penalty formula) Where that ratio of which type is more effective is determined by the l1_ratio.

Because 0 would mean a pure ridge regression and 1 would mean a pure lasso regression

that means that essentially 80% of the penalty weight is on a ridge penalty and 20% is on a lasso penalty. As a result we can see that are values are a lot more similiar to the ridge regression, (even though we used a completely different alpha value)

**Some of the most notable coefficient changes from the previous regression (lasso) are that CWalks has tripled. CRBI, CRuns, Walks, Hits, and CHits are all lower.    REWRITE**

Interpretation of CWalks output: For every 1 standard deviation increase in a players number of walks in 1986, their salary is expected to increase by 34.56 (thousand)

**#4**

In [22]:
#Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_elastic, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_elastic, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

#this combination has given us the highest R^2 and lowest MSE compared to all the previous regression with their corresponding hyperparameters

r2:  0.3866433466516488  mse:  121500.81646251371
r2:  0.3866433466516488  mse:  121500.81646251371


**Part II. Variable Selection**

Based on the above results, decide on:

- Which numeric variable is most important.

- Which five numeric variables are most important

- Which categorical variable is most important

For each of the four model specifications, compare the following possible feature sets:

1. Using only the one best numeric variable.

2. Using only the five best variables.

3. Using the five best numeric variables and their interactions with the one best categorical variable.

Report which combination of features and model performed best, based on the validation metric of MSE.

(Note: 
 and 
 must be re-tuned for each feature set.)

**Variable Selection**

1. Most Important Numeric Variable: CRuns

2. Top 5 Most Important Numeric Variables: CRuns, PutOuts, Hits, CRBI, Walks

3. Most Important Categorical Variable: Division

**A. Regression without regularization**

**#1**

In [23]:
#Using only the one best numeric variable.
X = baseball[["CRuns"]]
y = baseball["Salary"]

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.29227548589102115  mse:  143812.9359162973
r2:  0.29227548589102115  mse:  143812.9359162973


**#2**

In [24]:
#Using only the five best variables.
X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]]
y = baseball["Salary"]

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.37722674799137945  mse:  121332.85377811702
r2:  0.37722674799137945  mse:  121332.85377811702


**#3**

In [25]:
#Using the five best numeric variables and their interactions with the one best categorical variable.

X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks", "Division"]]
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    #drop = "first" should make it so that division is not included in the model
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Division"]),
    ("standardize", StandardScaler(), ["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]),
  ],
  remainder = "drop"
)

#interaction terms
ct_inter = ColumnTransformer(
  [
    #i think this part makes it so that division by itself is not included in the model
    ("interaction_CRuns_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRuns", "dummify__Division_W"]),
    ("interaction_PutOuts_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__PutOuts", "dummify__Division_W"]),
    ("interaction_Hits_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Hits", "dummify__Division_W"]),
    ("interaction_CRBI_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRBI", "dummify__Division_W"]),
    ("interaction_Walks_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Walks", "dummify__Division_W"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline_linear = Pipeline(
  [("preprocessing", ct),
   ("interaction", ct_inter),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")



In [26]:
#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.320279729122522  mse:  130401.97016858231
r2:  0.320279729122522  mse:  130401.97016858231


**B. Ridge regression**

**#1**

In [27]:
#retune hyperparameter and score how well the model did (in MSE)

#Using only the one best numeric variable.
X = baseball[["CRuns"]]
y = baseball["Salary"]

#tune lambdas to test multiple vals
lambdas = {"ridge_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_ridge, lambdas, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores']*-1

results_df.sort_values(by = 'scores', ascending = True)



Unnamed: 0,ridge_regression__alpha,scores
4,10.0,143658.517369
3,1.0,143783.706854
2,0.1,143809.862456
1,0.01,143812.627051
0,0.001,143812.905015
5,100.0,148807.783475


**#2**

In [28]:
#retune hyperparameter and score how well the model did (in MSE)

#Using only the five best variables.
X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]]
y = baseball["Salary"]

#tune lambdas to test multiple vals
lambdas = {"ridge_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_ridge, lambdas, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores']*-1

results_df.sort_values(by = 'scores', ascending = True)


Unnamed: 0,ridge_regression__alpha,scores
5,100.0,119398.400113
4,10.0,119969.294948
3,1.0,121103.3535
2,0.1,121307.910944
1,0.01,121330.337137
0,0.001,121332.601888


**#3**

In [29]:
#pipeline
ct = ColumnTransformer(
  [
    #drop = "first" should make it so that division is not included in the model
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Division"]),
    ("standardize", StandardScaler(), ["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]),
  ],
  remainder = "drop"
)

#interaction terms
ct_inter = ColumnTransformer(
  [
    #i think this part makes it so that division by itself is not included in the model
    ("interaction_CRuns_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRuns", "dummify__Division_W"]),
    ("interaction_PutOuts_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__PutOuts", "dummify__Division_W"]),
    ("interaction_Hits_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Hits", "dummify__Division_W"]),
    ("interaction_CRBI_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRBI", "dummify__Division_W"]),
    ("interaction_Walks_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Walks", "dummify__Division_W"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline_ridge = Pipeline(
  [("preprocessing", ct),
   ("interaction", ct_inter),
  #changed alpha = 100
   ("ridge_regression", Ridge(alpha = 100))]
).set_output(transform="pandas")


In [30]:
#retune hyperparameter

#Using the five best numeric variables and their interactions with the one best categorical variable.

X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks", "Division"]]
y = baseball["Salary"]

#tune lambdas to test multiple vals
lambdas = {"ridge_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_ridge, lambdas, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores'] * -1

results_df.sort_values(by = 'scores', ascending = True).head()

Unnamed: 0,ridge_regression__alpha,scores
5,100.0,116972.675912
4,10.0,118503.794116
3,1.0,125192.21212
2,0.1,129543.747309
1,0.01,130310.159254


**C. Lasso Regression**

**#1**

In [31]:
#retune hyperparameter and score how well the model did (in MSE)

#Using only the one best numeric variable.
X = baseball[["CRuns"]]
y = baseball["Salary"]

#tune lambdas to test multiple vals
lambdas = {"lasso_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_lasso, lambdas, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores']*-1

results_df.sort_values(by = 'scores', ascending = True)



Unnamed: 0,lasso_regression__alpha,scores
4,10.0,143793.449159
3,1.0,143801.606084
2,0.1,143811.709122
1,0.01,143812.812299
0,0.001,143812.923545
5,100.0,152999.224797


**#2**

In [32]:
#retune hyperparameter and score how well the model did (in MSE)

#Using only the five best variables.
X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]]
y = baseball["Salary"]

#tune lambdas to test multiple vals
lambdas = {"lasso_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_lasso, lambdas, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores']*-1

results_df.sort_values(by = 'scores', ascending = True)


Unnamed: 0,lasso_regression__alpha,scores
1,0.01,121332.402601
2,0.1,121332.659779
0,0.001,121332.893433
3,1.0,121337.557109
4,10.0,121600.010609
5,100.0,142592.153396


**#3**

In [33]:
ct = ColumnTransformer(
  [
    #drop = "first" should make it so that division is not included in the model
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Division"]),
    ("standardize", StandardScaler(), ["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]),
  ],
  remainder = "drop"
)

#interaction terms
ct_inter = ColumnTransformer(
  [
    #i think this part makes it so that division by itself is not included in the model
    ("interaction_CRuns_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRuns", "dummify__Division_W"]),
    ("interaction_PutOuts_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__PutOuts", "dummify__Division_W"]),
    ("interaction_Hits_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Hits", "dummify__Division_W"]),
    ("interaction_CRBI_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRBI", "dummify__Division_W"]),
    ("interaction_Walks_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Walks", "dummify__Division_W"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline_lasso = Pipeline(
  [("preprocessing", ct),
   ("interaction", ct_inter),
  #found that alpha = 10 resulted in the highest MSE, so changed alpha = 10
  ("lasso_regression", Lasso(alpha = 10))]
).set_output(transform="pandas")


In [34]:
#retune hyperparameter and score how well the model did (in MSE)

#Using the five best numeric variables and their interactions with the one best categorical variable.
X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks", "Division"]]
y = baseball["Salary"]

#tune lambdas to test multiple vals
lambdas = {"lasso_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_lasso, lambdas, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores']*-1

results_df.sort_values(by = 'scores', ascending = True)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,lasso_regression__alpha,scores
4,10.0,119129.330501
3,1.0,126202.590816
2,0.1,129953.501522
1,0.01,130358.354217
0,0.001,130397.415306
5,100.0,142592.153396


**D. Elastic Net**

**#1**

In [35]:
#Using only the one best numeric variable.
X = baseball[["CRuns"]]
y = baseball["Salary"]

#Use cross-validation to tune the lambda and alpha hyperparameters.

#tune lambdas/alphas to test multiple vals
values = {"elastic_regression__alpha": [.001, .01, .1, 1, 10, 100],
          "elastic_regression__l1_ratio": [.1, .2, .3, .4, .5, .6, .7, .8, .9]}

gscv = GridSearchCV(lr_pipeline_elastic, values, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores']*-1

results_df.sort_values(by = 'scores', ascending = True).head()

Unnamed: 0,elastic_regression__alpha,elastic_regression__l1_ratio,scores
22,0.1,0.5,143655.076048
21,0.1,0.4,143660.021761
23,0.1,0.6,143661.305279
20,0.1,0.3,143675.521978
24,0.1,0.7,143679.366378


**#2**

In [36]:
#Using only the five best variables.
X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]]
y = baseball["Salary"]

#tune lambdas/alphas to test multiple vals
values = {"elastic_regression__alpha": [.001, .01, .1, 1, 10, 100],
          "elastic_regression__l1_ratio": [.1, .2, .3, .4, .5, .6, .7, .8, .9]}

gscv = GridSearchCV(lr_pipeline_elastic, values, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores']*-1

results_df.sort_values(by = 'scores', ascending = True).head()

Unnamed: 0,elastic_regression__alpha,elastic_regression__l1_ratio,scores
33,1.0,0.7,118839.350432
34,1.0,0.8,118860.691338
32,1.0,0.6,119107.904838
35,1.0,0.9,119352.498787
18,0.1,0.1,119416.439601


**#3**

In [37]:
ct = ColumnTransformer(
  [
    #drop = "first" should make it so that division is not included in the model
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Division"]),
    ("standardize", StandardScaler(), ["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]),
  ],
  remainder = "drop"
)

#interaction terms
ct_inter = ColumnTransformer(
  [
    #i think this part makes it so that division by itself is not included in the model
    ("interaction_CRuns_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRuns", "dummify__Division_W"]),
    ("interaction_PutOuts_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__PutOuts", "dummify__Division_W"]),
    ("interaction_Hits_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Hits", "dummify__Division_W"]),
    ("interaction_CRBI_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRBI", "dummify__Division_W"]),
    ("interaction_Walks_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Walks", "dummify__Division_W"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

#elastic pipeline
lr_pipeline_elastic = Pipeline(
  [("preprocessing", ct),
   ("interaction", ct_inter),
  #can now put our custom alpha and l1_ratio that we found in the previous step
  ("elastic_regression", ElasticNet(alpha = 1, l1_ratio = .8))]
).set_output(transform="pandas")


In [38]:
#Using the five best numeric variables and their interactions with the one best categorical variable.

X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks", "Division"]]
y = baseball["Salary"]

#tune lambdas/alphas to test multiple vals
values = {"elastic_regression__alpha": [.001, .01, .1, 1, 10, 100],
          "elastic_regression__l1_ratio": [.1, .2, .3, .4, .5, .6, .7, .8, .9]}

gscv = GridSearchCV(lr_pipeline_elastic, values, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df['scores'] = results_df['scores']*-1

results_df.sort_values(by = 'scores', ascending = True).head()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,elastic_regression__alpha,elastic_regression__l1_ratio,scores
34,1.0,0.8,115700.337508
33,1.0,0.7,115902.392141
32,1.0,0.6,116443.528508
35,1.0,0.9,116452.419582
18,0.1,0.1,116925.964283


**Part 2 Conclusion**

From all the combinations of features and models, the best performing one is using an elastic regression with parameters alpha = 1 and l1_ratio = .8.

The variables included in the model were five best numeric variables and their interactions with the one best categorical variable

So in this case it was the following variables: "CRuns", "PutOuts", "Hits", "CRBI", "Walks", "Division"

the MSE value I got for the best model (so the lowest MSE) is 115700.337508

**Part III. Discussion**

**A. Ridge**

Compare your Ridge models with your ordinary regression models. How did your coefficients compare? Why does this make sense?

**B. LASSO**

Compare your LASSO model in I with your three LASSO models in II. Did you get the same results? Why does this make sense? Did you get the same MSEs? Why does this make sense?

**C. Elastic Net**

Compare your MSEs for the Elastic Net models with those for the Ridge and LASSO models. Why does it make sense that Elastic Net always“wins”?

**Part IV: Final Model**

Fit your final best pipeline on the full dataset, and summarize your results in a few short sentences and a plot.

In [47]:
#pipeline

ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

#interaction terms
ct_inter = ColumnTransformer(
  [
    #i think this part makes it so that division by itself is not included in the model
    ("interaction_CRuns_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRuns", "dummify__Division_W"]),
    ("interaction_PutOuts_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__PutOuts", "dummify__Division_W"]),
    ("interaction_Hits_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Hits", "dummify__Division_W"]),
    ("interaction_CRBI_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRBI", "dummify__Division_W"]),
    ("interaction_Walks_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Walks", "dummify__Division_W"]),
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

#elastic pipeline
lr_pipeline_elastic = Pipeline(
  [("preprocessing", ct),
   ("interaction", ct_inter),
  #can now put our custom alpha and l1_ratio that we found in the previous step
  ("elastic_regression", ElasticNet(alpha = 1, l1_ratio = .8))]
).set_output(transform="pandas")


In [48]:
#score how well the model did using every variable as a predictor 

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]

#R^2
scores = cross_val_score(lr_pipeline_elastic, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_elastic, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.3702109391472586  mse:  121574.53101457682
r2:  0.3702109391472586  mse:  121574.53101457682


Overall the pipeline did a decent job when fitting the full dataset. Although the MSE is a little bit higher, this is definitely something that is expected as there is a penalty for every beta we add. The model didn't do aswell as our best model found in part 3, but was definitely comparable (only off by around 3% in terms of R^2). Additionally, if we look back at our pipeline from part 1 (of the elastic net) we can see that it did slightly worse in score (lower by around 1% in terms of R^2). The fit we had for our pipeline was optimized for when we were using the five best numeric variables and their interactions with the one best categorical variable, but now that we have incorporated every variable in our dataset we can see it is not as effecient. We can also see this by the fact that we have different tuned hyperparameters from this pipeline versus the one found in part 1.

In [None]:
#plot