In [63]:
#libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [64]:
#data
baseball = pd.read_csv('data/Hitters.csv')

In [65]:
#Although it isn’t listed as a specific question, don’t forget to clean your data at the beginning. 
#How will you handle missing data?
#Are there any variables that need adjusting?

#clean data
#find number of NA's per column
na_counts = baseball.isna().sum()
print(na_counts)

#only NA's are salary
#for now will choose to remove the observations where salary is NA
baseball = baseball.dropna()

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64
AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64


**Part I: Different Model Specs**

**A. Regression without regularization**

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary linear regression

2. Fit this pipeline to the full dataset, and interpret a few of the most important coefficients.

3. Use cross-validation to estimate the MSE you would expect if you used this pipeline to predict 1989 salaries.

**#1**

In [66]:

#Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary linear regression

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

In [67]:
# Fit and transform the data
X_1 = ct.fit_transform(X)

# Retrieve feature names
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Create a DataFrame with the transformed data
X_1_df = pd.DataFrame(X_1, columns=all_feature_names)
X_1_df.head()

#all columns are there

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors
0,0.0,1.0,0.0,1.0,0.0,1.0,-0.6029,-0.595675,-0.528551,-1.206112,-0.522063,-0.097527,1.397893,0.346791,0.174373,-0.00292,-0.121671,0.258966,0.435334,1.221499,-0.523191,0.213352
1,1.0,0.0,0.0,1.0,1.0,0.0,0.512542,0.49226,0.729966,0.441515,0.79406,1.609373,-0.9012,-0.452865,-0.409892,-0.076054,-0.415105,-0.19959,0.010373,2.109109,-0.253863,0.819964
2,0.0,1.0,1.0,0.0,0.0,1.0,0.628167,0.73649,0.958788,0.402286,1.026317,-0.189792,0.770868,1.301558,1.318174,1.898565,1.412051,1.572666,0.355654,-0.324661,-0.744179,-0.848219
3,0.0,1.0,1.0,0.0,0.0,1.0,-0.562092,-0.462459,-0.185319,-0.617673,-0.367225,-0.512719,-1.110209,-0.990935,-0.960153,-0.697693,-0.947521,-0.881228,-0.862315,1.840678,-0.543909,-0.696566
4,1.0,0.0,0.0,1.0,1.0,0.0,1.294712,1.358167,-0.871783,0.755349,-0.01884,-0.282057,0.770868,0.766993,0.634985,-0.61237,0.422846,0.017294,-0.251434,-0.031177,2.087225,2.488147


**#2**

In [68]:
# Fit the pipeline to the full dataset
lr_pipeline_1.fit(X, y)

# Get feature names from the ColumnTransformer
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Get coefficients from the linear regression model
coefficients = lr_pipeline_1.named_steps["linear_regression"].coef_

# Create a DataFrame to display feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Coefficient": coefficients
})

# Display the top coefficients
coeff_df


Unnamed: 0,Feature,Coefficient
0,League_A,-31.299712
1,League_N,31.299712
2,Division_E,58.424623
3,Division_W,-58.424623
4,NewLeague_A,12.381163
5,NewLeague_N,-12.381163
6,AtBat,-291.094556
7,Hits,337.830479
8,HmRun,37.853837
9,Runs,-60.572479


Some of the most important coefficients in this case, we see that we have all the categorical options for all the dummy variables we converted. So in order to accurately estimate that we would find the total amount between the two groups.

For instance for league_N and league_A: A player's salary is estimated to make 31.299712+31.299712=62.599424 thousand MORE if they play in the A league versus the N league. This would work the same for the NewLeague dummy variable as well as the Division dummy variable.

The highest coefficients we see in our models are CRuns: 480, Hits: 337 and CAtBat: -391

CRuns: For every 1 standard deviation increase in number of home runs during a player's career, there is an estimated 480 (thousand) increase in that players salary

Hits: For every 1 standard deviation increase in the player's number of hits in 1986, there is an estimated 337 (thousand) increase in that players salary

CAtBat: For every 1 standard deviation increase in the player's number of times at bar during their career, there is an estimated -391 (thousand) DECREASE in that players salary

**#3**

In [69]:
#Use cross-validation to estimate the MSE you would expect if you used this pipeline to predict 1989 salaries.

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.33940791514294444  mse:  121556.06500734284
r2:  0.33940791514294444  mse:  121556.06500734284


**B. Ridge regression**

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

2. Use cross-validation to tune the lambda hyperparameter.

3. Fit the pipeline with your chosen to the full dataset, and interpret a few of the most important coefficients.

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

**#1**

In [70]:
#Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("ridge_regression", Ridge(alpha = 1))]
)

**#2**

In [71]:
#Use cross-validation to tune the hyperparameter.

#tune lambdas to test multiple vals
lambdas = {"ridge_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_1, lambdas, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False)

Unnamed: 0,ridge_regression__alpha,scores
5,100.0,0.385012
4,10.0,0.368328
3,1.0,0.355767
2,0.1,0.347675
1,0.01,0.344084
0,0.001,0.343556


**#3**

In [72]:
#Fit the pipeline with your chosen to the full dataset, and interpret a few of the most important coefficients.

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_linear = Pipeline(
  [("preprocessing", ct),
  #changed alpha = 100
  ("ridge_regression", Ridge(alpha = 100))]
)

In [73]:
#output all variables coeff.
# Fit the pipeline to the full dataset
lr_pipeline_linear.fit(X, y)

# Get feature names from the ColumnTransformer
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Get coefficients from the linear regression model
coefficients = lr_pipeline_linear.named_steps["ridge_regression"].coef_

# Create a DataFrame to display feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Coefficient": coefficients
})

# Display the top coefficients
coeff_df

Unnamed: 0,Feature,Coefficient
0,League_A,-11.051842
1,League_N,11.051842
2,Division_E,38.023222
3,Division_W,-38.023222
4,NewLeague_A,-4.09159
5,NewLeague_N,4.09159
6,AtBat,-0.56737
7,Hits,49.612386
8,HmRun,-1.464159
9,Runs,29.343263


The main changes we see are that a lot of the very large coefficients of variables have changed (specifically CRuns, Hits, and CAtBat). Additionally, the coefficient for CAtBat is now positive!

CRuns: For every 1 standard deviation increase in number of home runs during a player's career, there is an estimated 44.53 (thousand) increase in that players salary

Hits: For every 1 standard deviation increase in the player's number of hits in 1986, there is an estimated 49.61 (thousand) increase in that players salary

CAtBat: For every 1 standard deviation increase in the player's number of times at bar during their career, there is an estimated 24.70 (thousand) INCREASE in that players salary)

Overall most of the coefficients of our predictors are lower.

Some of the other most important coefficients now include PutOuts and CRBI

**#4**

In [74]:
#Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.38501221231517163  mse:  120716.43558937623
r2:  0.38501221231517163  mse:  120716.43558937623


**C. Lasso Regression**

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

2. Use cross-validation to tune the alpha hyperparameter.

3. Fit the pipeline with your chosen to the full dataset, and interpret a few of the most important coefficients.

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

**#1**

In [75]:
#Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary lasso regression

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("lasso_regression", Ridge(alpha = 1))]
)

**#2**

In [76]:
#Use cross-validation to tune the hyperparameter.

#tune lambdas to test multiple vals
lambdas = {"lasso_regression__alpha": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_1, lambdas, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False)

Unnamed: 0,lasso_regression__alpha,scores
5,100.0,0.385012
4,10.0,0.368328
3,1.0,0.355767
2,0.1,0.347675
1,0.01,0.344084
0,0.001,0.343556


**#3**

In [77]:
#Fit the pipeline with your chosen to the full dataset, and interpret a few of the most important coefficients.

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_lasso = Pipeline(
  [("preprocessing", ct),
  #changed alpha = 100
  ("lasso_regression", Ridge(alpha = 100))]
)

In [78]:
#output all variables coeff.
# Fit the pipeline to the full dataset
lr_pipeline_lasso.fit(X, y)

# Get feature names from the ColumnTransformer
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Get coefficients from the linear regression model
coefficients = lr_pipeline_lasso.named_steps["lasso_regression"].coef_

# Create a DataFrame to display feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Coefficient": coefficients
})

# Display the top coefficients
coeff_df

Unnamed: 0,Feature,Coefficient
0,League_A,-11.051842
1,League_N,11.051842
2,Division_E,38.023222
3,Division_W,-38.023222
4,NewLeague_A,-4.09159
5,NewLeague_N,4.09159
6,AtBat,-0.56737
7,Hits,49.612386
8,HmRun,-1.464159
9,Runs,29.343263


Like the ridge regressiom pipeline, the main changes we see are that a lot of the very large coefficients of variables have changed (specifically CRuns, Hits, and CAtBat). Additionally, the coefficient for CAtBat is now positive!

The main difference is that the numbers have changed by a little but still no major changes. (With Hits and CAtBat staying the same exact value)

CRuns: For every 1 standard deviation increase in number of home runs during a player's career, there is an estimated 45.50 (thousand) increase in that players salary

Hits: For every 1 standard deviation increase in the player's number of hits in 1986, there is an estimated 49.61 (thousand) increase in that players salary

CAtBat: For every 1 standard deviation increase in the player's number of times at bar during their career, there is an estimated 24.70 (thousand) INCREASE in that players salary)

Overall most of the coefficients of our predictors are lower.

Some of the other most important coefficients now include PutOuts and CRBI

**#4**

In [79]:
#Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_lasso, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_lasso, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.38501221231517163  mse:  120716.43558937623
r2:  0.38501221231517163  mse:  120716.43558937623


**D. Elastic Net**

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary elastic net regression

2. Use cross-validation to tune the lambda and alpha hyperparameters.

3. Fit the pipeline with your chosen hyperparameters to the full dataset, and interpret a few of the most important coefficients.

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

**#1**

In [80]:
#Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary elastic net regression

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("elastic_regression", ElasticNet(alpha = .01, l1_ratio = .5))]
)

**#2**

In [81]:
#Use cross-validation to tune the lambda and alpha hyperparameters.

#Use cross-validation to tune the hyperparameter.

#tune lambdas/alphas to test multiple vals
values = {"elastic_regression__alpha": [.001, .01, .1, 1, 10, 100],
          "elastic_regression__l1_ratio": [.1, .2, .3, .4, .5, .6, .7, .8, .9]}

gscv = GridSearchCV(lr_pipeline_1, values, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False)









Unnamed: 0,elastic_regression__alpha,elastic_regression__l1_ratio,scores
28,1.0,0.2,0.386643
29,1.0,0.3,0.386557
27,1.0,0.1,0.386511
30,1.0,0.4,0.386203
31,1.0,0.5,0.385538
32,1.0,0.6,0.384429
44,10.0,0.9,0.383699
33,1.0,0.7,0.382184
34,1.0,0.8,0.378885
35,1.0,0.9,0.374385


**#3**

In [82]:
#Fit the pipeline with your chosen hyperparameters to the full dataset, and interpret a few of the most important coefficients.

X = baseball.drop(["Salary"], axis = 1)
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_elastic = Pipeline(
  [("preprocessing", ct),
  #can now put our custom alpha and l1_ratio that we found in the previous step
  ("elastic_regression", ElasticNet(alpha = 1, l1_ratio = .2))]
)

In [83]:
#output all variables coeff.
# Fit the pipeline to the full dataset
lr_pipeline_elastic.fit(X, y)

# Get feature names from the ColumnTransformer
#REFRENCED CHAT GPT TO GET COLUMN NAMES TO MAKE SURE ALL COLS WERE SELECTED
ohe_feature_names = ct.named_transformers_['dummify'].get_feature_names_out(X.select_dtypes(include='object').columns)
num_feature_names = X.select_dtypes(include=np.number).columns
all_feature_names = np.concatenate([ohe_feature_names, num_feature_names])

# Get coefficients from the linear regression model
coefficients = lr_pipeline_elastic.named_steps["elastic_regression"].coef_

# Create a DataFrame to display feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Coefficient": coefficients
})

# Display the top coefficients
coeff_df

Unnamed: 0,Feature,Coefficient
0,League_A,-7.248569
1,League_N,7.248595
2,Division_E,26.103984
3,Division_W,-26.103971
4,NewLeague_A,-4.116523
5,NewLeague_N,4.116546
6,AtBat,12.179419
7,Hits,37.450668
8,HmRun,5.609999
9,Runs,27.011028


The elastic regression with parameters lambda = 1 and l1_ratio = .2 definitely has similiar coefficients to the previous steps (that is because the elastic regression uses a combination of two calculations to find its own penalty formula) Where that ratio of which type is more effective is determined by the l1_ratio.

Because 0 would mean a pure ridge regression and 1 would mean a pure lasso regression

that means that essentially 80% of the penalty weight is on a ridge penalty and 20% is on a lasso penalty. As a result we can see that are values are a lot more similiar to the ridge regression, (even though we used a completely different alpha value)

Some of the most notable coefficient changes from the previous regression (lasso) are that CWalks has tripled. CRBI, CRuns, Walks, Hits, and CHits are all lower.

Interpretation of CWalks output: For every 1 standard deviation increase in a players number of walks in 1986, their salary is expected to increase by 34.56 (thousand)

**#4**

In [84]:
#Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

#Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_elastic, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_elastic, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

#this combination has given us the highest R^2 and lowest MSE compared to all the previous regression with their corresponding hyperparameters

r2:  0.3866433466516487  mse:  121500.81646251371
r2:  0.3866433466516487  mse:  121500.81646251371


**Part II. Variable Selection**

Based on the above results, decide on:

- Which numeric variable is most important.

- Which five numeric variables are most important

- Which categorical variable is most important

For each of the four model specifications, compare the following possible feature sets:

1. Using only the one best numeric variable.

2. Using only the five best variables.

3. Using the five best numeric variables and their interactions with the one best categorical variable.

Report which combination of features and model performed best, based on the validation metric of MSE.

(Note: 
 and 
 must be re-tuned for each feature set.)

**Variable Selection**

1. Most Important Numeric Variable: CRuns

2. Top 5 Most Important Numeric Variables: CRuns, PutOuts, Hits, CRBI, Walks

3. Most Important Categorical Variable: Division

**A. Regression without regularization**

**#1**

In [85]:
#Using only the one best numeric variable.
X = baseball[["CRuns"]]
y = baseball["Salary"]

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.266884498380676  mse:  148807.7834751813
r2:  0.266884498380676  mse:  148807.7834751813


**#2**

In [86]:
#Using only the five best variables.
X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]]
y = baseball["Salary"]

#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.4000670374532286  mse:  119398.40011345323
r2:  0.4000670374532286  mse:  119398.40011345323


**#3**

In [120]:
#Using the five best numeric variables and their interactions with the one best categorical variable.

X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks", "Division"]]
y = baseball["Salary"]


ct = ColumnTransformer(
  [
    #drop = "first" should make it so that division is not included in the model
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Division"]),
    ("standardize", StandardScaler(), ["CRuns", "PutOuts", "Hits", "CRBI", "Walks"]),
  ],
  remainder = "drop"
)

#interaction terms
ct_inter = ColumnTransformer(
  [
    #i think this part makes it so that division by itself is not included in the model
    ("interaction_CRuns_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRuns", "dummify__Division_E"]),
    #("interaction_PutOuts_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__PutOuts", "dummify__Division_E"]),
    #("interaction_Hits_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Hits", "dummify__Division_E"]),
    #("interaction_CRBI_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__CRBI", "dummify__Division_E"]),
    #("interaction_Walks_Division", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Walks", "dummify__Division_E"]),
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")


lr_pipeline_linear = Pipeline(
  [("preprocessing", ct),
   ("interaction", ct_inter),
  ("linear_regression", LinearRegression())]
)


In [121]:
#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/__init__.py", line 482, in _get_column_indices
    all_columns = X.columns
                  ^^^^^^^^^
AttributeError: 'numpy.ndarray' object has no attribute 'columns'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 906, in fit_transform
    self._validate_column_callables(X)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 496, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/__init__.py", line 484, in _get_column_indices
    raise ValueError(
ValueError: Specifying the columns using strings is only supported for dataframes.


In [122]:
# Define feature matrix and target variable
X = baseball[["CRuns", "PutOuts", "Hits", "CRBI", "Walks", "Division"]]
y = baseball["Salary"]

# Preprocessing with automatic column selection
ct = ColumnTransformer(
    [
        # One-hot encode the categorical variable
        ("dummify", OneHotEncoder(sparse_output=False, drop="first"), make_column_selector(dtype_include="object")),
        
        # Standardize numeric features
        ("standardize", StandardScaler(), make_column_selector(dtype_include="number"))
    ],
    remainder="drop"
)

# Full pipeline with PolynomialFeatures for interaction terms
lr_pipeline_linear = Pipeline([
    ("preprocessing", ct),
    #interactions?
    ("interaction", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("linear_regression", LinearRegression())
])

R^2:  0.42731129570461857
MSE:  105677.17614174394
R^2:  0.42731129570461857
MSE:  105677.17614174394


In [123]:
#score how well the model did
#R^2
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='r2')
r2 = scores.mean()

#MSE
scores = cross_val_score(lr_pipeline_linear, X, y, cv=5, scoring='neg_mean_squared_error')
mse = scores.mean()*-1

print("r2: ", r2, " mse: ", mse)

r2:  0.42731129570461857  mse:  105677.17614174394
r2:  0.42731129570461857  mse:  105677.17614174394


**B. Ridge regression**

**#1**

**#2**

**#3**

**C. Lasso Regression**

**#1**

**#2**

**#3**

**D. Elastic Net**

**#1**

**#2**

**#3**