In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score

In [None]:
# Load the Ames housing data from the CSV next to the notebook
ames = pd.read_csv("AmesHousing.csv")

# Flag the columns that have fewer than 100 missing values
good_cols = ames.isna().sum().lt(100)

# Keep only the flagged columns, then discard every row that still has an NA
ames = ames.loc[:, good_cols].dropna()

In [None]:
# Predictors are every column except the target and the Order / PID columns
X = ames.drop(columns=["SalePrice", "Order", "PID"])
y = ames["SalePrice"]

# Preprocessing: one-hot encode object-dtype columns (categories not seen
# during fit are encoded as all zeros) and standardize the numeric columns;
# any column matched by neither selector is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Preprocessing followed by an ordinary least-squares regression
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)

In [None]:
# 5-fold cross-validated R^2 for the ordinary least-squares pipeline
cross_val_score(estimator=lr_pipeline_1, X=X, y=y, scoring="r2", cv=5)

array([-2.59303720e+21, -1.13145211e+19, -7.57138616e+20, -4.47669752e+18,
       -2.55949915e+20])

In [None]:
# Ridge regression: same predictors and preprocessing, L2-penalized estimator
X = ames.drop(columns=["SalePrice", "Order", "PID"])
y = ames["SalePrice"]

# One-hot encode object-dtype columns and standardize the numeric ones;
# columns matched by neither selector are passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Preprocessing followed by Ridge with penalty strength alpha = 1
lr_pipeline_2 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", Ridge(alpha=1)),
    ]
)

In [None]:
# 5-fold cross-validated R^2 for the ridge pipeline (alpha = 1)
cross_val_score(estimator=lr_pipeline_2, X=X, y=y, scoring="r2", cv=5)

array([0.89815807, 0.91744024, 0.79493606, 0.78522563, 0.91389818])

In [None]:
# Ridge pipeline whose estimator step is named "ridge_regression", so its
# parameters can be addressed as "ridge_regression__<param>"
ridge_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=1)),
    ]
)

# Fit on the full data set
ridge_pipeline.fit(X, y)

# Coefficient vector of the fitted ridge step, looked up by step name
ridge_coefficients = ridge_pipeline["ridge_regression"].coef_

[-5.58514707e+03  1.27959973e+03 -5.46571776e+03  7.87614164e+03
  3.04609538e+03 -1.15097192e+03 -9.33396395e+03  9.33396395e+03
  1.22549207e+03  7.29888520e+03 -1.08647168e+04  2.34033949e+03
 -8.75436678e+03  9.01003491e+03 -3.97712861e+03  3.72146049e+03
  9.41980874e+03 -8.79255885e+03 -6.27249885e+02  8.17535896e+02
  8.01641445e+03 -5.69763865e+03 -3.98756032e+03  8.51248627e+02
  8.79698364e+02  7.61644748e+03 -8.49614584e+03 -3.15970613e+03
  4.27236551e+03  7.88847839e+03 -5.86113592e+03 -9.15806934e+03
 -1.14550380e+04  6.03134279e+03 -2.05039767e+04 -1.35486631e+04
  5.21751589e+03  6.65306858e+04 -1.21001139e+04 -2.25857464e+03
  8.65313918e+02 -1.52143284e+04 -1.58503469e+04  9.55308503e+03
 -1.82636881e+04  2.78380350e+04  2.24166992e+04 -1.59147424e+04
 -1.30475803e+04 -1.14575200e+04 -1.24710470e+04  1.07458866e+04
  3.69996740e+04 -8.28754419e+03 -9.80700715e+03 -2.58169293e+03
 -3.02692451e+03  6.63253628e+03  7.69143058e+03  1.16940109e+04
 -6.49024359e+03  8.62421

In [None]:
# GridSearchCV is imported in the notebook's import cell rather than here, so
# all dependencies are declared in one place.

# Candidate ridge penalties. The key follows the "<step name>__<parameter>"
# convention and targets alpha of the "ridge_regression" pipeline step.
param_grid = {'ridge_regression__alpha': [0.001, 0.01, 0.1, 1, 10]}

# 5-fold grid search scored by R^2; n_jobs=-1 runs the fits on all cores
grid_search = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X, y)

In [None]:
# Columns of the grid-search report worth showing: the alpha that was tried,
# the mean and spread of the fold scores, and the resulting rank
display_columns = [
    'param_ridge_regression__alpha',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]

# One row per candidate alpha
cv_results = pd.DataFrame(grid_search.cv_results_)
print(cv_results.loc[:, display_columns])


   param_ridge_regression__alpha  mean_test_score  std_test_score  \
0                          0.001         0.853920        0.060278   
1                          0.010         0.854186        0.060280   
2                          0.100         0.856302        0.060250   
3                          1.000         0.861932        0.059104   
4                         10.000         0.864272        0.058157   

   rank_test_score  
0                5  
1                4  
2                3  
3                2  
4                1  


In [None]:
# Using Lasso() (L1-penalized regression) with the same predictors and
# preprocessing as the earlier models
X = ames.drop(["SalePrice", "Order", "PID"], axis = 1)
y = ames["SalePrice"]


# One-hot encode object-dtype columns and standardize the numeric ones;
# columns matched by neither selector are passed through unchanged.
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

# The recorded run of this model emitted a warning from the coordinate-descent
# solver at the default iteration cap, so the cap is raised here.
# NOTE(review): increase max_iter further if the warning still appears.
lr_pipeline_3 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", Lasso(alpha=1, max_iter=10_000))]
)

In [None]:
# 5-fold cross-validated R^2 for the lasso pipeline (alpha = 1)
cross_val_score(estimator=lr_pipeline_3, X=X, y=y, scoring="r2", cv=5)

  model = cd_fast.enet_coordinate_descent(


array([0.89774385, 0.91093785, 0.79691806, 0.77426245, 0.90589888])

In [None]:
# Lasso pipeline whose estimator step is named "lasso_regression", so its
# parameters can be addressed as "lasso_regression__<param>".
# max_iter is raised above the default because the recorded lasso runs in this
# notebook emitted a warning from the coordinate-descent solver.
# NOTE(review): increase max_iter further if the warning still appears.
lasso_pipeline = Pipeline(
    [("preprocessing", ct), ("lasso_regression", Lasso(alpha=1, max_iter=10_000))]
)

# Fit on the full data set
lasso_pipeline.fit(X, y)

# Coefficient vector of the fitted lasso step
lasso_coefficients = lasso_pipeline.named_steps['lasso_regression'].coef_

print(lasso_coefficients)

Lasso Coefficients: [-4.81652839e+03  1.22724439e+03 -4.41885541e+03  7.51982576e+03
  3.53293929e+03 -1.15175663e+03 -2.04473222e+04  3.44113829e-09
 -1.41031927e+03  4.70209827e+03  4.83124102e+02 -1.26904092e+01
 -9.03194952e+03  5.48345937e+03 -1.09548228e+04  1.19715987e+02
  1.48059948e+04 -2.46716222e+03  0.00000000e+00  2.89970113e+02
  6.47083982e+03 -6.06559741e+03 -4.80860054e+03 -0.00000000e+00
 -0.00000000e+00  6.66914878e+03 -1.63491350e+04  3.88720104e+03
  1.02876243e+04  1.35648346e+04  1.05643345e+03 -4.16387582e+03
 -6.38723226e+03  1.16048567e+04 -1.37388021e+04 -7.33924495e+03
  1.20934300e+04  1.32015734e+05 -5.39197495e+03  0.00000000e+00
  7.56702466e+03 -1.01640833e+04 -1.01751839e+04  1.84286710e+04
 -1.33019305e+04  3.10914289e+04  2.52024813e+04 -9.36511398e+03
 -7.37390650e+03 -5.84233241e+03 -7.33009072e+03  1.60093014e+04
  4.27819615e+04 -4.11202868e+03 -7.19672101e+03 -2.09327946e+03
 -0.00000000e+00  8.09540209e+03  6.61843072e+03  1.43357981e+04
 -4.5

In [None]:
# Candidate lasso penalties, addressed via the "lasso_regression" step name
param_grid = {'lasso_regression__alpha': [0.001, 0.01, 0.1, 1, 10]}

# 5-fold grid search scored by R^2, with the fits spread over all cores
grid_search = GridSearchCV(
    lasso_pipeline,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Report, per candidate alpha, the mean/std of the fold scores and the rank
display_columns = [
    'param_lasso_regression__alpha',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]
cv_results = pd.DataFrame(grid_search.cv_results_)
print(cv_results.loc[:, display_columns])


   param_lasso_regression__alpha  mean_test_score  std_test_score  \
0                          0.001         0.855499        0.060242   
1                          0.010         0.855606        0.060107   
2                          0.100         0.856618        0.059025   
3                          1.000         0.857152        0.059018   
4                         10.000         0.860632        0.059157   

   rank_test_score  
0                5  
1                4  
2                3  
3                2  
4                1  


In [None]:
# Elastic net: combined L1/L2 penalty, tuned over both the overall strength
# (alpha) and the L1 share of the penalty (l1_ratio).
# max_iter is raised above the default because the recorded run emitted
# warnings from the coordinate-descent solver.
# NOTE(review): increase max_iter further if the warning still appears.
elastic_net_pipeline = Pipeline(
    [("preprocessing", ct), ("elastic_net", ElasticNet(max_iter=10_000))]
)

# 5 alphas x 3 l1_ratios = 15 candidates, each evaluated with 5-fold CV
param_grid = {
    "elastic_net__alpha": [0.001, 0.01, 0.1, 1, 10],
    "elastic_net__l1_ratio": [0.2, 0.5, 0.8]
}

grid_search = GridSearchCV(elastic_net_pipeline, param_grid, cv=5, scoring="r2", n_jobs=-1)
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated R^2 score: {grid_search.best_score_:.4f}")

# With the default refit=True, GridSearchCV has already refit the best
# candidate on all of (X, y), so best_estimator_ is used as-is; fitting it
# again would only repeat the same work.
best_elastic_net = grid_search.best_estimator_
elastic_net_coefficients = best_elastic_net.named_steps["elastic_net"].coef_


  model = cd_fast.enet_coordinate_descent(


Best parameters: {'elastic_net__alpha': 0.01, 'elastic_net__l1_ratio': 0.5}
Best cross-validated R^2 score: 0.8643


  model = cd_fast.enet_coordinate_descent(
