In [7]:
#libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the Ames housing data (path is relative to the notebook's working directory)
ames = pd.read_csv("data/AmesHousing.csv")

# Column mask: True for every column with fewer than 100 missing values
good_cols = ames.isna().sum().lt(100)

# Keep only those mostly-complete columns, then drop each row that still has a NaN
ames = ames.loc[:, good_cols].dropna()

In [38]:
# Predictors: every column except the target and the Order / PID columns
# (presumably row identifiers -- confirm against the data dictionary)
X = ames.drop(columns=["SalePrice", "Order", "PID"])
# Target: sale price
y = ames["SalePrice"]

# Preprocessing: one-hot encode object-dtype columns, standardize numeric columns,
# and pass any remaining columns through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            # handle_unknown="ignore": a category not seen during fit is encoded
            # as all zeros at transform time instead of raising an error
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Ridge regression: sklearn's `alpha` is the lambda of the penalty formula.
# The alpha set here is only a starting value -- a grid search over
# "ridge_regression__alpha" overrides it for every candidate it evaluates.
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=10)),
    ]
)

In [4]:
# 5-fold cross-validated R^2 of the ridge pipeline (one score per fold).
# Observation: scores are way better when using ridge regression.
cross_val_score(lr_pipeline_1, X, y, scoring="r2", cv=5)

array([0.89815807, 0.91744024, 0.79493606, 0.78522563, 0.91389818])

**Loss Function**

If the model uses many coefficients (betas), a penalty term should be added to the sum of squared residuals (SSR) in the loss function.

The penalty increases as the betas get larger in magnitude, even though larger betas may also make the SSR go lower.

Ridge regression adds this penalty to the loss it minimizes when fitting the model -> the "ridge penalty".

The ridge penalty is calculated from the squared betas: $\lambda \sum_j \beta_j^2$

In [39]:
# Candidate penalty strengths to compare. The "<step name>__<parameter>" key
# routes each value to the alpha of the pipeline's "ridge_regression" step.
lambdas = {
    "ridge_regression__alpha": [0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold cross-validated search over the candidates, scored by R^2
gscv = GridSearchCV(estimator=lr_pipeline_1, param_grid=lambdas, scoring="r2", cv=5)



In [40]:
# Run the grid search on the full data set; the per-candidate results are then
# available under gscv_fitted.cv_results_
gscv_fitted = gscv.fit(X=X, y=y)

In [41]:
# Tabulate each candidate's parameters next to its mean cross-validated R^2
cv_results = gscv_fitted.cv_results_
params_df = pd.DataFrame(cv_results["params"])
results_df = params_df.assign(scores=cv_results["mean_test_score"])

# Best-scoring candidates first
results_df.sort_values("scores", ascending=False)

Unnamed: 0,ridge_regression__alpha,scores
4,10.0,0.864272
3,1.0,0.861932
5,100.0,0.857773
2,0.1,0.856302
1,0.01,0.854186
0,0.001,0.85392


In [48]:
# Coefficients of the best-R^2 ridge model.
# GridSearchCV refits the winning pipeline on all of X, y by default (refit=True),
# so read that model from the search instead of refitting lr_pipeline_1, whose
# hard-coded alpha only equals the best one if it is kept in sync by hand.
pipeline_fit = gscv_fitted.best_estimator_
ridge_coef = pipeline_fit.named_steps["ridge_regression"].coef_
ridge_coef

array([-4.90649460e+03,  2.34694314e+03, -1.22917740e+03,  4.62091189e+03,
        2.20428500e+03, -3.03646802e+03, -6.17531422e+03,  6.17531422e+03,
        2.12575125e+03,  7.53147205e+03, -1.22865537e+04,  2.62933036e+03,
       -1.10363100e+04,  9.09887001e+03, -1.49378054e+03,  3.43122056e+03,
        2.93687893e+03, -2.42162664e+03, -5.15252289e+02,  1.43157683e+02,
        7.56730650e+03, -5.62761832e+03, -2.75896133e+03,  6.76115464e+02,
       -3.78227675e+02,  5.60816320e+03, -5.22993552e+03, -3.76824510e+03,
        2.45905383e+03,  5.70332014e+03, -6.25300540e+02, -4.25100049e+03,
       -8.51682634e+03,  9.07661139e+03, -1.55819272e+04, -1.11163614e+04,
        1.72302347e+03,  1.23832549e+04, -6.48871508e+03, -3.70052362e+02,
        1.59468710e+03, -1.01764141e+04, -1.10938115e+04,  4.18711022e+03,
       -1.32553889e+04,  2.75168983e+04,  2.20925274e+04, -9.78994038e+03,
       -7.01890866e+03, -6.59706874e+03, -8.97446647e+03,  9.31639420e+03,
        3.08242659e+04, -

**LASSO**

Same kind of penalty as ridge, except the penalty is calculated from the absolute values of the betas: $\lambda \sum_j |\beta_j|$

Consequence: some betas will be estimated as exactly 0 if the penalty ($\lambda$) is big enough.

In [49]:
# Predictors: every column except the target and the Order / PID columns
# (presumably row identifiers -- confirm against the data dictionary)
X = ames.drop(columns=["SalePrice", "Order", "PID"])
# Target: sale price
y = ames["SalePrice"]

# Preprocessing: one-hot encode object-dtype columns, standardize numeric columns,
# and pass any remaining columns through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            # handle_unknown="ignore": a category not seen during fit is encoded
            # as all zeros at transform time instead of raising an error
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# LASSO regression: sklearn's `alpha` is the lambda of the penalty formula.
# The alpha set here is only a starting value -- a grid search over
# "lasso_regression__alpha" overrides it for every candidate it evaluates.
# max_iter is raised above the default of 1000 because the grid search over this
# pipeline emits coordinate-descent convergence warnings at the default cap;
# if warnings persist, raise it further or loosen `tol`.
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("lasso_regression", Lasso(alpha=100, max_iter=10_000)),
    ]
)

In [50]:
# Candidate penalty strengths to compare. The "<step name>__<parameter>" key
# routes each value to the alpha of the pipeline's "lasso_regression" step.
lambdas = {
    "lasso_regression__alpha": [0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold cross-validated search over the candidates, scored by R^2
gscv = GridSearchCV(estimator=lr_pipeline_1, param_grid=lambdas, scoring="r2", cv=5)



In [51]:
# Run the grid search on the full data set; the per-candidate results are then
# available under gscv_fitted.cv_results_
gscv_fitted = gscv.fit(X=X, y=y)

  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


In [52]:
# Tabulate each candidate's parameters next to its mean cross-validated R^2
cv_results = gscv_fitted.cv_results_
params_df = pd.DataFrame(cv_results["params"])
results_df = params_df.assign(scores=cv_results["mean_test_score"])

# Best-scoring candidates first
results_df.sort_values("scores", ascending=False)

Unnamed: 0,lasso_regression__alpha,scores
5,100.0,0.866931
4,10.0,0.860632
3,1.0,0.857152
2,0.1,0.856618
1,0.01,0.855606
0,0.001,0.855499


In [53]:
# Coefficients of the best-R^2 LASSO model.
# GridSearchCV refits the winning pipeline on all of X, y by default (refit=True),
# so read that model from the search instead of refitting lr_pipeline_1, whose
# hard-coded alpha only equals the best one if it is kept in sync by hand.
pipeline_fit = gscv_fitted.best_estimator_
lasso_coef = pipeline_fit.named_steps["lasso_regression"].coef_
lasso_coef

array([-0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -3.40179046e+03, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  2.91561103e+03, -0.00000000e+00,  0.00000000e+00,
       -1.02299521e+04,  5.27128450e+03, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        5.35462663e+03, -2.63928711e+03, -0.00000000e+00,  1.77446540e+02,
       -0.00000000e+00,  2.11317738e+03, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  2.14107952e+03,  0.00000000e+00,
       -0.00000000e+00,  1.41381624e+04, -7.39480667e+03, -1.20939271e+03,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -6.88307419e+02, -3.10766828e+03,  0.00000000e+00,
       -4.43798790e+03,  3.63801042e+04,  2.61893814e+04, -2.59146809e+03,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  1.40936035e+04,
        3.79963928e+04,  

**Elastic Nets**

Includes both the ridge (squared) and LASSO (absolute value) penalties when calculating the total penalty

where the overall penalty strength is still `alpha` ($\lambda$)

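and `l1_ratio` ($a$) sets the mix between the first-degree (absolute value) penalty and the second-degree (squared) penalty: `l1_ratio = 1` is a pure LASSO penalty and `l1_ratio = 0` is a pure ridge penalty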

In [27]:
# Predictors: every column except the target and the Order / PID columns
# (presumably row identifiers -- confirm against the data dictionary)
X = ames.drop(columns=["SalePrice", "Order", "PID"])
# Target: sale price
y = ames["SalePrice"]

# Preprocessing: one-hot encode object-dtype columns, standardize numeric columns,
# and pass any remaining columns through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            # handle_unknown="ignore": a category not seen during fit is encoded
            # as all zeros at transform time instead of raising an error
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Elastic net: sklearn's `alpha` is the lambda of the penalty formula and
# `l1_ratio` is the mix between the absolute-value and squared penalties.
# The values set here are only starting values -- a grid search over
# "elastic_regression__alpha" / "elastic_regression__l1_ratio" overrides them
# for every candidate it evaluates.
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("elastic_regression", ElasticNet(alpha=0.01, l1_ratio=0.5)),
    ]
)

In [33]:
# Candidate penalty strengths and L1/L2 mixes to compare (every combination is
# tried). The "<step name>__<parameter>" keys route each value to the
# pipeline's "elastic_regression" step.
values = {
    "elastic_regression__alpha": [0.001, 0.01, 0.1, 1, 10, 100],
    "elastic_regression__l1_ratio": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}

# 5-fold cross-validated search over the grid, scored by R^2
gscv = GridSearchCV(estimator=lr_pipeline_1, param_grid=values, scoring="r2", cv=5)



In [None]:
# Run the grid search on the full data set; the per-candidate results are then
# available under gscv_fitted.cv_results_
gscv_fitted = gscv.fit(X=X, y=y)

In [35]:
# Tabulate each candidate's parameters next to its mean cross-validated R^2
cv_results = gscv_fitted.cv_results_
params_df = pd.DataFrame(cv_results["params"])
results_df = params_df.assign(scores=cv_results["mean_test_score"])

# Best-scoring candidates first
results_df.sort_values("scores", ascending=False)

Unnamed: 0,elastic_regression__alpha,elastic_regression__l1_ratio,scores
13,0.01,0.5,0.864268
14,0.01,0.6,0.86426
12,0.01,0.4,0.864214
15,0.01,0.7,0.864157
11,0.01,0.3,0.864119
10,0.01,0.2,0.863994
16,0.01,0.8,0.863888
9,0.01,0.1,0.863848
26,0.1,0.9,0.863687
17,0.01,0.9,0.863194


In [None]:
# Coefficients of the elastic-net pipeline refit on all of X, y.
# The pipeline's hard-coded alpha / l1_ratio are the values that scored best in
# the grid search, so this is the best-R^2 model as long as they stay in sync.
pipeline_fit = lr_pipeline_1.fit(X, y)
elastic_coef = pipeline_fit.named_steps["elastic_regression"].coef_
# Display the coefficients (the original referenced the misspelled name
# `elstic_coef`, which raised NameError)
elastic_coef