Wrap-Up Script from Module 4: [Linear Models](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_wrap_up_quiz.html)

In [1]:
import numpy as np
import pandas as pd

# Read the Ames housing CSV, then separate the "SalePrice" column (target)
# from every other column (data).
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
target_name = "SalePrice"
target = ames_housing[target_name]
data = ames_housing.drop(columns=target_name)

# Columns selected as the numerical subset used by the models below.
numerical_features = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]
data_numerical = data.loc[:, numerical_features]
data_numerical.shape


(1460, 24)

In [2]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Scale the features, then fit a ridge regressor with alpha=0.0, keeping the
# fitted pipeline of each of the 10 folds.
pipe_alpha0 = make_pipeline(StandardScaler(), Ridge(alpha=0.0))
cv0 = cross_validate(
    pipe_alpha0,
    data_numerical,
    target,
    cv=10,
    return_estimator=True,
)

# Largest absolute coefficient over all features and all fold estimators;
# `fitted[-1]` is the final pipeline step (the Ridge regressor).
max_abs_coef_alpha0 = max(
    np.abs(fitted[-1].coef_).max() for fitted in cv0["estimator"]
)
max_abs_coef_alpha0


  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)


np.float64(7.514929864668917e+18)

In [7]:
# Same pipeline as for alpha=0.0, but with the ridge penalty set to alpha=1.0.
pipe_alpha1 = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
cv1 = cross_validate(
    pipe_alpha1,
    data_numerical,
    target,
    cv=10,
    return_estimator=True,
)

# Largest absolute coefficient over all features and all fold estimators;
# `fitted[-1]` is the final pipeline step (the Ridge regressor).
max_abs_coef_alpha1 = max(
    np.abs(fitted[-1].coef_).max() for fitted in cv1["estimator"]
)
max_abs_coef_alpha1


np.float64(22562.769198255974)

In [8]:
import pandas as pd
import numpy as np

# One row per fold estimator of `cv1`, one column per numerical feature.
coefs = np.vstack(
    [fitted[-1].coef_ for fitted in cv1["estimator"]]
)
coef_df = pd.DataFrame(coefs, columns=numerical_features)

# Average the absolute coefficient of each feature over the folds and list
# the ten largest values, biggest first.
mean_abs = (
    coef_df
    .abs()
    .mean()
    .sort_values(ascending=False)
)
mean_abs.head(10)


GarageCars      19417.910189
GrLivArea       17119.540942
2ndFlrSF        12201.988699
TotRmsAbvGrd    12152.055864
BedroomAbvGr    12117.822236
TotalBsmtSF     11947.838543
KitchenAbvGr    10791.538866
1stFlrSF         9701.912155
BsmtFinSF1       8574.144931
MasVnrArea       6862.664154
dtype: float64

In [9]:
# Numerical feature list with "GarageArea" removed, and the matching data.
numerical_features_no_garage_area = [
    name for name in numerical_features if name != "GarageArea"
]
data_numerical_no_garage_area = data.loc[:, numerical_features_no_garage_area]

# Re-run the alpha=1.0 ridge pipeline on the reduced feature set.
pipe_alpha1_no_ga = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
cv1_no_ga = cross_validate(
    pipe_alpha1_no_ga,
    data_numerical_no_garage_area,
    target,
    cv=10,
    return_estimator=True,
)

# One row per fold estimator, one column per remaining feature.
coefs_no_ga = np.vstack(
    [fitted[-1].coef_ for fitted in cv1_no_ga["estimator"]]
)
coef_df_no_ga = pd.DataFrame(
    coefs_no_ga, columns=numerical_features_no_garage_area
)

# Spread of the GarageCars coefficient across folds, with GarageArea present
# (coef_df) versus with GarageArea removed (coef_df_no_ga).
std_gc_before = coef_df["GarageCars"].std()
std_gc_after = coef_df_no_ga["GarageCars"].std()

(std_gc_before, std_gc_after)


(np.float64(2895.2876461017318), np.float64(1305.1393941886613))

In [10]:
from sklearn.linear_model import RidgeCV

# Candidate regularization strengths: 101 log-spaced values from 1e-3 to 1e3.
alphas = np.logspace(-3, 3, num=101)

# RidgeCV picks alpha from `alphas`; the pipeline is evaluated on the feature
# set without GarageArea and the fitted pipeline of each fold is kept.
pipe_ridgecv = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))
cv_ridgecv = cross_validate(
    pipe_ridgecv,
    data_numerical_no_garage_area,
    target,
    cv=10,
    return_estimator=True,
)

# One row per fold estimator, one column per feature.
coefs_tuned = np.vstack(
    [fitted[-1].coef_ for fitted in cv_ridgecv["estimator"]]
)
coef_tuned_df = pd.DataFrame(
    coefs_tuned, columns=numerical_features_no_garage_area
)

# Spread of the GarageCars coefficient across folds: fixed alpha=1.0
# (std_gc_after) versus the RidgeCV-selected alpha (std_gc_tuned).
std_gc_tuned = coef_tuned_df["GarageCars"].std()
(std_gc_after, std_gc_tuned)


(np.float64(1305.1393941886613), np.float64(587.5476658333462))

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV

# Every column of `data` whose dtype is "object" is treated as categorical.
categorical_features = data.select_dtypes(include="object").columns.to_list()

# Model using only the scaled numerical features.
preprocessor_simple = ColumnTransformer(
    [("num", StandardScaler(), numerical_features)]
)
model_simple = Pipeline(
    steps=[
        ("preprocessor", preprocessor_simple),
        ("regressor", RidgeCV(alphas=alphas)),
    ]
)

# Model using the scaled numerical features plus one-hot encoded categorical
# features; categories unseen at fit time are ignored at transform time.
preprocessor_all = ColumnTransformer(
    [
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)
model_all = Pipeline(
    steps=[
        ("preprocessor", preprocessor_all),
        ("regressor", RidgeCV(alphas=alphas)),
    ]
)

cv_simple = cross_validate(model_simple, data, target, cv=10)
cv_all = cross_validate(model_all, data, target, cv=10)

# Number of folds in which the numerical-only model scores strictly higher
# than the model that also uses the categorical features.
simple_is_better = cv_simple["test_score"] > cv_all["test_score"]
wins_simple = int(simple_is_better.sum())
wins_simple


0

In [12]:
from sklearn.preprocessing import SplineTransformer
from sklearn.kernel_approximation import Nystroem

# Non-linear variant: spline-expand the numerical features, one-hot encode the
# categorical ones, then apply a degree-2 polynomial Nystroem approximation
# (300 components, fixed random_state) before the RidgeCV regressor.
preprocessor_nl = ColumnTransformer(
    transformers=[
        ("num", SplineTransformer(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

model_nl = Pipeline([
    ("preprocessor", preprocessor_nl),
    ("nystroem", Nystroem(kernel="poly", degree=2, n_components=300, random_state=0)),
    ("regressor", RidgeCV(alphas=alphas)),
])

# `cv_all` already holds cross_validate(model_all, data, target, cv=10) from
# the cell that defined `model_all`. Same pipeline, same data and same integer
# `cv` give the same folds and test scores, so reuse it instead of fitting the
# ten folds a second time. The `cv_all_again` name is kept for any later cell
# that refers to it.
cv_all_again = cv_all
cv_nl = cross_validate(model_nl, data, target, cv=10)

# Number of folds in which the non-linear model scores strictly higher than
# the linear model using numerical and categorical features.
wins_nl = int(np.sum(cv_nl["test_score"] > cv_all_again["test_score"]))
wins_nl


8