Ridge regression
---

Solution - Basic feature engineering for the house prices data
---

> **Exercise**: (A) load the house prices data from `c3_house-prices.csv` (B) plot the distribution of the continuous variables using histograms: you should see that many have a skewed one (C) create a `preprocess(df)` function which performs (C.1) one-hot encoding (C.2) fill missing values (C.3) apply a **log-transform** to every continuous feature and (C.4) add their **polynomial features** of degree 2, 3 and 0.5 (square root). Finally (D) create the X/y numpy arrays - use the `np.log10()` of the sale price as the target variable.

**Hint**: The logarithm of zero doesn't exist, so we have to make sure that there are no zero values in the continuous columns when applying the log-transform. To achieve this, we can use `np.log(x+1)` (or simply `np.log1p(x)` which is equivalent) which will leave zero values untransformed i.e. `log(1)=0`

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.datasets import fetch_openml
from scipy import stats
from scipy.stats import norm, skew

%load_ext lab_black
%matplotlib inline

# (A) Load the data
data_df = pd.read_csv("house-prices-test.csv")
data_df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
0,2217,909279080,50,RL,,11275,Pave,,IR1,HLS,...,0,0,,,,0,3,2007,WD,Normal
1,837,907126050,20,RL,65.0,9757,Pave,,Reg,Low,...,92,0,,,,0,10,2009,WD,Normal
2,2397,528144030,60,RL,86.0,11065,Pave,,IR1,Lvl,...,0,0,,,,0,10,2006,New,Partial
3,1963,535452060,20,RL,70.0,7000,Pave,,Reg,Lvl,...,0,0,,MnWw,,0,4,2007,WD,Family
4,306,911202100,50,C (all),66.0,8712,Pave,Pave,Reg,HLS,...,0,0,,,,0,1,2010,WD,Abnorml


In [2]:
# We fill the NaN values of the columns saved in unique. For filling this columns, there is an only value that may be incorporated (as it was previously stated)
data_df.drop(["Central Air"], axis=1, inplace=True)
data_df.drop(["Street"], axis=1, inplace=True)

In [3]:
# We calculate the null percentage:
null_percentage = data_df.isnull().sum() / data_df.shape[0] * 100
# Drop columns with more than a 40% of missing values
for x in range(0, len(null_percentage)):
    if null_percentage[x] >= 40:
        print(data_df.index[x])
        data_df.drop([null_percentage.index[x]], axis=1, inplace=True)

6
56
71
72
73


In [4]:
data_df.drop("Order", inplace=True, axis=1)

In [5]:
null_list = data_df.columns[data_df.isnull().any()].tolist()
data_df[null_list].info()
data_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Lot Frontage    430 non-null    float64
 1   Mas Vnr Type    497 non-null    object 
 2   Mas Vnr Area    497 non-null    float64
 3   Bsmt Qual       491 non-null    object 
 4   Bsmt Cond       491 non-null    object 
 5   Bsmt Exposure   491 non-null    object 
 6   BsmtFin Type 1  491 non-null    object 
 7   BsmtFin Type 2  491 non-null    object 
 8   Garage Type     479 non-null    object 
 9   Garage Yr Blt   479 non-null    float64
 10  Garage Finish   479 non-null    object 
 11  Garage Qual     479 non-null    object 
 12  Garage Cond     479 non-null    object 
dtypes: float64(3), object(10)
memory usage: 50.9+ KB


Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
0,909279080,50,RL,,11275,IR1,HLS,AllPub,Corner,Mod,...,0,19,0,0,0,0,3,2007,WD,Normal
1,907126050,20,RL,65.0,9757,Reg,Low,AllPub,Inside,Mod,...,0,0,0,92,0,0,10,2009,WD,Normal
2,528144030,60,RL,86.0,11065,IR1,Lvl,AllPub,Inside,Gtl,...,74,0,0,0,0,0,10,2006,New,Partial
3,535452060,20,RL,70.0,7000,Reg,Lvl,AllPub,Inside,Gtl,...,16,0,0,0,0,0,4,2007,WD,Family
4,911202100,50,C (all),66.0,8712,Reg,HLS,AllPub,Inside,Mod,...,0,98,0,0,0,0,1,2010,WD,Abnorml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,905378040,50,RL,124.0,18600,Reg,Lvl,AllPub,Inside,Gtl,...,0,60,0,0,0,450,6,2010,WD,Normal
496,533252020,20,RL,,11250,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,6,2006,WD,Normal
497,527127100,120,RL,28.0,7296,IR1,Lvl,AllPub,CulDSac,Gtl,...,57,0,0,0,0,0,11,2009,WD,Normal
498,906340120,20,RL,61.0,10226,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,1,2009,WD,Normal


In [6]:
# Replace numerical NaN values
# Mean replacements (near to normal distributions)
data_df["Lot Frontage"].fillna(int(data_df["Lot Frontage"].mean()), inplace=True)
data_df["Total Bsmt SF"].fillna(int(data_df["Total Bsmt SF"].mean()), inplace=True)
data_df["Garage Area"].fillna(int(data_df["Garage Area"].mean()), inplace=True)
# Median replacements (skewed distributions)
data_df["Garage Yr Blt"].fillna(int(data_df["Garage Yr Blt"].median()), inplace=True)
data_df["Mas Vnr Area"].fillna(int(data_df["Mas Vnr Area"].median()), inplace=True)
data_df["Bsmt Unf SF"].fillna(int(data_df["Bsmt Unf SF"].median()), inplace=True)
data_df["BsmtFin SF 1"].fillna(int(data_df["BsmtFin SF 1"].median()), inplace=True)
data_df["Bsmt Half Bath"].fillna(int(data_df["Bsmt Half Bath"].median()), inplace=True)
data_df["Bsmt Full Bath"].fillna(int(data_df["Bsmt Full Bath"].median()), inplace=True)
data_df["Garage Cars"].fillna(int(data_df["Garage Cars"].median()), inplace=True)
# Fill categorical features with a new missing category
data_df["Bsmt Qual"].fillna(("Missing"), inplace=True)
data_df["BsmtFin Type 1"].fillna(("Missing"), inplace=True)
data_df["Garage Finish"].fillna(("Missing"), inplace=True)
data_df = data_df.fillna(data_df.mode().iloc[0])

# Check the new list of missing values
null_list = data_df.columns[data_df.isnull().any()].tolist()
data_df[null_list].info()
# Number of rows
print(data_df.shape[0])
# Number of columns
print(data_df.shape[1])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Empty DataFrame500
73


In [7]:
# We will manually filter 8 variables

filter_outlier = data_df["Mas Vnr Area"] > 1000
df_keep = data_df.loc[filter_outlier, "Mas Vnr Area"]
data_df = data_df.drop(df_keep.index, axis=0)

filter_outlier = data_df["Lot Frontage"] > 200
df_keep = data_df.loc[filter_outlier, "Lot Frontage"]
data_df = data_df.drop(df_keep.index, axis=0)

filter_outlier = data_df["Low Qual Fin SF"] > 100
df_keep = data_df.loc[filter_outlier, "Low Qual Fin SF"]
data_df = data_df.drop(df_keep.index, axis=0)

filter_outlier = data_df["Pool Area"] > 500
df_keep = data_df.loc[filter_outlier, "Pool Area"]
data_df = data_df.drop(df_keep.index, axis=0)

filter_outlier = data_df["Garage Yr Blt"] < 1000
df_keep = data_df.loc[filter_outlier, "Garage Yr Blt"]
data_df = data_df.drop(df_keep.index, axis=0)

filter_outlier = data_df["Garage Yr Blt"] > 2022
df_keep = data_df.loc[filter_outlier, "Garage Yr Blt"]
data_df = data_df.drop(df_keep.index, axis=0)

filter_outlier = data_df["Misc Val"] > 1000
df_keep = data_df.loc[filter_outlier, "Misc Val"]
data_df = data_df.drop(df_keep.index, axis=0)


filter_outlier = data_df["BsmtFin SF 1"] > 2500
df_keep = data_df.loc[filter_outlier, "Misc Val"]
data_df = data_df.drop(df_keep.index, axis=0)

print(
    "After eliminating the outliers through manual filtering the new shape is "
    + str(data_df.shape)
)

After eliminating the outliers through manual filtering the new shape is (485, 73)


In [8]:
nominal = []
ordinal = []
# check line by line in txt
with open("documentation.txt", "rt") as f:
    data = f.readlines()
# Generate two lists for Nominal and Ordinal variables
for line in data:
    if "(Nominal)" in line:
        nominal.append(line)

    if "(Ordinal)" in line:
        ordinal.append(line)

# Auxiliar variables for searching in the list of names
sub_str_nom = "(Nominal):"
sub_str_ord = "(Ordinal)"

# Remove after substring in String
nominal_var = []
ordinal_var = []
# slicing nominal variables names
for test_str in nominal:
    res = test_str[: test_str.index(sub_str_nom)][:-1]
    nominal_var.append(res)
# slicing ordinal variables names
for test_str in ordinal:
    res1 = test_str[: test_str.index(sub_str_ord)][:-1]
    ordinal_var.append(res1)

In [9]:
# Remove variables that have been already dropped in the original df
nominal_var.remove("Alley")
nominal_var.remove("Misc Feature")
nominal_var.remove("Street")
nominal_var.remove("Central Air")

# Replace inexact names in .txt
nominal_var.remove("Exterior 1")
nominal_var.append("Exterior 1st")
nominal_var.remove("Exterior 2")
nominal_var.append("Exterior 2nd")

# Replace inexact names in .txt
ordinal_var.remove("BsmtFinType 2")
ordinal_var.append("BsmtFin Type 2")
ordinal_var.remove("HeatingQC")
ordinal_var.append("Heating QC")
ordinal_var.remove("KitchenQual")
ordinal_var.append("Kitchen Qual")
ordinal_var.remove("FireplaceQu")
# Remove variables that have been already dropped in the original df
ordinal_var.remove("Fence")
ordinal_var.remove("Pool QC")

In [10]:
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["Reg", "IR1", "IR2", "IR3"], value=[4, 3, 2, 1]
)
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["AllPub", "NoSewr", "NoSeWa", "ELO"], value=[4, 3, 2, 1]
)
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["Gtl", "Mod", "Sev"], value=[3, 2, 1]
)

data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["Ex", "Gd", "TA", "Fa", "Po", "NA"], value=[5, 4, 3, 2, 1, 0]
)
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["Av", "Mn", "No"], value=[3, 2, 1]
)
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf"], value=[6, 5, 4, 3, 2, 1]
)
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["SBrkr", "FuseA", "FuseF", "FuseP", "Mix"], value=[5, 4, 3, 2, 1]
)
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"],
    value=[8, 7, 6, 5, 4, 3, 2, 1],
)

data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["Fin", "RFn"], value=[3, 2]
)
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["Y", "P", "N"], value=[3, 2, 1]
)
data_df[ordinal_var] = data_df[ordinal_var].replace(
    to_replace=["GdPrv", "MnPrv", "GdWo", "MnWw"], value=[4, 3, 2, 1]
)
# Missing values are also replaced with 0
data_df[ordinal_var] = data_df[ordinal_var].replace(to_replace="Missing", value=0)
data_df[ordinal_var]
# df tiene 74 a estas alturas
data_df

Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
0,909279080,50,RL,69.0,11275,3,HLS,4,Corner,2,...,0,19,0,0,0,0,3,2007,WD,Normal
1,907126050,20,RL,65.0,9757,4,Low,4,Inside,2,...,0,0,0,92,0,0,10,2009,WD,Normal
2,528144030,60,RL,86.0,11065,3,Lvl,4,Inside,3,...,74,0,0,0,0,0,10,2006,New,Partial
3,535452060,20,RL,70.0,7000,4,Lvl,4,Inside,3,...,16,0,0,0,0,0,4,2007,WD,Family
4,911202100,50,C (all),66.0,8712,4,HLS,4,Inside,2,...,0,98,0,0,0,0,1,2010,WD,Abnorml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,905378040,50,RL,124.0,18600,4,Lvl,4,Inside,3,...,0,60,0,0,0,450,6,2010,WD,Normal
496,533252020,20,RL,69.0,11250,3,Lvl,4,Inside,3,...,42,0,0,0,0,0,6,2006,WD,Normal
497,527127100,120,RL,28.0,7296,3,Lvl,4,CulDSac,3,...,57,0,0,0,0,0,11,2009,WD,Normal
498,906340120,20,RL,61.0,10226,3,Lvl,4,Inside,3,...,42,0,0,0,0,0,1,2009,WD,Normal


In [11]:
# We finally change MS SubClass (the numeric variable) to a non numerical version for the hot-encoding.
data_df["MS SubClass"] = data_df["MS SubClass"].map(str)
pd.value_counts(data_df[nominal_var].dtypes)

object    19
int64      1
dtype: int64

In [12]:
quantitative = [v for v in data_df.columns if data_df.dtypes[v] != "object"]
qualitative = [v for v in data_df.columns if data_df.dtypes[v] == "object"]

In [13]:
data_df["Has Mas Vnr"] = data_df["Mas Vnr Area"].apply(lambda x: 1 if x > 0 else 0)
data_df["Has Basement"] = data_df["Total Bsmt SF"].apply(lambda x: 1 if x > 0 else 0)
data_df["Has 2ndFloor"] = data_df["2nd Flr SF"].apply(lambda x: 1 if x > 0 else 0)
data_df["Has Garage"] = data_df["Garage Area"].apply(lambda x: 1 if x > 0 else 0)
data_df["Has WoodDeck"] = data_df["Wood Deck SF"].apply(lambda x: 1 if x > 0 else 0)
data_df["Has Porch"] = data_df["Open Porch SF"].apply(lambda x: 1 if x > 0 else 0)

In [14]:
# data_df.drop("Total Bsmt SF", inplace=True, axis=1)
# data_df.drop("Exter Qual", inplace=True, axis=1)
# data_df.drop("Garage Yr Blt", inplace=True, axis=1)
# data_df.drop("Garage Area", inplace=True, axis=1)
data_df["New"] = np.where(data_df["Yr Sold"] == data_df["Year Built"], 1, 0)

In [15]:
data_df["Bath"] = (
    data_df["Full Bath"]
    + (0.5 * data_df["Half Bath"])
    + data_df["Bsmt Full Bath"]
    + (0.5 * data_df["Bsmt Half Bath"])
)

data_df.drop("Full Bath", inplace=True, axis=1)
data_df.drop("Half Bath", inplace=True, axis=1)
data_df.drop("Bsmt Full Bath", inplace=True, axis=1)
data_df.drop("Bsmt Half Bath", inplace=True, axis=1)
data_df

Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Sale Type,Sale Condition,Has Mas Vnr,Has Basement,Has 2ndFloor,Has Garage,Has WoodDeck,Has Porch,New,Bath
0,909279080,50,RL,69.0,11275,3,HLS,4,Corner,2,...,WD,Normal,1,1,1,1,0,0,0,1.5
1,907126050,20,RL,65.0,9757,4,Low,4,Inside,2,...,WD,Normal,0,1,0,1,1,0,0,2.0
2,528144030,60,RL,86.0,11065,3,Lvl,4,Inside,3,...,New,Partial,1,1,1,1,1,1,1,2.5
3,535452060,20,RL,70.0,7000,4,Lvl,4,Inside,3,...,WD,Family,1,1,0,1,0,1,0,1.0
4,911202100,50,C (all),66.0,8712,4,HLS,4,Inside,2,...,WD,Abnorml,0,1,1,1,1,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,905378040,50,RL,124.0,18600,4,Lvl,4,Inside,3,...,WD,Normal,0,1,1,1,0,0,0,2.5
496,533252020,20,RL,69.0,11250,3,Lvl,4,Inside,3,...,WD,Normal,0,1,0,1,1,1,0,2.5
497,527127100,120,RL,28.0,7296,3,Lvl,4,CulDSac,3,...,WD,Normal,0,1,0,1,1,1,0,3.0
498,906340120,20,RL,61.0,10226,3,Lvl,4,Inside,3,...,WD,Normal,1,1,0,1,1,1,0,3.0


In [16]:
# Continuous features from the documentation
continuous = [
    "Total Bsmt SF",
    "Lot Frontage",
    "Lot Area",
    "Mas Vnr Area",
    "BsmtFin SF 1",
    "BsmtFin SF 2",
    "Bsmt Unf SF",
    "1st Flr SF",
    "2nd Flr SF",
    "Low Qual Fin SF",
    "Gr Liv Area",
    "Wood Deck SF",
    "Open Porch SF",
    "Enclosed Porch",
    "3Ssn Porch",
    "Garage Area",
    "Screen Porch",
    "Pool Area",
    "Misc Val",
]

In [17]:
import numpy as np

# (C) Preprocessing function
def preprocess(df):
    # Work on a copy
    df = df.copy()

    # (C.1) One-hot encoding
    df = pd.get_dummies(df, dummy_na=True)

    # (C.3) Apply log-transform
    df[continuous] = np.log1p(df[continuous])

    # (C.4) Add polynomial features
    for c in continuous:
        for d in [0.5, 2, 3]:
            name = "{}**{}".format(c, d)
            df[name] = df[c] ** d

    return df


preprocessed_df = preprocess(data_df)
preprocessed_df.head()

Unnamed: 0,PID,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Year Built,Year Remod/Add,...,Garage Area**3,Screen Porch**0.5,Screen Porch**2,Screen Porch**3,Pool Area**0.5,Pool Area**2,Pool Area**3,Misc Val**0.5,Misc Val**2,Misc Val**3
0,909279080,4.248495,9.330432,3,4,2,6,7,1932,1950,...,223.730097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,907126050,4.189655,9.185843,4,4,2,5,7,1994,1994,...,225.760274,2.12899,20.544458,93.119801,0.0,0.0,0.0,0.0,0.0,0.0
2,528144030,4.465908,9.311633,3,4,3,8,5,2006,2006,...,290.827058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,535452060,4.26268,8.853808,4,4,3,5,7,1960,2002,...,170.147831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,911202100,4.204693,9.072571,4,4,2,4,7,1900,1950,...,210.989529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# (D) Create X, y
X = preprocessed_df.drop("SalePrice", axis=1).values
y = np.log10(preprocessed_df.SalePrice).values

KeyError: "['SalePrice'] not found in axis"

Solution - Tune Ridge regression
---

> **Exercise**: Fit a ridge regression model and tune its alpha value using grid search. Use the train/test set methodology with a 50/50 split. Print the optimal alpha value and the test MSE/MAE scores.

**Hint**: Don't forget to standardize the input data.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split into train/test sets
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.4, random_state=0)

# Standardize features
scaler = StandardScaler()
X_tr_rescaled = scaler.fit_transform(X_tr)
X_te_rescaled = scaler.transform(X_te)

In [None]:
# Mean absolute error (MAE)
def MAE(y, y_pred):
    return np.mean(np.abs(y - y_pred))


#  Huber loss
from sklearn.linear_model import HuberRegressor

# Create the estimator
huber = HuberRegressor(epsilon=1.05, max_iter=10000)
# Fit it to X,y
huber.fit(X_tr_rescaled, y_tr)
y_pred_huber_c = huber.predict(X_te_rescaled)
mae_huber_c = MAE(y_te, y_pred_huber_c)
print(mae_huber_c)

# (C) Plot best model
%matplotlib inline
import matplotlib.pyplot as plt

# Plot models
overall_te_c = X_te_rescaled[:, 5]  # 1: Overall Quality column
plt.scatter(overall_te_c, y_te, s=10, label="test points")
plt.scatter(overall_te_c, y_pred_huber_c, s=10, label="predictions (huber)")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.linear_model import Ridge

# Variable to store the results
gs_results = []

# Grid search
for alpha in np.logspace(-1, 4, num=20):
    # Create and fit ridge regression
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_tr_rescaled, y_tr)

    # Save model and its performance on train/test sets
    gs_results.append(
        {
            "model": ridge,
            "alpha": alpha,
            "train_mse": MSE(y_tr, ridge.predict(X_tr_rescaled)),
            "train_mae": MAE(10 ** y_tr, 10 ** ridge.predict(X_tr_rescaled)),
            "test_mse": MSE(y_te, ridge.predict(X_te_rescaled)),
            "test_mae": MAE(10 ** y_te, 10 ** ridge.predict(X_te_rescaled)),
            "mae": MAE(y_tr, ridge.predict(X_tr_rescaled)),
        }
    )

# Convert results to DataFrame
gs_results = pd.DataFrame(gs_results)
print(gs_results.mae)
# Plot the validation curves
plt.plot(np.log10(gs_results["alpha"]), gs_results["train_mse"], label="train curve")
plt.plot(np.log10(gs_results["alpha"]), gs_results["test_mse"], label="test curve")

# Mark best alpha value
best_result = gs_results.loc[gs_results.test_mse.idxmin()]
plt.scatter(
    np.log10(best_result.alpha), best_result.test_mse, marker="x", c="red", zorder=10
)
plt.title(
    "Best alpha: {:.1e} - mse: {:.4f} mae: {:,.0f}$".format(
        best_result.alpha, best_result.test_mse, best_result.test_mae
    )
)

plt.xlabel("$log_{10}(alpha)$")
plt.ylabel("MSE")
plt.legend()
plt.show()

Alternative solution - average multiple runs
---

In [None]:
# Fit/test N models
gs_results = []
for run_idx in range(10):
    # Split into train/test sets
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.5, random_state=run_idx)

    # Standardize features
    X_tr_rescaled = scaler.fit_transform(X_tr)
    X_te_rescaled = scaler.transform(X_te)

    # Grid search
    for alpha in np.logspace(1, 4, num=20):
        # Create and fit ridge regression
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_tr_rescaled, y_tr)

        # Save model and its performance on train/test sets
        gs_results.append(
            {
                "model": ridge,
                "alpha": alpha,
                "run_idx": run_idx,
                "train_mse": MSE(y_tr, ridge.predict(X_tr_rescaled)),
                "train_mae": MAE(10 ** y_tr, 10 ** ridge.predict(X_tr_rescaled)),
                "test_mse": MSE(y_te, ridge.predict(X_te_rescaled)),
                "test_mae": MAE(10 ** y_te, 10 ** ridge.predict(X_te_rescaled)),
            }
        )

# Convert results to DataFrame
gs_results = pd.DataFrame(gs_results)

# Group results by alpha value
gb_alpha = gs_results.groupby("alpha")

# Compute train/test mean scores with std
mean_tr = gb_alpha.train_mse.mean()
mean_te = gb_alpha.test_mse.mean()
std_tr = gb_alpha.train_mse.std()
std_te = gb_alpha.test_mse.std()
alphas = mean_tr.index.values

# Plot mean scores
plt.plot(np.log10(alphas), mean_tr, label="train")
plt.plot(np.log10(alphas), mean_te, label="test")

# Quantify variance with ±std curves
plt.fill_between(np.log10(alphas), mean_tr - std_tr, mean_tr + std_tr, alpha=0.2)
plt.fill_between(np.log10(alphas), mean_te - std_te, mean_te + std_te, alpha=0.2)

# Add marker for best score
best_alpha = mean_te.idxmin()
plt.scatter(np.log10(best_alpha), mean_te.min(), marker="x", c="red", zorder=10)

# Print best MSE/MAE scores
best_result = gb_alpha.get_group(best_alpha)
plt.title(
    "Best alpha: {:.1e} - mse: {:.4f} mae: {:,.0f}$".format(
        best_alpha, best_result.test_mse.mean(), best_result.test_mae.mean()
    )
)

plt.xlabel("$log_{10}(alpha)$")
plt.ylabel("MSE")
plt.legend()
plt.show()