From coursebook: Ch 13

In [35]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# 13.2.1.1 Chapter One

lr = LinearRegression()


ames = pd.read_csv("/Users/andriy/Desktop/GSB544_ML/Week_7/AmesHousing.csv")
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_s = (X_train - X_train.mean())/X_train.std()

lr_fitted = lr.fit(X_train_s, y_train)
lr_fitted.coef_

array([ 71727.5698733 , -19400.88589971])

In [3]:
y_preds = lr_fitted.predict(X_test)

r2_score(y_test, y_preds)

-2041426.2152879094

In [4]:
y_preds[1:5]
# the estimates are made based on non standardized values

array([8.61165890e+07, 1.01792001e+08, 1.23864692e+08, 9.94055903e+07])

In [5]:
# 13.2.1.2 Chapter Two

new_house = pd.DataFrame(data = {"Gr Liv Area": [889], "TotRms AbvGrd": [6]})
new_house

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,889,6


In [6]:
new_house_s = (new_house - new_house.mean())/new_house.std()
new_house_s
# don't standardzie by it's own value, use same standardization as in training process

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,,


In [7]:
# 13.2.1.3 The Moral of the Story
# standardize test predictions same as in training process

X_test_s = (X_test - X_train.mean())/X_train.std()
y_preds = lr_fitted.predict(X_test_s)

r2_score(y_test, y_preds)

0.48925953696804303

In [8]:
# standardzie new house value with training process steps

new_house_s = (new_house - X_train.mean())/X_train.std()
lr_fitted.predict(new_house_s)

array([97249.34960767])

we used X_train.mean() and X_train.std() in each case: we “learned” our estimates for the mean and sd of the columns when we fit the model, and we use those for all future predictions

In [9]:
# 13.2.2 Pipeline Objects - name the steps to avoid errors

lr_pipeline = Pipeline(
  [("standardize", StandardScaler()),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

Pay careful attention to the use of [ and ( inside the Pipeline function. The function takes a list ([]) of steps; each step may be put into a tuple () with a name of your choice.

In [10]:
lr_pipeline_fitted = lr_pipeline.fit(X_train, y_train)

y_preds = lr_pipeline_fitted.predict(X_test)
r2_score(y_test, y_preds)

0.48925953696804314

In [11]:
lr_pipeline_fitted.predict(new_house)

array([97249.34960767])

In [12]:
# 13.2.3 Column Transformers

from sklearn.compose import ColumnTransformer

# define 'ct' process with ColumnTransformer() function first, ahead of the pipeline, to dummify only specific columns with OHE (otherwise if used inside the pipeline, transforms all the columns to dummies):

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

# then call 'ct' as a 'preprocessing' step in the pipeline, before lr:

lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

In [13]:
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

lr_fitted = lr_pipeline.fit(X_train, y_train)

In [14]:
lr_fitted

In [15]:
X.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1960,1960,Hip,CompShg,BrkFace,Plywood,Stone,112.0,TA,TA,CBlock,TA,Gd,Gd,BLQ,639.0,Unf,0.0,441.0,1080.0,GasA,Fa,Y,SBrkr,1656,0,0,1656,1.0,0.0,1,0,3,1,TA,7,Typ,2,Gd,Attchd,1960.0,Fin,2.0,528.0,TA,TA,P,210,62,0,0,0,0,,,,0,5,2010,WD,Normal
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,7,5,1968,1968,Hip,CompShg,BrkFace,BrkFace,,0.0,Gd,TA,CBlock,TA,TA,No,ALQ,1065.0,Unf,0.0,1045.0,2110.0,GasA,Ex,Y,SBrkr,2110,0,0,2110,1.0,0.0,2,1,3,1,Ex,8,Typ,2,TA,Attchd,1968.0,Fin,2.0,522.0,TA,TA,Y,0,0,0,0,0,0,,,,0,4,2010,WD,Normal
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal


In [16]:
y.head()

0    215000
1    105000
2    172000
3    244000
4    189900
Name: SalePrice, dtype: int64

In [17]:
# 13.2.3.1 Checking preprocessing

ct_fitted = ct.fit(X_train)


In [18]:
ct.transform(X_test)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        , -1.11201015, -1.5634701 ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,  0.13230212, -0.91949863],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,  0.9312185 ,  1.01241577],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        , -0.39298541, -0.27552716],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,  1.18886904,  2.30035871],
       [ 1.        ,  0.        ,  0.        , ...,  0.        , -0.07341885,  1.01241577]])

In [19]:
# 13.2.4 Challenges of pipelines
# 13.2.4.1 Extracting information

# to extract model coef from a pipeline, refer to a specific step of the pipeline:
lr_pipeline_fitted.named_steps['linear_regression'].coef_

array([ 71711.24403495, -19396.47008404])

In [20]:
# 13.2.4.2 Pandas input, numpy output:
# Pipelines can take input and process inputs in pandas 'df' objects, but the outputs would be in numpy 'arrays'.

# use 'set_output() method on a pipeline, to transform output object to 'pandas'

lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


ct.fit_transform(X_train).head()

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
95,0.0,0.0,0.0,0.0,1.0,-0.584725,-1.56347
76,1.0,0.0,0.0,0.0,0.0,0.703527,1.012416
857,1.0,0.0,0.0,0.0,0.0,-0.44891,-0.919499
922,1.0,0.0,0.0,0.0,0.0,-0.438923,-0.919499
1670,1.0,0.0,0.0,0.0,0.0,-1.155951,-1.56347


Notice that in this transformed dataset, the column names now have prefixes for the named steps in the column transformer.

Notice also the structure of the names of the dummified variables:

[step name]__[variable name]_[category]

In [21]:
# 13.2.4.3 Interactions and Dummies

# to add interactions to the model specification inside a 'pipeline', use 'PolynomialFeatures() function, but keep only 'interaction_only=True', without squares or cubes (if not needed).

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ct_inter.fit_transform(X_train).head()

Unnamed: 0,interaction__1,interaction__Gr Liv Area,interaction__TotRms AbvGrd,interaction__Gr Liv Area TotRms AbvGrd
95,1.0,1200.0,4.0,4800.0
76,1.0,1845.0,8.0,14760.0
857,1.0,1268.0,5.0,6340.0
922,1.0,1273.0,5.0,6365.0
1670,1.0,914.0,4.0,3656.0


In [22]:
# use sequential 'ct' steps for preprocessing 'dummies' first

ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "passthrough"
).set_output(transform = "pandas")

# and then add ct for 'interaction' using new 'dummified and remainder' variables names

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__TotRms AbvGrd", "dummify__Bldg Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

X_train_dummified = ct_dummies.fit_transform(X_train)
X_train_dummified.head()

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,remainder__Order,remainder__PID,remainder__MS SubClass,remainder__MS Zoning,remainder__Lot Frontage,remainder__Lot Area,remainder__Street,remainder__Alley,remainder__Lot Shape,remainder__Land Contour,remainder__Utilities,remainder__Lot Config,remainder__Land Slope,remainder__Neighborhood,remainder__Condition 1,remainder__Condition 2,remainder__House Style,remainder__Overall Qual,remainder__Overall Cond,remainder__Year Built,remainder__Year Remod/Add,remainder__Roof Style,remainder__Roof Matl,remainder__Exterior 1st,remainder__Exterior 2nd,remainder__Mas Vnr Type,remainder__Mas Vnr Area,remainder__Exter Qual,remainder__Exter Cond,remainder__Foundation,remainder__Bsmt Qual,remainder__Bsmt Cond,remainder__Bsmt Exposure,remainder__BsmtFin Type 1,remainder__BsmtFin SF 1,remainder__BsmtFin Type 2,remainder__BsmtFin SF 2,remainder__Bsmt Unf SF,remainder__Total Bsmt SF,remainder__Heating,remainder__Heating QC,remainder__Central Air,remainder__Electrical,remainder__1st Flr SF,remainder__2nd Flr SF,remainder__Low Qual Fin SF,remainder__Gr Liv Area,remainder__Bsmt Full Bath,remainder__Bsmt Half Bath,remainder__Full Bath,remainder__Half Bath,remainder__Bedroom AbvGr,remainder__Kitchen AbvGr,remainder__Kitchen Qual,remainder__TotRms AbvGrd,remainder__Functional,remainder__Fireplaces,remainder__Fireplace Qu,remainder__Garage Type,remainder__Garage Yr Blt,remainder__Garage Finish,remainder__Garage Cars,remainder__Garage Area,remainder__Garage Qual,remainder__Garage Cond,remainder__Paved Drive,remainder__Wood Deck SF,remainder__Open Porch SF,remainder__Enclosed Porch,remainder__3Ssn Porch,remainder__Screen Porch,remainder__Pool Area,remainder__Pool QC,remainder__Fence,remainder__Misc Feature,remainder__Misc Val,remainder__Mo Sold,remainder__Yr Sold,remainder__Sale Type,remainder__Sale Condition
95,0.0,0.0,0.0,0.0,1.0,96,533210060,160,FV,30.0,3215,Pave,Pave,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,2Story,7,5,2004,2004,Gable,CompShg,MetalSd,MetalSd,BrkFace,120.0,Gd,TA,PConc,Gd,TA,Av,GLQ,280.0,Unf,0.0,320.0,600.0,GasA,Ex,Y,SBrkr,600,600,0,1200,0.0,0.0,2,1,2,1,Gd,4,Typ,0,,Detchd,2004.0,RFn,2.0,480.0,TA,TA,Y,0,172,0,0,0,0,,,,0,4,2010,ConLD,Normal
76,1.0,0.0,0.0,0.0,0.0,77,531451280,60,RL,70.0,11218,Pave,,Reg,Lvl,AllPub,Inside,Gtl,SawyerW,Norm,Norm,2Story,6,5,1992,1992,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,1055.0,1055.0,GasA,Ex,Y,SBrkr,1055,790,0,1845,0.0,0.0,2,1,3,1,Gd,8,Typ,1,TA,Attchd,1992.0,RFn,2.0,462.0,TA,TA,Y,635,104,0,0,0,0,,GdPrv,Shed,400,5,2010,WD,Normal
857,1.0,0.0,0.0,0.0,0.0,858,907202130,20,RL,,9286,Pave,,IR1,Lvl,AllPub,CulDSac,Mod,CollgCr,Norm,Norm,1Story,5,7,1977,1989,Gable,CompShg,HdBoard,Plywood,,0.0,TA,TA,CBlock,Gd,Gd,Av,ALQ,196.0,Unf,0.0,1072.0,1268.0,GasA,TA,Y,SBrkr,1268,0,0,1268,0.0,0.0,1,1,3,1,Gd,5,Typ,0,,Detchd,1978.0,Unf,1.0,252.0,TA,TA,Y,173,0,0,0,0,0,,,,0,10,2009,WD,Normal
922,1.0,0.0,0.0,0.0,0.0,923,909277090,20,RL,80.0,14680,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Story,5,4,1960,1960,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,793.0,Unf,0.0,480.0,1273.0,GasA,Ex,Y,SBrkr,1273,0,0,1273,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1960.0,Unf,1.0,307.0,TA,TA,Y,483,0,0,0,115,0,,MnPrv,,0,6,2009,WD,Normal
1670,1.0,0.0,0.0,0.0,0.0,1671,527425025,20,RL,,17199,Pave,,Reg,Lvl,AllPub,FR2,Gtl,NAmes,Norm,Norm,1Story,4,7,1961,1961,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,Gd,BrkTil,TA,TA,No,ALQ,314.0,Unf,0.0,600.0,914.0,GasA,Ex,Y,SBrkr,914,0,0,914,0.0,0.0,1,0,2,1,TA,4,Typ,0,,Basment,1961.0,Unf,1.0,270.0,Fa,TA,Y,140,0,0,0,0,0,,GdWo,,0,7,2007,WD,Normal


In [23]:
ct_inter.fit_transform(X_train_dummified).head()

Unnamed: 0,interaction__1,interaction__remainder__TotRms AbvGrd,interaction__dummify__Bldg Type_1Fam,interaction__remainder__TotRms AbvGrd dummify__Bldg Type_1Fam
95,1.0,4.0,0.0,0.0
76,1.0,8.0,1.0,8.0
857,1.0,5.0,1.0,5.0
922,1.0,5.0,1.0,5.0
1670,1.0,4.0,1.0,4.0


## 13.2.5 Your turn

Consider four possible models for predicting house prices:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

Steps:

0. load data
1. set Test/Train datasets
2. CT for dummies, standardzie, interaction, poly
3. pipeline
4. .fit the pipeline on (X_train, y_train)
5. .predict

In [24]:
# Specify the X any variables ahead of splitting the data

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

In [25]:
# Prior step: from sklearn.model_selection use 'train_test_split' to split the dataset to test and train

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [26]:
# Pipeline - Model 1: 'size and number of rooms'

ct_1 = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

# Step 2: create pipeline for the model
lr_pipe_1 = Pipeline([
    ("Column Transformer", ct_1),
    ("Linear Regression", LinearRegression())
    ])

# Step 4: train the model
lr_fitted_1 = lr_pipe_1.fit(X_train, y_train)

# Step 5: test the model
y_pred_1 = lr_fitted_1.predict(X_test)

# Step 6: estimate metrics for the model
RSQ_1 = r2_score(y_test, y_pred_1)
MSE_1 = mean_squared_error(y_test, y_pred_1)

In [27]:
# Pipeline - Model 2: ["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]

# Step 2: use 'ct' to dummify variables from Step 1
ct_2 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop="first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

# Step 3: create pipeline for the model
lr_pipe_2 = Pipeline([
    ("Column Transformer", ct_2),
    ("Linear Regression", LinearRegression())
  ])

# Step 4: train the model
lr_fitted_2 = lr_pipe_2.fit(X_train, y_train)

# Step 5: test the model
y_pred_2 = lr_fitted_2.predict(X_test)

# Step 6: estimate metrics for the model
RSQ_2 = r2_score(y_test, y_pred_2)
MSE_2 = mean_squared_error(y_test, y_pred_2)

In [28]:
# view dummies names for next model
ct_2.fit_transform(X_train).head()

Unnamed: 0,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
1947,0.0,0.0,0.0,0.0,-1.219074,-1.553967
1993,1.0,0.0,0.0,0.0,1.621505,2.251419
2098,0.0,0.0,0.0,0.0,0.282236,0.348726
13,0.0,0.0,0.0,0.0,-0.322974,-0.919736
2319,0.0,0.0,0.0,0.0,0.575079,0.982957


In [29]:
# Pipeline - Model 3: 'size', dummified 'building type' and their interactions

# Step 2: use 'ct' to dummify variables from Step 1
ct_3_dum = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop="first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


ct_3_inter = ColumnTransformer(
  [
    ("interaction1", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_TwnhsE"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


# Step 3: create pipeline for the model
lr_pipe_3 = Pipeline([
    ("CT_dum", ct_3_dum),
    ("CT_inter", ct_3_inter),
    ("Linear Regression", LinearRegression())
  ])

# Step 4: train the model
lr_fitted_3 = lr_pipe_3.fit(X_train, y_train)

# Step 5: test the model
y_pred_3 = lr_fitted_3.predict(X_test)

# Step 6: estimate metrics for the model
RSQ_3 = r2_score(y_test, y_pred_3)
MSE_3 = mean_squared_error(y_test, y_pred_3)

In [30]:

# Pipeline - Model 4: using a 5-degree polynomial on size "Gr Liv Area", a 5-degree polynomial on number of rooms "TotRms AbvGrd", and also building type.
# standardize__Gr Liv Area
# standardize__TotRms AbvGrd

# Step 2: use 'ct' to dummify variables from Step 1
ct_4_dum = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop="first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


ct_4_poly = ColumnTransformer(
  [
    ("degree5_LivArea", PolynomialFeatures(degree = 5), ["standardize__Gr Liv Area"]),
    ("degree5_NRooms", PolynomialFeatures(degree = 5), ["standardize__TotRms AbvGrd"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


# Step 3: create pipeline for the model
lr_pipe_4 = Pipeline([
    ("CT_dum", ct_4_dum),
    ("CT_inter", ct_4_poly),
    ("Linear Regression", LinearRegression())
  ])

# Step 4: train the model
lr_fitted_4 = lr_pipe_4.fit(X_train, y_train)

# Step 5: test the model
y_pred_4 = lr_fitted_4.predict(X_test)

# Step 6: estimate metrics for the model
RSQ_4 = r2_score(y_test, y_pred_4)
MSE_4 = mean_squared_error(y_test, y_pred_4)

In [31]:
print(RSQ_1), print(MSE_1)
print(RSQ_2), print(MSE_2)
print(RSQ_3), print(MSE_3)
print(RSQ_4), print(MSE_4)

0.5514349030200313
2984166334.6128507
0.5669434873753496
2880992468.339632
0.5790796096295492
2800254560.491787
0.5622300397325329
2912349593.2482653
0.5514349030200313
2984166334.6128507
0.5669434873753496
2880992468.339632
0.5790796096295492
2800254560.491787
0.5622300397325329
2912349593.2482653


(None, None)

Answer 13.2.5:
Model 3 works best, whith highest R-squared and lowest MSE

In [32]:
# 13.3 Cross-Validation

from sklearn.model_selection import cross_val_score

In [33]:
# PA 13.3.1 cross_val_score

m1_score_r2 = (cross_val_score(lr_pipe_1, X, y, cv=5, scoring='r2')).mean()
m1_score_mse = (cross_val_score(lr_pipe_1, X, y, cv=5, scoring=('neg_mean_squared_error'))*-1).mean()

m2_score_r2 = (cross_val_score(lr_pipe_2, X, y, cv=5, scoring='r2')).mean()
m2_score_mse = (cross_val_score(lr_pipe_2, X, y, cv=5, scoring=('neg_mean_squared_error'))*-1).mean()

m3_score_r2 = (cross_val_score(lr_pipe_3, X, y, cv=5, scoring='r2')).mean()
m3_score_mse = (cross_val_score(lr_pipe_3, X, y, cv=5, scoring=('neg_mean_squared_error'))*-1).mean()

m4_score_r2 = (cross_val_score(lr_pipe_4, X, y, cv=5, scoring='r2')).mean()
m4_score_mse = (cross_val_score(lr_pipe_4, X, y, cv=5, scoring=('neg_mean_squared_error'))*-1).mean()

In [34]:
# print the resuls

print("Model 1:")
print(m1_score_r2)
print(m1_score_mse)

print("Model 2:")
print(m2_score_r2)
print(m2_score_mse)

print("Model 3:")
print(m3_score_r2)
print(m3_score_mse)

print("Model 4:")
print(m4_score_r2)
print(m4_score_mse)

Model 1:
0.504208752508862
3136138908.170903
Model 2:
0.5328824390692035
2951993958.10073
Model 3:
0.544480259714528
2873848290.133327
Model 4:
0.49270516755799665
3208990074.221526
Model 1:
0.504208752508862
3136138908.170903
Model 2:
0.5328824390692035
2951993958.10073
Model 3:
0.544480259714528
2873848290.133327
Model 4:
0.49270516755799665
3208990074.221526


Answer 13.3 Cross-Validation:
Model 3 still has the lowest MSE and the highest R-squared of all four models.

## 13.3.2 Tuning
from sklearn.model_selection import GridSearchCV

The name of the list of numbers in our dictionary object was preprocessing__polynomial__degree. This follows the pattern

[name of step in pipeline]__[name of step in column transformer]__[name of argument to function]

In [56]:
# from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [57]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.00639687, 0.00559564, 0.00559549, 0.0053515 , 0.00558548, 0.00580101, 0.00589299, 0.00652719, 0.00644774]),
 'std_fit_time': array([1.33169683e-03, 1.67617479e-04, 2.23595009e-04, 9.15720293e-05, 2.04445309e-04, 1.95599707e-04, 1.99804329e-04,
        4.68625263e-04, 2.91668992e-04]),
 'mean_score_time': array([0.00273161, 0.00247159, 0.00252275, 0.00236421, 0.00265636, 0.00248404, 0.00260258, 0.00288215, 0.00300798]),
 'std_score_time': array([4.14656929e-04, 1.13539684e-04, 2.38580239e-04, 2.60696953e-05, 2.74687905e-04, 8.48329891e-05, 2.47073071e-04,
        1.42204849e-04, 1.79688894e-04]),
 'param_preprocessing__polynomial__degree': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'preprocessing__polynomial__degree': np.int64(1)},
  {'preprocessing__polynomial__degree': np.int64(2)},
  {'preprocessing__polynomial__degree': np.int64(

In [58]:
# filtering results for the metric of interest for all of the models

gscv_fitted.cv_results_['mean_test_score']

array([ 0.52988868,  0.5314061 ,  0.55123637,  0.5420654 ,  0.45186012,  0.33383744,  0.02932172, -0.96809593, -4.54559303])

In [59]:
# creating a dataframe with the results

pd.DataFrame(data = {"degrees": np.arange(1, 10), "R2 mean scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,R2 mean scores
0,1,0.529889
1,2,0.531406
2,3,0.551236
3,4,0.542065
4,5,0.45186
5,6,0.333837
6,7,0.029322
7,8,-0.968096
8,9,-4.545593


In [42]:
# How many different model fitting steps occurred when gscv.fit(X, y) was run?

cv=5
poly=9
cv*poly

45

## 13.3.3 Your Turn
Consider one hundred modeling options for house price:

- House size, trying degrees 1 through 10
- Number of rooms, trying degrees 1 through 10
- Building Type

Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [60]:
# from sklearn.model_selection import GridSearchCV # assisted by ChatGPT

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial_area", PolynomialFeatures(), ["Gr Liv Area"]), 
    ("polynomial_rooms", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {
  'preprocessing__polynomial_area__degree': np.arange(1, 10),
  'preprocessing__polynomial_rooms__degree': np.arange(1, 10)
}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [61]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.00721402, 0.00668373, 0.00655928, 0.00664382, 0.00677524, 0.00683608, 0.00722866, 0.00724444, 0.00779185, 0.00706654,
        0.00754271, 0.00736976, 0.00694814, 0.00683789, 0.00714722, 0.00769448, 0.00797653, 0.00872087, 0.00732474, 0.00744863,
        0.00721235, 0.00697494, 0.00711794, 0.00712695, 0.00772481, 0.00795135, 0.00814986, 0.00728579, 0.00751691, 0.00770516,
        0.00785799, 0.00789599, 0.01009231, 0.00772138, 0.00782037, 0.00798578, 0.00697708, 0.00704741, 0.00767531, 0.00780039,
        0.00866823, 0.00783491, 0.007972  , 0.00784883, 0.00805154, 0.00706329, 0.00711613, 0.00730653, 0.00804305, 0.00798979,
        0.0081768 , 0.00812855, 0.00842319, 0.00849266, 0.00739923, 0.00788727, 0.00802131, 0.00809307, 0.00813146, 0.00785546,
        0.00781975, 0.00811391, 0.00841436, 0.00755415, 0.00762839, 0.00749078, 0.00775781, 0.00797172, 0.00773973, 0.00810566,
        0.00794764, 0.00804958, 0.0074965 , 0.00748606, 0.00815864, 0.00783529, 0.00781

In [62]:
gscv_fitted.cv_results_['mean_test_score']

array([ 0.53288244,  0.53238285,  0.53592417,  0.54152875,  0.54106618,  0.53486226,  0.08006934, -1.09028712,  0.27015518,
        0.53747194,  0.53356735,  0.53413413,  0.5354176 ,  0.53026731,  0.53331356,  0.35249861, -0.17703818,  0.49127632,
        0.55764061,  0.55685722,  0.55403905,  0.55039238,  0.54654916,  0.54517075,  0.44839955,  0.42970266, -1.04472851,
        0.5495276 ,  0.55021754,  0.55044544,  0.55680714,  0.55643404,  0.55328571,  0.55409729,  0.27868162,  0.03893532,
        0.45186012,  0.45186012,  0.50523029,  0.49656245,  0.49269454,  0.52242916,  0.51785313,  0.43585646,  0.41957692,
        0.33383744,  0.33383744,  0.33383744,  0.33383744,  0.33383744,  0.48645229,  0.49189613,  0.31212193,  0.25841666,
        0.02932175,  0.02932175,  0.02932175,  0.02932175,  0.02932175,  0.02932175,  0.02932175,  0.02932172,  0.36288119,
       -0.96809568, -0.96809568, -0.96809568, -0.96809568, -0.96809568, -0.96809568, -0.96809568, -0.96809568, -0.96809568,
       -

In [66]:
# dataframe of the resuls #assisted by ChatGPT
cv_results = gscv_fitted.cv_results_

# Create a DataFrame using the mean_test_score and parameter columns
results_df = pd.DataFrame({
    'degree_polynomial_area': cv_results['param_preprocessing__polynomial_area__degree'],
    'degree_polynomial_rooms': cv_results['param_preprocessing__polynomial_rooms__degree'],
    'mean_test_score': cv_results['mean_test_score']
})

# Sort the DataFrame by mean_test_score in descending order to show the best results first
results_df = results_df.sort_values(by='mean_test_score', ascending=False)

results_df

Unnamed: 0,degree_polynomial_area,degree_polynomial_rooms,mean_test_score
18,3,1,0.557641
19,3,2,0.556857
30,4,4,0.556807
31,4,5,0.556434
33,4,7,0.554097
...,...,...,...
76,9,5,-4.545593
77,9,6,-4.545593
78,9,7,-4.545593
72,9,1,-4.545593


Answers:

1. The best model accorind to mean_test_score is Model 18: with degree_polynomial_area of '3' and degree_polynomial_rooms '1'. The mean_test_score is 0.557641.

2. The model becomes cumbersome and so does the output. Basically after the 18th model tested, there was no goin in the r-squared from the following degrees in model. To make this process more efficient, it may have sense to start with a smaller range of polynomial degree eg. 1-5 and then see if the results started decreasing at any point of increasing polynomial. Another option would we to include R-squared check step into the pipeline and let the model simulation 'break' once the next iteration of degrees doesn't yield higher R-squared eg. with logical function if().