From coursebook: Ch 13

In [61]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# 13.2.1.1 Chapter One

lr = LinearRegression()


ames = pd.read_csv("/Users/andriy/Desktop/GSB544_ML/Week_7/AmesHousing.csv")
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_s = (X_train - X_train.mean())/X_train.std()

lr_fitted = lr.fit(X_train_s, y_train)
lr_fitted.coef_

array([ 71669.19207639, -17709.73682042])

In [3]:
y_preds = lr_fitted.predict(X_test)

r2_score(y_test, y_preds)

-2068715.2838655699

In [4]:
y_preds[1:5]
# the estimates are made based on non standardized values

array([8.22077281e+07, 1.07291945e+08, 1.33720167e+08, 1.07847589e+08])

In [5]:
# 13.2.1.2 Chapter Two

new_house = pd.DataFrame(data = {"Gr Liv Area": [889], "TotRms AbvGrd": [6]})
new_house

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,889,6


In [6]:
new_house_s = (new_house - new_house.mean())/new_house.std()
new_house_s
# don't standardzie by it's own value, use same standardization as in training process

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,,


In [7]:
# 13.2.1.3 The Moral of the Story
# standardize test predictions same as in training process

X_test_s = (X_test - X_train.mean())/X_train.std()
y_preds = lr_fitted.predict(X_test_s)

r2_score(y_test, y_preds)

0.4831575095213865

In [8]:
# standardzie new house value with training process steps

new_house_s = (new_house - X_train.mean())/X_train.std()
lr_fitted.predict(new_house_s)

array([100545.94549199])

we used X_train.mean() and X_train.std() in each case: we “learned” our estimates for the mean and sd of the columns when we fit the model, and we use those for all future predictions

In [9]:
# 13.2.2 Pipeline Objects - name the steps to avoid errors

lr_pipeline = Pipeline(
  [("standardize", StandardScaler()),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

Pay careful attention to the use of [ and ( inside the Pipeline function. The function takes a list ([]) of steps; each step may be put into a tuple () with a name of your choice.

In [10]:
lr_pipeline_fitted = lr_pipeline.fit(X_train, y_train)

y_preds = lr_pipeline_fitted.predict(X_test)
r2_score(y_test, y_preds)

0.4831575095213865

In [11]:
lr_pipeline_fitted.predict(new_house)

array([100545.94549199])

In [12]:
# 13.2.3 Column Transformers

from sklearn.compose import ColumnTransformer

# define 'ct' process with ColumnTransformer() function first, ahead of the pipeline, to dummify only specific columns with OHE (otherwise if used inside the pipeline, transforms all the columns to dummies):

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

# then call 'ct' as a 'preprocessing' step in the pipeline, before lr:

lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

In [13]:
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

lr_fitted = lr_pipeline.fit(X_train, y_train)

In [14]:
lr_fitted

In [15]:
X.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1960,1960,Hip,CompShg,BrkFace,Plywood,Stone,112.0,TA,TA,CBlock,TA,Gd,Gd,BLQ,639.0,Unf,0.0,441.0,1080.0,GasA,Fa,Y,SBrkr,1656,0,0,1656,1.0,0.0,1,0,3,1,TA,7,Typ,2,Gd,Attchd,1960.0,Fin,2.0,528.0,TA,TA,P,210,62,0,0,0,0,,,,0,5,2010,WD,Normal
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,7,5,1968,1968,Hip,CompShg,BrkFace,BrkFace,,0.0,Gd,TA,CBlock,TA,TA,No,ALQ,1065.0,Unf,0.0,1045.0,2110.0,GasA,Ex,Y,SBrkr,2110,0,0,2110,1.0,0.0,2,1,3,1,Ex,8,Typ,2,TA,Attchd,1968.0,Fin,2.0,522.0,TA,TA,Y,0,0,0,0,0,0,,,,0,4,2010,WD,Normal
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal


In [16]:
y.head()

0    215000
1    105000
2    172000
3    244000
4    189900
Name: SalePrice, dtype: int64

In [18]:
# 13.2.3.1 Checking preprocessing

ct_fitted = ct.fit(X_train)


In [21]:
ct.transform(X_test)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        , -0.89787307, -0.27028754],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,  0.69979962,  0.37237666],
       [ 0.        ,  1.        ,  0.        , ...,  0.        , -1.15838531, -0.91295174],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.        , -0.88973206, -0.91295174],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,  0.52273271,  1.01504086],
       [ 1.        ,  0.        ,  0.        , ...,  0.        , -0.58851479, -0.27028754]])

In [22]:
# 13.2.4 Challenges of pipelines
# 13.2.4.1 Extracting information

# to extract model coef from a pipeline, refer to a specific step of the pipeline:
lr_pipeline_fitted.named_steps['linear_regression'].coef_

array([ 71652.87952535, -17705.70592545])

In [23]:
# 13.2.4.2 Pandas input, numpy output:
# Pipelines can take input and process inputs in pandas 'df' objects, but the outputs would be in numpy 'arrays'.

# use 'set_output() method on a pipeline, to transform output object to 'pandas'

lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


ct.fit_transform(X_train).head()

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
26,1.0,0.0,0.0,0.0,0.0,-1.243866,-1.555616
2655,1.0,0.0,0.0,0.0,0.0,-0.67196,-0.270288
2302,1.0,0.0,0.0,0.0,0.0,-1.378193,-1.555616
1871,1.0,0.0,0.0,0.0,0.0,-0.814428,-0.270288
75,0.0,0.0,0.0,0.0,1.0,-1.160421,-0.912952


Notice that in this transformed dataset, the column names now have prefixes for the named steps in the column transformer.

Notice also the structure of the names of the dummified variables:

[step name]__[variable name]_[category]

In [25]:
# 13.2.4.3 Interactions and Dummies

# to add interactions to the model specification inside a 'pipeline', use 'PolynomialFeatures() function, but keep only 'interaction_only=True', without squares or cubes (if not needed).

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ct_inter.fit_transform(X_train).head()

Unnamed: 0,interaction__1,interaction__Gr Liv Area,interaction__TotRms AbvGrd,interaction__Gr Liv Area TotRms AbvGrd
26,1.0,882.0,4.0,3528.0
2655,1.0,1163.0,6.0,6978.0
2302,1.0,816.0,4.0,3264.0
1871,1.0,1093.0,6.0,6558.0
75,1.0,923.0,5.0,4615.0


In [26]:
# use sequential 'ct' steps for preprocessing 'dummies' first

ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "passthrough"
).set_output(transform = "pandas")

# and then add ct for 'interaction' using new 'dummified and remainder' variables names

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__TotRms AbvGrd", "dummify__Bldg Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

X_train_dummified = ct_dummies.fit_transform(X_train)
X_train_dummified.head()

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,remainder__Order,remainder__PID,remainder__MS SubClass,remainder__MS Zoning,remainder__Lot Frontage,remainder__Lot Area,remainder__Street,remainder__Alley,remainder__Lot Shape,remainder__Land Contour,remainder__Utilities,remainder__Lot Config,remainder__Land Slope,remainder__Neighborhood,remainder__Condition 1,remainder__Condition 2,remainder__House Style,remainder__Overall Qual,remainder__Overall Cond,remainder__Year Built,remainder__Year Remod/Add,remainder__Roof Style,remainder__Roof Matl,remainder__Exterior 1st,remainder__Exterior 2nd,remainder__Mas Vnr Type,remainder__Mas Vnr Area,remainder__Exter Qual,remainder__Exter Cond,remainder__Foundation,remainder__Bsmt Qual,remainder__Bsmt Cond,remainder__Bsmt Exposure,remainder__BsmtFin Type 1,remainder__BsmtFin SF 1,remainder__BsmtFin Type 2,remainder__BsmtFin SF 2,remainder__Bsmt Unf SF,remainder__Total Bsmt SF,remainder__Heating,remainder__Heating QC,remainder__Central Air,remainder__Electrical,remainder__1st Flr SF,remainder__2nd Flr SF,remainder__Low Qual Fin SF,remainder__Gr Liv Area,remainder__Bsmt Full Bath,remainder__Bsmt Half Bath,remainder__Full Bath,remainder__Half Bath,remainder__Bedroom AbvGr,remainder__Kitchen AbvGr,remainder__Kitchen Qual,remainder__TotRms AbvGrd,remainder__Functional,remainder__Fireplaces,remainder__Fireplace Qu,remainder__Garage Type,remainder__Garage Yr Blt,remainder__Garage Finish,remainder__Garage Cars,remainder__Garage Area,remainder__Garage Qual,remainder__Garage Cond,remainder__Paved Drive,remainder__Wood Deck SF,remainder__Open Porch SF,remainder__Enclosed Porch,remainder__3Ssn Porch,remainder__Screen Porch,remainder__Pool Area,remainder__Pool QC,remainder__Fence,remainder__Misc Feature,remainder__Misc Val,remainder__Mo Sold,remainder__Yr Sold,remainder__Sale Type,remainder__Sale Condition
26,1.0,0.0,0.0,0.0,0.0,27,527404120,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Story,4,5,1970,1970,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,TA,TA,No,ALQ,804.0,Rec,78.0,0.0,882.0,GasA,TA,Y,SBrkr,882,0,0,882,1.0,0.0,1,0,2,1,TA,4,Typ,0,,Attchd,1970.0,Fin,2.0,525.0,TA,TA,Y,240,0,0,0,0,0,,MnPrv,,0,4,2010,WD,Normal
2655,1.0,0.0,0.0,0.0,0.0,2656,902207080,20,RM,49.0,5820,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Story,3,8,1955,2005,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,Gd,CBlock,TA,TA,No,ALQ,256.0,Unf,0.0,906.0,1162.0,GasA,Ex,Y,SBrkr,1163,0,0,1163,1.0,0.0,1,0,3,1,TA,6,Typ,0,,Attchd,1955.0,Unf,1.0,220.0,Fa,TA,Y,142,98,0,0,0,0,,,,0,7,2006,WD,Normal
2302,1.0,0.0,0.0,0.0,0.0,2303,923276180,20,RL,65.0,11625,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Story,5,7,1983,1983,Gable,CompShg,HdBoard,HdBoard,,0.0,TA,TA,CBlock,Gd,TA,No,ALQ,596.0,Unf,0.0,220.0,816.0,GasA,TA,Y,SBrkr,816,0,0,816,1.0,0.0,1,0,2,1,TA,4,Typ,0,,Attchd,1983.0,Fin,1.0,264.0,TA,TA,Y,330,0,0,0,0,0,,MnPrv,,0,5,2007,WD,Normal
1871,1.0,0.0,0.0,0.0,0.0,1872,534176140,20,RL,88.0,10738,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,Norm,Norm,1Story,6,7,1966,1966,Hip,CompShg,HdBoard,HdBoard,,0.0,TA,TA,CBlock,TA,TA,No,ALQ,792.0,Unf,0.0,301.0,1093.0,GasA,Gd,Y,SBrkr,1093,0,0,1093,1.0,0.0,2,0,3,1,TA,6,Typ,1,Fa,Attchd,1966.0,RFn,2.0,484.0,TA,TA,Y,224,0,0,0,0,0,,MnPrv,Shed,400,11,2007,WD,Normal
75,0.0,0.0,0.0,0.0,1.0,76,531451110,120,RL,50.0,8012,Pave,,Reg,Lvl,AllPub,Inside,Gtl,SawyerW,Norm,Norm,1Story,6,5,1980,1980,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,BLQ,543.0,BLQ,119.0,261.0,923.0,GasA,TA,Y,SBrkr,923,0,0,923,0.0,0.0,2,0,2,1,TA,5,Typ,1,TA,Attchd,1980.0,RFn,1.0,264.0,TA,TA,Y,80,0,0,0,0,0,,,,0,5,2010,WD,Normal


In [27]:
ct_inter.fit_transform(X_train_dummified).head()

Unnamed: 0,interaction__1,interaction__remainder__TotRms AbvGrd,interaction__dummify__Bldg Type_1Fam,interaction__remainder__TotRms AbvGrd dummify__Bldg Type_1Fam
26,1.0,4.0,1.0,4.0
2655,1.0,6.0,1.0,6.0
2302,1.0,4.0,1.0,4.0
1871,1.0,6.0,1.0,6.0
75,1.0,5.0,0.0,0.0


## 13.2.5 Your turn

Consider four possible models for predicting house prices:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

Steps:

0. load data
1. set Test/Train datasets
2. CT for dummies, standardzie, interaction, poly
3. pipeline
4. .fit the pipeline on (X_train, y_train)
5. .predict

In [39]:
# Specify the X any variables ahead of splitting the data

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

In [40]:
# Prior step: from sklearn.model_selection use 'train_test_split' to split the dataset to test and train

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [41]:
# Pipeline - Model 1: 'size and number of rooms'

ct_1 = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

# Step 2: create pipeline for the model
lr_pipe_1 = Pipeline([
    ("Column Transformer", ct_1),
    ("Linear Regression", LinearRegression())
    ])

# Step 4: train the model
lr_fitted_1 = lr_pipe_1.fit(X_train, y_train)

# Step 5: test the model
y_pred_1 = lr_fitted_1.predict(X_test)

# Step 6: estimate metrics for the model
RSQ_1 = r2_score(y_test, y_pred_1)
MSE_1 = mean_squared_error(y_test, y_pred_1)

In [43]:
# Pipeline - Model 2: ["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]

# Step 2: use 'ct' to dummify variables from Step 1
ct_2 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop="first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

# Step 3: create pipeline for the model
lr_pipe_2 = Pipeline([
    ("Column Transformer", ct_2),
    ("Linear Regression", LinearRegression())
  ])

# Step 4: train the model
lr_fitted_2 = lr_pipe_2.fit(X_train, y_train)

# Step 5: test the model
y_pred_2 = lr_fitted_2.predict(X_test)

# Step 6: estimate metrics for the model
RSQ_2 = r2_score(y_test, y_pred_2)
MSE_2 = mean_squared_error(y_test, y_pred_2)

In [46]:
# view dummies names for next model
ct_2.fit_transform(X_train).head()

Unnamed: 0,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
1113,0.0,0.0,0.0,0.0,0.141383,0.338448
861,0.0,0.0,0.0,0.0,-1.441841,-0.916524
2067,0.0,0.0,0.0,0.0,-0.398086,-0.289038
609,0.0,0.0,0.0,0.0,1.239867,0.338448
1411,0.0,1.0,0.0,0.0,-1.254199,-1.54401


In [47]:
# Pipeline - Model 3: 'size', dummified 'building type' and their interactions

# Step 2: use 'ct' to dummify variables from Step 1
ct_3_dum = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop="first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


ct_3_inter = ColumnTransformer(
  [
    ("interaction1", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_TwnhsE"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


# Step 3: create pipeline for the model
lr_pipe_3 = Pipeline([
    ("CT_dum", ct_3_dum),
    ("CT_inter", ct_3_inter),
    ("Linear Regression", LinearRegression())
  ])

# Step 4: train the model
lr_fitted_3 = lr_pipe_3.fit(X_train, y_train)

# Step 5: test the model
y_pred_3 = lr_fitted_3.predict(X_test)

# Step 6: estimate metrics for the model
RSQ_3 = r2_score(y_test, y_pred_3)
MSE_3 = mean_squared_error(y_test, y_pred_3)

In [49]:

# Pipeline - Model 4: using a 5-degree polynomial on size "Gr Liv Area", a 5-degree polynomial on number of rooms "TotRms AbvGrd", and also building type.
# standardize__Gr Liv Area
# standardize__TotRms AbvGrd

# Step 2: use 'ct' to dummify variables from Step 1
ct_4_dum = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop="first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


ct_4_poly = ColumnTransformer(
  [
    ("degree5_LivArea", PolynomialFeatures(degree = 5), ["standardize__Gr Liv Area"]),
    ("degree5_NRooms", PolynomialFeatures(degree = 5), ["standardize__TotRms AbvGrd"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


# Step 3: create pipeline for the model
lr_pipe_4 = Pipeline([
    ("CT_dum", ct_4_dum),
    ("CT_inter", ct_4_poly),
    ("Linear Regression", LinearRegression())
  ])

# Step 4: train the model
lr_fitted_4 = lr_pipe_4.fit(X_train, y_train)

# Step 5: test the model
y_pred_4 = lr_fitted_4.predict(X_test)

# Step 6: estimate metrics for the model
RSQ_4 = r2_score(y_test, y_pred_4)
MSE_4 = mean_squared_error(y_test, y_pred_4)

In [50]:
print(RSQ_1), print(MSE_1)
print(RSQ_2), print(MSE_2)
print(RSQ_3), print(MSE_3)
print(RSQ_4), print(MSE_4)

0.5052968412023751
3333931304.9644423
0.5362781039657544
3125140639.3758006
0.5415228154554863
3089795185.217751
0.4839520045551803
3477779626.5931025
0.5052968412023751
3333931304.9644423
0.5362781039657544
3125140639.3758006
0.5415228154554863
3089795185.217751
0.4839520045551803
3477779626.5931025


(None, None)

Answer 13.2.5:
Model 3 works best, whith highest R-squared and lowest MSE

In [62]:
# 13.3 Cross-Validation

from sklearn.model_selection import cross_val_score

In [70]:
# PA 13.3.1 cross_val_score

m1_score_r2 = (cross_val_score(lr_pipe_1, X, y, cv=5, scoring='r2')).mean()
m1_score_mse = (cross_val_score(lr_pipe_1, X, y, cv=5, scoring=('neg_mean_squared_error'))*-1).mean()

m2_score_r2 = (cross_val_score(lr_pipe_2, X, y, cv=5, scoring='r2')).mean()
m2_score_mse = (cross_val_score(lr_pipe_2, X, y, cv=5, scoring=('neg_mean_squared_error'))*-1).mean()

m3_score_r2 = (cross_val_score(lr_pipe_3, X, y, cv=5, scoring='r2')).mean()
m3_score_mse = (cross_val_score(lr_pipe_3, X, y, cv=5, scoring=('neg_mean_squared_error'))*-1).mean()

m4_score_r2 = (cross_val_score(lr_pipe_4, X, y, cv=5, scoring='r2')).mean()
m4_score_mse = (cross_val_score(lr_pipe_4, X, y, cv=5, scoring=('neg_mean_squared_error'))*-1).mean()

In [71]:
# print the resuls

print("Model 1:")
print(m1_score_r2)
print(m1_score_mse)

print("Model 2:")
print(m2_score_r2)
print(m2_score_mse)

print("Model 3:")
print(m3_score_r2)
print(m3_score_mse)

print("Model 4:")
print(m4_score_r2)
print(m4_score_mse)

Model 1:
0.504208752508862
3136138908.170903
Model 2:
0.5328824390692035
2951993958.10073
Model 3:
0.544480259714528
2873848290.133327
Model 4:
0.49270516755799665
3208990074.221526
Model 1:
0.504208752508862
3136138908.170903
Model 2:
0.5328824390692035
2951993958.10073
Model 3:
0.544480259714528
2873848290.133327
Model 4:
0.49270516755799665
3208990074.221526


Answer 13.3 Cross-Validation:
Model 3 still has the lowest MSE and the highest R-squared of all four models.