In [24]:
#libraries
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

In [20]:
#read in data
#data
ames = pd.read_csv('data/AmesHousing.csv')

Consider four possible models for predicting house prices:

- Using only the size and number of rooms.
- Using size, number of rooms, and building type.
- Using size and building type, and their interaction.
- Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
- Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [None]:
#model1
#using only size and num rooms

#pick only LotArea and Bedroom
#X = ames.drop("SalePrice", axis = 1)
X = ames[["Lot Area", "Bedroom AbvGr"]]
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    #("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Lot Area", "Bedroom AbvGr"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")



**DID this for cross_validation only the first 4 models though**

In [47]:
#model1
#using only size and num rooms

#pick only LotArea and Bedroom
#X = ames.drop("SalePrice", axis = 1)
X = ames[["Lot Area", "Bedroom AbvGr"]]
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    #("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Lot Area", "Bedroom AbvGr"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")



scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores.mean()
#.062

0.06216223947149311

In [48]:
#model2
#using size, num rooms, building type

#pick only LotArea and Bedroom
#X = ames.drop("SalePrice", axis = 1)
X = ames[["Lot Area", "Bedroom AbvGr", "Bldg Type"]]
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Lot Area", "Bedroom AbvGr"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores.mean()
#.11

0.11087724924265377

In [49]:
#model3
#using size, building type, interaction

#pick only LotArea and Bedroom
#X = ames.drop("SalePrice", axis = 1)
X = ames[["Lot Area", "Bldg Type"]]
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Lot Area"])
  ],
  remainder = "drop"
)

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["standardize__Lot Area", "dummify__Bldg Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("interaction", ct_inter),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

#check to see what names the transformed variables are
#X_train_dummified = ct.fit_transform(X)
#X_train_dummified

scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores.mean()
#.052

0.0529075770615026

In [50]:
#model4
#Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

#pick only LotArea and Bedroom
#X = ames.drop("SalePrice", axis = 1)
X = ames[["Lot Area", "Bedroom AbvGr", "Bldg Type"]]
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Lot Area", "Bedroom AbvGr"])
  ],
  remainder = "drop"
)

ct_degree = ColumnTransformer(
  [
    ("polynomial_lot", PolynomialFeatures(degree = 5), ["standardize__Lot Area"]),
    ("polynomial_bedroom", PolynomialFeatures(degree = 5), ["standardize__Bedroom AbvGr"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("poly", ct_degree),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

#check to see what names the transformed variables are
#X_train_dummified = ct.fit_transform(X)
#X_train_dummified

scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores.mean()
#2.29

-2.2990440872741846

**TUNING**

**Practice Activity**

Consider one hundred modeling options for house price:

- House size, trying degrees 1 through 10
- Number of rooms, trying degrees 1 through 10
- Building Type

Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [77]:
#TUNING helps us with only degrees, k's, but not recipes or models
from sklearn.model_selection import GridSearchCV

X = ames[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]]
y = ames["SalePrice"]

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial_size", PolynomialFeatures(), ["Gr Liv Area"]),
    ("polynomial_bedroom", PolynomialFeatures(), ["TotRms AbvGrd"]),
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial_size__degree': np.arange(1, 11),
            'preprocessing__polynomial_bedroom__degree': np.arange(1, 11)
}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [78]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.00659461, 0.00404558, 0.00413618, 0.00402522, 0.00425181, 0.00426183,
        0.0042942 , 0.00483141, 0.00539985, 0.00511446, 0.00524678, 0.00528231,
        0.0063261 , 0.00476565, 0.00736871, 0.0052    , 0.00455866, 0.00510297,
        0.00520067, 0.00439715, 0.00339241, 0.00433397, 0.00480828, 0.00767369,
        0.00636439, 0.00461025, 0.00400667, 0.00381837, 0.00599322, 0.00656099,
        0.0041398 , 0.00637946, 0.00618396, 0.00562372, 0.00575476, 0.00477557,
        0.00556078, 0.00520439, 0.00686383, 0.00628133, 0.00677319, 0.00865827,
        0.00677218, 0.00876989, 0.00677199, 0.00449767, 0.00603328, 0.00535641,
        0.00792041, 0.00907955, 0.00610485, 0.00637727, 0.00700431, 0.00537744,
        0.00547442, 0.00649757, 0.00619912, 0.01041174, 0.00929785, 0.00632377,
        0.00802345, 0.01546564, 0.01097784, 0.00706892, 0.01011806, 0.01032653,
        0.0099277 , 0.00794578, 0.00625582, 0.00592675, 0.00547743, 0.00699229,
        0.00624247, 0.0

In [79]:
len(gscv_fitted.cv_results_['mean_test_score'])
gscv_fitted.cv_results_

{'mean_fit_time': array([0.00659461, 0.00404558, 0.00413618, 0.00402522, 0.00425181, 0.00426183,
        0.0042942 , 0.00483141, 0.00539985, 0.00511446, 0.00524678, 0.00528231,
        0.0063261 , 0.00476565, 0.00736871, 0.0052    , 0.00455866, 0.00510297,
        0.00520067, 0.00439715, 0.00339241, 0.00433397, 0.00480828, 0.00767369,
        0.00636439, 0.00461025, 0.00400667, 0.00381837, 0.00599322, 0.00656099,
        0.0041398 , 0.00637946, 0.00618396, 0.00562372, 0.00575476, 0.00477557,
        0.00556078, 0.00520439, 0.00686383, 0.00628133, 0.00677319, 0.00865827,
        0.00677218, 0.00876989, 0.00677199, 0.00449767, 0.00603328, 0.00535641,
        0.00792041, 0.00907955, 0.00610485, 0.00637727, 0.00700431, 0.00537744,
        0.00547442, 0.00649757, 0.00619912, 0.01041174, 0.00929785, 0.00632377,
        0.00802345, 0.01546564, 0.01097784, 0.00706892, 0.01011806, 0.01032653,
        0.0099277 , 0.00794578, 0.00625582, 0.00592675, 0.00547743, 0.00699229,
        0.00624247, 0.0

In [80]:
pd.DataFrame(data = {"degrees": np.arange(1, 101), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,0.532882
1,2,0.537472
2,3,0.557641
3,4,0.549425
4,5,0.451860
...,...,...
95,96,0.054012
96,97,0.400106
97,98,-0.968096
98,99,-4.545619


In [81]:
params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False)

Unnamed: 0,preprocessing__polynomial_bedroom__degree,preprocessing__polynomial_size__degree,scores
2,1,3,0.557641
33,4,4,0.556860
12,2,3,0.556857
43,5,4,0.556469
63,7,4,0.554093
...,...,...,...
19,2,10,-16.187891
9,1,10,-16.187891
90,10,1,-184.221209
91,10,2,-189.473641


**Conclusion**

Q1: The model with bedroom degree = 1 and size degree = 3 had the best R^2 score (which was .5576)

Q2: The main downside of trying all those values, is that it could take a while if there are a whole lot of combination. To extreme extent, it can be very expensive time and resources wise to run many models. And we can see that some of the really high degrees are very inefficient models so maybe there is a way to avoid that?