In [12]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
# Baseline model: predict sale price from above-ground living area and the
# number of above-ground rooms, using hand-standardized predictors.
ames = pd.read_csv("AmesHousing.csv")
x = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]

# Fix the seed so the train/test split -- and therefore the coefficients
# displayed by this cell -- are the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Standardize using statistics computed on the training rows only.
# NOTE: X_test is left unscaled here; apply the same training mean/std to it
# before using lr_fitted to predict on the test rows.
X_train_s = (X_train - X_train.mean()) / X_train.std()

lr = LinearRegression()
lr_fitted = lr.fit(X_train_s, y_train)

# One coefficient per standardized predictor, in the column order of `x`.
lr_fitted.coef_

array([ 71269.00445054, -18772.64477639])

In [9]:
# Second model: add building type as a categorical predictor and expand each
# numeric predictor into a degree-5 polynomial, then score with 5-fold CV.
x = ames[["Gr Liv Area", "Bldg Type", "TotRms AbvGrd"]]
y = ames["SalePrice"]

# Stage 1: one-hot encode the categorical column and standardize the numeric
# ones. Pandas output is requested so that stage 2 can select columns by name.
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
  ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])],
  remainder = "drop"
).set_output(transform = "pandas")

# Stage 2: polynomial expansion of each standardized numeric column. The
# column names carry the "standardize__" prefix because they are looked up in
# the frame produced by stage 1. remainder="passthrough" keeps the remaining
# stage-1 columns (the one-hot dummies) in the design matrix.
ct_p = ColumnTransformer(
  [
    ("p1", PolynomialFeatures(degree = 5, interaction_only = False), ["standardize__Gr Liv Area"]),
    ("p2", PolynomialFeatures(degree = 5, interaction_only = False), ["standardize__TotRms AbvGrd"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

lr_pipeline = Pipeline(
  [("preprocessing", ct_dummies),
  ("preprocessing2", ct_p),
  ("linear_regression", LinearRegression())]
)

# The scorer returns *negated* RMSE (higher is better), so take the absolute
# value of the fold average to report a conventional positive RMSE.
scores = cross_val_score(lr_pipeline, x, y, cv=5, scoring='neg_root_mean_squared_error')
print(abs(scores.mean()))

55176.9659433821


In [25]:
# Set up a grid search over the polynomial degree of living area.
# Every column except the target is offered to the pipeline; the column
# transformer below selects the two it actually uses and drops the rest.
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

# Parameter path is <pipeline step>__<transformer name>__<parameter>.
# np.arange(1, 10) yields candidate degrees 1 through 9 (stop is exclusive).
degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

# The search is only configured here; it is not fitted in this cell.
gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')


In [24]:

# Joint search over the polynomial degree of living area ("poly1") and of the
# above-ground room count ("poly2"), scored by 5-fold cross-validated R^2.
y = ames["SalePrice"]
X = ames.drop(columns="SalePrice")

# Only the three listed columns reach the regression; everything else in X
# is dropped by the transformer.
ct_poly = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("poly1", PolynomialFeatures(), ["Gr Liv Area"]),
        ("poly2", PolynomialFeatures(), ["TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_poly = Pipeline(
    steps=[
        ("preprocessing", ct_poly),
        ("linear_regression", LinearRegression()),
    ]
)
lr_pipeline_poly.set_output(transform="pandas")

# Degrees 1..10 for each predictor -> 100 candidate combinations.
param_grid = {
    "preprocessing__poly1__degree": np.arange(1, 11),
    "preprocessing__poly2__degree": np.arange(1, 11),
}

gscv = GridSearchCV(
    estimator=lr_pipeline_poly,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
)

gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.02072825, 0.02073417, 0.02154837, 0.02228842, 0.02289085,
        0.02528815, 0.02127047, 0.02637348, 0.05013485, 0.05381017,
        0.0311038 , 0.02204351, 0.02159243, 0.02423892, 0.02590775,
        0.02100172, 0.0559299 , 0.07008324, 0.02994895, 0.03519158,
        0.03425827, 0.02066488, 0.02299848, 0.02193022, 0.02130713,
        0.03841333, 0.07409992, 0.06297436, 0.05585027, 0.06914983,
        0.02646575, 0.02085261, 0.02068486, 0.02169952, 0.02755165,
        0.02883964, 0.04643517, 0.03657956, 0.12971644, 0.13184485,
        0.05622959, 0.04353328, 0.11653013, 0.07855945, 0.07767196,
        0.06527672, 0.07521696, 0.0401371 , 0.03790879, 0.03151994,
        0.02855139, 0.02212129, 0.02831297, 0.03791842, 0.04620671,
        0.04204445, 0.05368781, 0.06197734, 0.06039085, 0.05711274,
        0.04053941, 0.03238673, 0.05088415, 0.06389198, 0.04057441,
        0.03522267, 0.04316759, 0.03986011, 0.04054341, 0.07379231,
        0.07477784, 0.07336373,

In [26]:
# Tabulate every candidate from the grid search next to its mean
# cross-validated test score, best-scoring candidates first.
# One row per parameter combination; columns are the tuned parameter names.
degrees = pd.DataFrame(gscv_fitted.cv_results_["params"])
results = degrees.assign(scores=gscv_fitted.cv_results_["mean_test_score"])
results.sort_values(by="scores", ascending=False)

Unnamed: 0,preprocessing__poly1__degree,preprocessing__poly2__degree,scores
20,3,1,0.557641
33,4,4,0.556932
21,3,2,0.556857
34,4,5,0.556414
22,3,3,0.554039
...,...,...,...
98,10,9,-16.188760
99,10,10,-16.188760
9,1,10,-184.221203
19,2,10,-189.473656
