In [5]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

In [6]:
# Load the raw Ames housing data from the local CSV export.
ames = pd.read_csv("/Users/ethanschultz/Downloads/AmesHousing.csv")

# Keep only the columns that have fewer than 100 missing values.
good_cols = ames.isnull().sum(axis=0).lt(100)
ames = ames.loc[:, good_cols]

# Of what is left, discard every row that still contains a missing value.
ames = ames.dropna(axis=0, how="any")

In [7]:
# Target is the sale price; features are every other column except the
# Order and PID columns.
y = ames["SalePrice"]
X = ames.drop(columns=["SalePrice", "Order", "PID"])

# Preprocessing: one-hot encode the object-dtype columns, standardize the
# numeric columns, and pass any column matched by neither selector through as-is.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Baseline model: preprocessing followed by ordinary least squares.
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)

In [16]:
# Inspect what the preprocessing step produces.
# Use fit_transform rather than transform: `ct` has not been fit by any earlier
# cell, so a bare ct.transform(...) raises NotFittedError after Restart & Run All.
# NOTE: `ames` still contains SalePrice, Order and PID (they are only dropped
# from X), so they are standardized here too. This frame is for inspection only.
ames_transformed = ct.fit_transform(ames)
ames_transformed_pd = pd.DataFrame(ames_transformed)
ames_transformed_pd
# The 0's in the dummy columns here are ordinary one-hot zeros (the row is not in
# that category). handle_unknown="ignore" only matters when transform() meets a
# category that was absent during fit: that row then gets 0's in all of that
# feature's dummy columns instead of raising an error.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-1.725612,-0.990784,-0.876581,2.733119,-0.099163,-0.523168,-0.379275,-1.185621,0.043852,0.403708,-0.298871,-0.303455,0.001523,1.275602,-0.790867,-0.101362,0.302978,1.054795,-0.254295,-1.020424,-0.764668,0.174527,-0.18248,0.356458,2.144773,0.298606,0.246615,0.896016,0.207098,-0.356731,-0.103521,-0.288151,-0.061399,-0.087930,-0.444404,1.675421,0.408859
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-1.724432,-0.990525,-0.876581,0.189832,-0.823877,0.377843,-0.346343,-1.137311,-0.575949,0.028103,0.539233,-0.697098,-0.484153,-0.676675,-0.790867,-0.101362,-1.202580,-0.841685,-0.254295,-1.020424,-0.764668,-1.050202,-0.18248,-0.921265,-0.936666,-1.023793,1.189503,0.346175,-0.713033,-0.356731,-0.103521,1.842908,-0.061399,-0.087930,-0.076545,1.675421,-0.970882
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-1.723252,-0.990520,-0.876581,0.523711,-0.099163,0.377843,-0.445140,-1.282241,0.021716,1.027520,-0.298871,-0.384025,0.612298,0.435609,-0.790867,-0.101362,-0.344808,-0.841685,-0.254295,-1.020424,1.223468,0.174527,-0.18248,-0.282404,-0.936666,-1.023793,-0.761621,2.333460,-0.178763,-0.356731,-0.103521,-0.288151,-0.061399,21.738194,-0.076545,1.675421,-0.130494
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-1.722072,-0.990509,-0.876581,0.131514,0.625551,-0.523168,-0.115817,-0.799142,-0.575949,1.339426,-0.298871,1.086954,2.528022,2.441831,-0.790867,-0.101362,1.202351,1.054795,-0.254295,0.794592,1.223468,0.174527,-0.18248,0.995320,2.144773,0.298606,0.218609,-0.753509,-0.713033,-0.356731,-0.103521,-0.288151,-0.061399,-0.087930,-0.812263,1.675421,0.772609
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-1.720892,-0.986524,0.057181,0.468549,-0.823877,-0.523168,0.839218,0.650154,-0.575949,0.737580,-0.298871,-1.003264,-0.371319,-0.594474,0.845154,-0.101362,0.249491,-0.841685,-0.254295,0.794592,1.223468,0.174527,-0.18248,-0.282404,0.604054,0.298606,0.031898,0.911726,-0.208445,-0.356731,-0.103521,-0.288151,-0.061399,-0.087930,-1.180122,1.675421,0.094028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2816,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.725742,1.113110,0.524062,-0.275326,-0.099163,0.377843,0.411099,-0.026184,-0.575949,0.799082,-0.298871,-0.895070,-0.187351,-0.401815,-0.790867,-0.101362,-0.990613,1.054795,-0.254295,-1.020424,-0.764668,0.174527,-0.18248,-0.282404,-0.936666,0.298606,0.526681,0.189077,-0.713033,-0.356731,-0.103521,-0.288151,-0.061399,-0.087930,-1.180122,-1.358188,-0.500515
2817,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.726922,1.113115,-0.876581,-0.155660,-0.823877,-0.523168,0.378166,-0.074494,-0.575949,-0.338716,1.586862,-0.768460,-0.528305,-0.661263,-0.790867,-0.101362,-1.190694,1.054795,-0.254295,-1.020424,-0.764668,-1.050202,-0.18248,-0.921265,-0.936666,0.298606,0.041234,0.534692,-0.713033,-0.356731,-0.103521,-0.288151,-0.061399,-0.087930,-0.076545,-1.358188,-0.644761
2818,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.728102,1.113772,0.640782,0.040754,-0.823877,-0.523168,0.674557,0.360295,-0.575949,-0.259641,-0.298871,0.005013,-0.410566,-0.486585,-0.790867,-0.101362,-1.055986,-0.841685,3.753333,-1.020424,-0.764668,0.174527,-0.18248,-0.282404,-0.936666,-2.346193,-2.217962,-0.125118,-0.238126,-0.356731,-0.103521,-0.288151,-0.061399,1.134333,0.291314,-1.358188,-0.632218
2819,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.729282,1.117482,-0.876581,-0.013651,-0.823877,-0.523168,0.081776,-0.460973,-0.575949,1.352605,0.417009,-0.869748,0.759473,0.589736,-0.790867,-0.101362,-0.225948,1.054795,-0.254295,-1.020424,-0.764668,-1.050202,-0.18248,-0.282404,0.604054,0.298606,-0.266838,1.131663,-0.149082,-0.356731,-0.103521,-0.288151,-0.061399,-0.087930,-0.812263,-1.358188,-0.155580


1. With handle_unknown='ignore'
By default (handle_unknown='error'), OneHotEncoder raises an error when transform encounters a category that it did not see during fit. Setting handle_unknown='ignore' changes this behavior:
The encoder ignores unknown categories in the new data instead of raising an error.
For rows with an unknown category, the encoder sets all of the encoded dummy variables for that feature to 0. This effectively creates an implicit "unknown" category for any value it has not seen before.

2. make_column_selector(dtype_include=np.number) automatically selects numeric columns.

3. remainder="passthrough": Any columns not explicitly transformed will be left as-is and passed through to the output.

4. make_column_selector(dtype_include=object) automatically selects the columns whose data type is object (typically used for categorical data in pandas) within the ColumnTransformer, so we do not have to list out all the variables we want to dummify.

In [8]:
# 5-fold cross-validated R^2 for the plain linear-regression pipeline.
# The recorded scores are hugely negative (roughly -1e19 to -1e22), i.e. the
# model generalizes extremely badly; the original note attributes this to overfitting.
# NOTE(review): values this extreme presumably come from an ill-conditioned,
# unregularized least-squares fit on the many dummy columns rather than ordinary
# overfitting -- confirm, e.g. by comparing against Ridge.
cross_val_score(estimator=lr_pipeline_1, X=X, y=y, scoring='r2', cv=5)

array([-1.00227561e+21, -2.13473460e+19, -4.65481157e+21, -4.24892786e+21,
       -4.16001805e+22])