In [1]:
import pandas as pd

ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
target_name = "SalePrice"
data = ames_housing.drop(columns=target_name)
target = ames_housing[target_name]

In [2]:
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

data_numerical = data[numerical_features]

In [4]:
data_numerical.head(5)

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,65.0,8450,196.0,706,0,150,856,856,854,0,...,0,2,548,0,61,0,0,0,0,0
1,80.0,9600,0.0,978,0,284,1262,1262,0,0,...,1,2,460,298,0,0,0,0,0,0
2,68.0,11250,162.0,486,0,434,920,920,866,0,...,1,2,608,0,42,0,0,0,0,0
3,60.0,9550,0.0,216,0,540,756,961,756,0,...,1,3,642,0,35,272,0,0,0,0
4,84.0,14260,350.0,655,0,490,1145,1145,1053,0,...,1,3,836,192,84,0,0,0,0,0


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [6]:
tree_model = DecisionTreeRegressor(random_state=0)
lin_model = make_pipeline(StandardScaler(), LinearRegression())

In [7]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit

splits = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

lin_results = cross_validate(lin_model, data_numerical, target, cv=splits)
tree_results = cross_validate(tree_model, data_numerical, target, cv=splits)

In [8]:
[lin_results['test_score'] > tree_results['test_score']]

[array([ True,  True,  True, False,  True,  True,  True,  True,  True,
         True])]

In [9]:
tree_results['test_score']

array([0.73388159, 0.44310091, 0.57429057, 0.76289602, 0.54701596,
       0.58315862, 0.58139412, 0.59956264, 0.69807404, 0.41666291])

In [10]:
from sklearn.model_selection import GridSearchCV
import numpy as np

inner_cv = 10
parameters = {'max_depth': np.arange(1, 16)}

grid_tree = GridSearchCV(tree_model, param_grid=parameters, cv=inner_cv)

In [11]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit

splits = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
outer_cv = 10

grid_cv = cross_validate(grid_tree, X=data_numerical, y=target, 
                         cv=splits, return_estimator=True, n_jobs=2)

In [12]:
for estimator in grid_cv["estimator"]:
    print(estimator.best_estimator_)


DecisionTreeRegressor(max_depth=6, random_state=0)
DecisionTreeRegressor(max_depth=5, random_state=0)
DecisionTreeRegressor(max_depth=6, random_state=0)
DecisionTreeRegressor(max_depth=7, random_state=0)
DecisionTreeRegressor(max_depth=7, random_state=0)
DecisionTreeRegressor(max_depth=9, random_state=0)
DecisionTreeRegressor(max_depth=7, random_state=0)
DecisionTreeRegressor(max_depth=7, random_state=0)
DecisionTreeRegressor(max_depth=6, random_state=0)
DecisionTreeRegressor(max_depth=9, random_state=0)


In [13]:
[lin_results['test_score'] > grid_cv['test_score']]

[array([ True,  True,  True, False,  True,  True, False,  True,  True,
         True])]

In [14]:
lin_results['test_score']

array([0.78397103, 0.78315481, 0.7102129 , 0.69769855, 0.80289014,
       0.75229809, 0.62865532, 0.78115   , 0.73553133, 0.75677366])

In [15]:
grid_cv['test_score']

array([0.77756775, 0.67098003, 0.6721007 , 0.76127973, 0.66940623,
       0.55889929, 0.70725064, 0.63676675, 0.71702719, 0.55804201])

In [20]:
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [24]:
tree_model = make_pipeline(preprocessor, DecisionTreeRegressor(random_state=0, max_depth=7))
lin_model = make_pipeline(preprocessor, LinearRegression())

In [26]:
from sklearn.model_selection import cross_validate

outer_cv = 10

treegrid_cv = cross_validate(tree_model, X=data, y=target, 
                         cv=outer_cv, return_estimator=True, n_jobs=2)

In [27]:
treegrid_cv['test_score']

array([0.77825504, 0.77252103, 0.84010287, 0.74975798, 0.83940989,
       0.7600146 , 0.7900183 , 0.81278988, 0.49241843, 0.75366731])

In [28]:
[treegrid_cv['test_score'] > grid_cv['test_score']]

[array([ True,  True,  True, False,  True,  True,  True,  True, False,
         True])]