In [1]:
import pandas as pd

ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
target_name = "SalePrice"
data = ames_housing.drop(columns=target_name)
target = ames_housing[target_name]

In [2]:
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

data_numerical = data[numerical_features]

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

linear_model = make_pipeline(StandardScaler(), LinearRegression())
tree_model = DecisionTreeRegressor()

linear_results = cross_validate(estimator=linear_model, X=data_numerical, y=target , cv=10)
tree_results = cross_validate(estimator=tree_model, X=data_numerical, y=target , cv=10)

In [4]:
print(f"linear accuracy is {linear_results['test_score'].mean():.2f} +/- {linear_results['test_score'].std():.2f}\n"
      f"tree accuracy is {tree_results['test_score'].mean():.2f} +/- {tree_results['test_score'].std():.2f}")

linear accuracy is 0.72 +/- 0.14
tree accuracy is 0.62 +/- 0.11


In [5]:
from sklearn.model_selection import GridSearchCV
import numpy as np
depths = np.linspace(start=1,stop=15,num=15)

param_grid = {
    'max_depth': depths}
model_grid_search = GridSearchCV(tree_model, param_grid=param_grid,
                                 n_jobs=2, cv=10)
model_grid_search.fit(data_numerical, target)

accuracy = model_grid_search.score(data_numerical, target)
print(
    f"The test accuracy score of the grid-searched pipeline is: "
    f"{accuracy:.2f}"
)
model_grid_search.best_params_

The test accuracy score of the grid-searched pipeline is: 0.86


{'max_depth': 6.0}

In [6]:
import numpy as np
from sklearn.model_selection import GridSearchCV

params = {"max_depth": np.arange(1, 16)}
search = GridSearchCV(tree_model, params, cv=10)
cv_results_tree_optimal_depth = cross_validate(
    search, data_numerical, target, cv=10, return_estimator=True, n_jobs=2,
)
import numpy as np
from sklearn.model_selection import GridSearchCV

params = {"max_depth": np.arange(1, 16)}
search = GridSearchCV(tree_model, params, cv=10)
cv_results_tree_optimal_depth = cross_validate(
    search, data_numerical, target, cv=10, return_estimator=True, n_jobs=2,
)
for search_cv in cv_results_tree_optimal_depth["estimator"]:
    print(search_cv.best_params_)
print(f"tree accuracy is {cv_results_tree_optimal_depth['test_score'].mean():.2f} +/- {cv_results_tree_optimal_depth['test_score'].std():.2f}\n")

{'max_depth': 6}
{'max_depth': 7}
{'max_depth': 6}
{'max_depth': 6}
{'max_depth': 9}
{'max_depth': 6}
{'max_depth': 6}
{'max_depth': 6}
{'max_depth': 6}
{'max_depth': 8}
tree accuracy is 0.68 +/- 0.07



In [7]:
data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,4,2010,WD,Normal


In [8]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_processor = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1
)

preprocessor = make_column_transformer(
    (categorical_processor, selector(dtype_include=object)),
    ("passthrough", selector(dtype_exclude=object))
)
tree = make_pipeline(preprocessor, DecisionTreeRegressor(max_depth=7, random_state=0))
cv_results = cross_validate(
    tree, data, target, cv=10, return_estimator=True, n_jobs=2
)
cv_results["test_score"].mean()
print(
    "A tree model using both numerical and categorical features is better than a "
    "tree with optimal depth using only numerical features for "
    f"{sum(cv_results['test_score'] > cv_results_tree_optimal_depth['test_score'])} CV "
    "iterations out of 10 folds."
)

A tree model using both numerical and categorical features is better than a tree with optimal depth using only numerical features for 8 CV iterations out of 10 folds.
