# 1. Data reading & splitting

In [1]:
import pandas as pd
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames, so the transformed
# data shown in later cells keeps its index and column names.
set_config(transform_output='pandas')

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Turn the Google Drive sharing link into a direct-download link:
# the file id is the second-to-last segment of the sharing URL.
url = "https://drive.google.com/file/d/1Oz-3SydbpviwluuD26hc-M4YrvMtdRnI/view?usp=sharing"
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id
data = pd.read_csv(path)

# Target (y) is the "Expensive" column; features (X) are every other column.
y = data["Expensive"]
X = data.drop(columns="Expensive")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [8]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
318,9900,90.0,1347,4,1,0,3,340,0,RL,Norm,GasA,Pave,Y,PConc
580,14585,,1144,3,2,0,2,216,0,RL,Norm,GasA,Pave,Y,CBlock
961,12227,,1330,4,1,0,2,550,0,RL,PosN,GasA,Pave,Y,CBlock
78,10778,72.0,1768,4,0,0,0,0,0,RL,Norm,GasA,Pave,N,CBlock
5,14115,85.0,796,1,0,0,2,40,0,RL,Norm,GasA,Pave,Y,Wood


In [13]:
# List the distinct values of the 'Foundation' column in the full dataset.
data['Foundation'].unique()

array(['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab', 'Stone'], dtype=object)

# Dealing with NaNs (Imputer)

In [14]:
# Split the training features by dtype: non-numeric vs. numeric columns
X_train_cat = X_train.select_dtypes(exclude='number')
X_train_num = X_train.select_dtypes(include='number')

# One imputer per group: most frequent value for categories, mean for numbers
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

# Fit each imputer on its own group of columns and fill the missing values
X_cat_imputed = cat_imputer.fit_transform(X_train_cat)
X_num_imputed = num_imputer.fit_transform(X_train_num)

# 2. OneHot Encoding (Categorical encoding)

## "Manual" **approach**

Fitting the `OneHotEncoder`

We have to:
1. Import it
2. Initialize it
3. Fit it to the data
4. Use it to transform the data

In [21]:
# Preview the categorical columns after imputation.
X_cat_imputed.head()

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
318,RL,Norm,GasA,Pave,Y,PConc
580,RL,Norm,GasA,Pave,Y,CBlock
961,RL,PosN,GasA,Pave,Y,CBlock
78,RL,Norm,GasA,Pave,N,CBlock
5,RL,Norm,GasA,Pave,Y,Wood


In [22]:
from sklearn.preprocessing import OneHotEncoder

# 2. Initialize the encoder: dense output (not a sparse matrix), and
#    handle_unknown='ignore' for categories that were not seen during fit
my_onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# 3. Fit: learn the categories present in each imputed categorical column
my_onehot.fit(X_cat_imputed)

# 4. Transform: replace each categorical column by one 0/1 column per category
X_cat_imputed_onehot = my_onehot.transform(X_cat_imputed)

In [23]:
# Preview the one-hot encoded categorical columns.
X_cat_imputed_onehot.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,...,Street_Grvl,Street_Pave,CentralAir_N,CentralAir_Y,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood
318,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
580,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
961,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
78,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
# (rows, columns) of the one-hot encoded frame.
X_cat_imputed_onehot.shape

(1168, 30)

In [29]:
# Preview the numerical columns after mean imputation.
X_num_imputed.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
318,9900.0,90.0,1347.0,4.0,1.0,0.0,3.0,340.0,0.0
580,14585.0,69.58427,1144.0,3.0,2.0,0.0,2.0,216.0,0.0
961,12227.0,69.58427,1330.0,4.0,1.0,0.0,2.0,550.0,0.0
78,10778.0,72.0,1768.0,4.0,0.0,0.0,0.0,0.0,0.0
5,14115.0,85.0,796.0,1.0,0.0,0.0,2.0,40.0,0.0


In [27]:
# Put the imputed numerical columns and the one-hot columns side by side;
# rows are aligned on the shared training-set index.
imputed_parts = [X_num_imputed, X_cat_imputed_onehot]
X_imputed = pd.concat(imputed_parts, axis="columns")
X_imputed.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning_C (all),...,Street_Grvl,Street_Pave,CentralAir_N,CentralAir_Y,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood
318,9900.0,90.0,1347.0,4.0,1.0,0.0,3.0,340.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
580,14585.0,69.58427,1144.0,3.0,2.0,0.0,2.0,216.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
961,12227.0,69.58427,1330.0,4.0,1.0,0.0,2.0,550.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
78,10778.0,72.0,1768.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,14115.0,85.0,796.0,1.0,0.0,0.0,2.0,40.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
# (rows, columns) after joining the numerical and the one-hot columns.
X_imputed.shape

(1168, 39)

## "Automated" **approach** (Using Pipelines)

In the manual approach, to encode the categorical columns numerically, we have:

1. Selected the categorical columns.
2. Fitted a `OneHotEncoder` to them.
3. Transformed the categorical columns with the encoder.
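4. Converted the sparse matrix into a dataframe (in this notebook that was done for us by `sparse_output=False` together with `set_config(transform_output='pandas')`).
5. Recovered the names of the columns (also handled automatically by the pandas output setting).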
6. Concatenated the one-hot columns with the numerical columns.

All these steps can be condensed by using Scikit-Learn Pipelines and, specifically, something called a `ColumnTransformer`, which allows us to apply different transformations to two or more groups of columns: in our case, the categorical and the numerical columns.

This process is also called creating "branches" in the pipeline. One branch for the categorical columns and another for the numerical columns. Each branch will contain as many transformers as we want. Then, the branches will meet again, and the transformed columns will be automatically concatenated.

In [30]:
# Column names of each group, taken from the dtypes of the feature frame
X_num_columns = X.select_dtypes(include="number").columns
X_cat_columns = X.select_dtypes(exclude="number").columns

# Numerical branch: fill missing values with the column mean
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Categorical branch: fill missing values with the constant "N_A",
# then one-hot encode
categoric_pipe = make_pipeline(
    SimpleImputer(fill_value="N_A", strategy="constant"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)

In [32]:
# Display the categorical branch (imputer followed by one-hot encoder).
categoric_pipe

In [33]:
# Display the numerical branch (mean imputer only).
numeric_pipe

Using `make_column_transformer` for a pipeline with 2 branches (the `preprocessor`)

We simply tell the pipeline the following:

- One branch will apply the steps in the `numeric_pipe` to the columns named in `X_num_columns`
- The second branch will apply the steps in the `categoric_pipe` to the columns named in `X_cat_columns`

In [34]:
from sklearn.compose import make_column_transformer

# Each branch is a (pipeline, columns) pair. Keep this order: later cells
# address the branches by their auto-generated names 'pipeline-1' (numerical)
# and 'pipeline-2' (categorical).
numeric_branch = (numeric_pipe, X_num_columns)
categoric_branch = (categoric_pipe, X_cat_columns)
preprocessor = make_column_transformer(numeric_branch, categoric_branch)

In [35]:
# Display the two-branch preprocessor.
preprocessor

Creating the `full_pipeline` (`preprocessor` + Decision Tree)

Pipelines are modular. The `preprocessor` we created above with `make_column_transformer` can now become a step in a new pipeline, which we'll call `full_pipeline` and which will include, as its last step, a Decision Tree model:

In [36]:
# Final pipeline: preprocessing followed by the classifier.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and therefore the scores and grid-search results computed from it,
# are reproducible from one run of the notebook to the next.
full_pipeline = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(random_state=123),
)

In [37]:
# Display the assembled pipeline: preprocessor followed by the decision tree.
full_pipeline

In [38]:
# We only fit the pipeline once it has been fully assembled: a single call
# on the training data fits the preprocessing steps and the tree together.
full_pipeline.fit(X_train, y_train)

In [39]:
# Predict on the same rows the pipeline was fitted on (training set).
pred_train= full_pipeline.predict(X_train)

In [40]:
# Accuracy on the training set (compare with the test accuracy to gauge overfitting).
accuracy_score(y_train, pred_train)

1.0

In [41]:
# Predict on the held-out test set; the raw X_test goes through the same
# fitted preprocessing steps inside the pipeline.
pred_test= full_pipeline.predict(X_test)

In [42]:
# Accuracy on the held-out test set.
accuracy_score(y_test, pred_test)

0.8835616438356164

# Accessing steps of the pipeline

In [44]:
# The first step in our pipeline is the ColumnTransformer.

# We can access it by position...
full_pipeline[0]

In [47]:
# ...or by its name (the step name generated for the ColumnTransformer).
full_pipeline['columntransformer']

In [52]:
# Drill down: pipeline -> column transformer -> 'pipeline-2' branch -> its imputer.
full_pipeline.named_steps.columntransformer.named_transformers_['pipeline-2'].named_steps['simpleimputer']

In [53]:
# Per-column statistics learned by the imputer of the 'pipeline-1' (numerical) branch.
full_pipeline.named_steps.columntransformer.named_transformers_['pipeline-1'].named_steps['simpleimputer'].statistics_

array([1.03530342e+04, 6.95842697e+01, 1.06113784e+03, 2.87157534e+00,
       6.05308219e-01, 3.44863014e+00, 1.75941781e+00, 9.70890411e+01,
       1.42636986e+01])

In [54]:
# Column names handled by the imputer of the numerical branch ('pipeline-1')
(
    full_pipeline["columntransformer"]
    .named_transformers_["pipeline-1"]["simpleimputer"]
    .get_feature_names_out()
)

array(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr',
       'Fireplaces', 'PoolArea', 'GarageCars', 'WoodDeckSF',
       'ScreenPorch'], dtype=object)

In [55]:
# One-hot column names produced by the encoder of the categorical branch ('pipeline-2')
(
    full_pipeline["columntransformer"]
    .named_transformers_["pipeline-2"]["onehotencoder"]
    .get_feature_names_out()
)

array(['MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
       'MSZoning_RM', 'Condition1_Artery', 'Condition1_Feedr',
       'Condition1_Norm', 'Condition1_PosA', 'Condition1_PosN',
       'Condition1_RRAe', 'Condition1_RRAn', 'Condition1_RRNe',
       'Condition1_RRNn', 'Heating_Floor', 'Heating_GasA', 'Heating_GasW',
       'Heating_Grav', 'Heating_OthW', 'Heating_Wall', 'Street_Grvl',
       'Street_Pave', 'CentralAir_N', 'CentralAir_Y', 'Foundation_BrkTil',
       'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab',
       'Foundation_Stone', 'Foundation_Wood'], dtype=object)

In [56]:
# Categories the encoder learned, one array per categorical column
(
    full_pipeline["columntransformer"]
    .named_transformers_["pipeline-2"]["onehotencoder"]
    .categories_
)

[array(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
        'RRNn'], dtype=object),
 array(['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'], dtype=object),
 array(['Grvl', 'Pave'], dtype=object),
 array(['N', 'Y'], dtype=object),
 array(['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'], dtype=object)]

# Including the imputer in the GridSearchCV

In [57]:
from sklearn.model_selection import GridSearchCV

# Search space. Keys follow <step>__<sub-step>__<parameter>, using the step
# names that were generated automatically when the pipeline was assembled.
param_grid = {
    # imputation strategy of the numerical branch
    "columntransformer__pipeline-1__simpleimputer__strategy": [
        "mean",
        "median",
        "most_frequent",
    ],
    # tree complexity
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2),
}

# 5-fold cross-validated grid search over the whole pipeline
search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)
search.fit(X_train, y_train)

# Best parameter combination found by the search
best_param = search.best_params_
best_param

Fitting 5 folds for each of 90 candidates, totalling 450 fits


{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 9}

In [58]:
# Mean cross-validated score of the best parameter combination.
search.best_score_

np.float64(0.9186786985070248)

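The main distinction between `Pipeline` and `make_pipeline` is that with `Pipeline` you name the steps yourself, whereas `make_pipeline` generates the step names automatically. The same is true of `ColumnTransformer` compared with `make_column_transformer`. When we then want to access certain parts of our pipeline, we use those custom names instead of the generated ones.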

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Rebuild the same pipeline, this time giving every step a custom name.

# Numerical branch: imputer with its default settings
numeric_pipe = Pipeline([
    ('num-imputer', SimpleImputer())
])

# Categorical branch: constant-value imputation, then one-hot encoding
categoric_pipe = Pipeline([
    ('cat-imputer', SimpleImputer(strategy='constant', fill_value='N_A')),
    ('oh-encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Route each group of columns through its own branch
preprocessor = ColumnTransformer([
    ('num-pipe', numeric_pipe, X_num_columns),
    ('cat-pipe', categoric_pipe, X_cat_columns)
])

# random_state is fixed (same seed as the train/test split) so the fitted tree
# is reproducible across runs.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeClassifier(random_state=123))
])

# Custom names make the parameter grid more readable.
# Keys follow <step>__<sub-step>__<parameter>.
param_grid = {
    "preprocessor__num-pipe__num-imputer__strategy":["mean", "median"],
    "preprocessor__cat-pipe__cat-imputer__strategy":["constant", "most_frequent"],
    "tree__max_depth": range(2, 14, 2),
    "tree__min_samples_leaf": range(3, 12, 2),
    # The split-quality parameter of DecisionTreeClassifier is `criterion`.
    # The tree has no `strategy` parameter, so a "tree__strategy" key would make
    # a grid search fail with an invalid-parameter error.
    "tree__criterion": ["gini","entropy"]
}

# Accessing the steps is more readable as well
full_pipeline.fit(X_train, y_train)
full_pipeline['preprocessor']['cat-pipe']['oh-encoder'].categories_

[array(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
        'RRNn'], dtype=object),
 array(['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'], dtype=object),
 array(['Grvl', 'Pave'], dtype=object),
 array(['N', 'Y'], dtype=object),
 array(['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'], dtype=object)]