# Housing Data 6: Categorical Encoding

In [12]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [13]:
def gd_path(file_id):
    """Build the direct-download URL for a file stored on Google Drive.

    Parameters
    ----------
    file_id
        The Google Drive id of the file.

    Returns
    -------
    str
        A URL of the form
        ``https://drive.google.com/uc?export=download&id=<file_id>``.
    """
    url_template = "https://drive.google.com/uc?export=download&id={}"
    return url_template.format(file_id)

# Google Drive file ids of the datasets used in this notebook.
files_id = {
    'housing_data': "1VEpP7kLJjlLR9MyTgu2FyFnOJArh6U2b",
}

# Download the housing data and load it into a DataFrame.
housing_data = pd.read_csv(gd_path(files_id['housing_data']), sep=",")


In [14]:
# Separate the features from the target column without mutating `housing_data`,
# so this cell gives the same result every time it is (re-)run.
X = housing_data.drop(columns="Expensive")
y = housing_data["Expensive"]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)

In [17]:
# Select categorical and numerical column names.
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Numerical pipeline: fill missing values with the column mean.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
)

# Categorical pipeline: fill missing values with the constant "N_A",
# then one-hot encode every category.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    # handle_unknown="ignore" encodes categories that were not seen during fit
    # (e.g. ones that only occur in the test split) as all zeros instead of raising.
    OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
)

# Route each group of columns through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns),
    ]
)

## Creating the `full_pipeline` (`preprocessor` + Decision Tree)

In [18]:
# Chain the preprocessor and the model into a single estimator.
# A fixed random_state makes the fitted tree reproducible across runs.
# set_output(transform='pandas') makes the transformer steps return DataFrames.
full_pipeline = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(random_state=31416),
).set_output(transform='pandas')

We can then fit this `full_pipeline` to the data:

Note: we did not fit the `preprocessor` beforehand — we only fit the pipeline once it has been fully assembled.

In [19]:
# Fit the preprocessing steps and the decision tree in one call, using only the training split.
full_pipeline.fit(X_train, y_train)

This full pipeline can make predictions, like any other pipeline that ends with a model:

In [20]:
# Predict on the training rows; the pipeline runs the preprocessor before the tree.
full_pipeline.predict(X_train)

array([0, 1, 0, ..., 0, 1, 0])

## Accessing steps of the pipeline

The estimators of a pipeline are stored as a list in the steps attribute, but can be accessed by index or name by indexing the Pipeline:

In [21]:
# the first step in our pipeline is the ColumnTransformer (`preprocessor`)

# we can access it with an index
full_pipeline[0]

In [22]:
# or by its name (the step name that make_pipeline generated for the ColumnTransformer)
full_pipeline['columntransformer']

`named_steps` helps you navigate through the pipeline.  
When using it, autocompletion is available after every step.  
With `get_feature_names_out()` you can get the column names that a specific estimator used.

In [23]:
(
full_pipeline                     # Start with the full_pipeline
 .named_steps.columntransformer   # .named_steps selects "columntransformer", which is the ColumnTransformer
 .named_transformers_.cat_pipe    # .named_transformers_ selects "cat_pipe", the pipeline inside "columntransformer"
 .named_steps.simpleimputer       # The second .named_steps selects "simpleimputer", which is the desired step
 .get_feature_names_out()         # .get_feature_names_out() returns the names of the features handled by that step
)

array(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu'],
      dtype=object)

We can also get fitted attributes out of any specific step, e.g. the categories learned by the `onehotencoder` step:

In [24]:
# Drill down to the fitted OneHotEncoder inside "cat_pipe" and read `categories_`:
# a list with one array of categories per categorical column.
full_pipeline.named_steps.columntransformer.named_transformers_.cat_pipe.named_steps.onehotencoder.categories_

[array(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
        'RRNn'], dtype=object),
 array(['GasA', 'GasW', 'Grav', 'OthW', 'Wall'], dtype=object),
 array(['Grvl', 'Pave'], dtype=object),
 array(['N', 'Y'], dtype=object),
 array(['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'N_A', 'TA'], dtype=object),
 array(['Fa', 'Gd', 'N_A', 'Po', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'N_A', 'No'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'N_A', 'Rec', 'Unf'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'N_A', 'Po', 'TA'], dtype=object)]