# Housing Data 5: Categorical Encoding

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

In [3]:
def gd_path(file_id):
    """Build the direct-download URL for a Google Drive file.

    Parameters
    ----------
    file_id
        Identifier of the file on Google Drive; it is inserted into the URL
        as the value of the ``id`` query parameter.

    Returns
    -------
    str
        The download URL for ``file_id``.
    """
    url_template = "https://drive.google.com/uc?export=download&id={}"
    return url_template.format(file_id)

# Google Drive file ids of the datasets used in this notebook, keyed by name.
files_id = {
    'housing_data': "1Z6M90HU3Dx-qF_eHLxLASpCEEjGj9bEL",
}

# Download the housing CSV through its Google Drive link and load it into a DataFrame.
housing_data = pd.read_csv(gd_path(files_id['housing_data']), sep=",")

Data Description
1. `MSZoning`: Identifies the general zoning classification of the sale.
2. `Condition1`: Proximity to various conditions.
3. `Heating`: Type of heating.
4. `Street`: Type of road access to property.
5. `CentralAir`: Central air conditioning.
6. `Foundation`: Type of foundation.
7. `ExterQual`: Evaluates the quality of the material on the exterior.
8. `ExterCond`: Evaluates the present condition of the material on the exterior.
9. `BsmtQual`: Evaluates the height of the basement.
10. `BsmtCond`: Evaluates the general condition of the basement.
11. `BsmtExposure`: Refers to walkout or garden level walls.
12. `BsmtFinType1`: Rating of basement finished area.
13. `KitchenQual`: Kitchen quality.
14. `FireplaceQu`: Fireplace quality.
15. `MSSubClass`: Identifies the type of dwelling involved in the sale.
16. `OverallQual`: Rates the overall material and finish of the house.
17. `OverallCond`: Rates the overall condition of the house.
18. `YearBuilt`: Original construction date.
19. `YearRemodAdd`: Remodel date (same as construction date if no remodeling or additions).
20. `MasVnrArea`: Masonry veneer area in square feet.
21. `BsmtFinSF1`: Type 1 finished square feet.
22. `BsmtFinSF2`: Type 2 finished square feet.
23. `BsmtUnfSF`: Unfinished square feet of basement area.
24. `1stFlrSF`: First Floor square feet.
25. `2ndFlrSF`: Second floor square feet.
26. `LowQualFinSF`: Low quality finished square feet (all floors).
27. `GrLivArea`: Above grade (ground) living area square feet.
28. `BsmtFullBath`: Basement full bathrooms.
29. `BsmtHalfBath`: Basement half bathrooms.
30. `FullBath`: Full bathrooms above grade.
31. `HalfBath`: Half baths above grade.
32. `KitchenAbvGr`: Kitchens above grade.
33. `TotRmsAbvGrd`: Total rooms above grade (does not include bathrooms).
34. `GarageYrBlt`: Year garage was built.
35. `GarageArea`: Size of garage in square feet.
36. `OpenPorchSF`: Open porch area in square feet.
37. `EnclosedPorch`: Enclosed porch area in square feet.
38. `3SsnPorch`: Three season porch area in square feet.
39. `MiscVal`: Dollar value of miscellaneous feature.
40. `MoSold`: Month Sold (MM).
41. `YrSold`: Year Sold (YYYY).


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns.
# The `count` row also reveals which numerical columns contain missing values.
housing_data.describe()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1379.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,70.049958,1057.429452,2.866438,0.613014,2.758904,1.767123,94.244521,15.060959,0.14863,...,1.046575,6.517808,1978.506164,472.980137,46.660274,21.95411,3.409589,43.489041,6.321918,2007.815753
std,9981.264932,24.284752,438.705324,0.815778,0.644666,40.177307,0.747315,125.338794,55.757415,0.355845,...,0.220338,1.625393,24.689725,213.804841,66.256028,61.119149,29.317331,496.123024,2.703626,1.328095
min,1300.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,1900.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,7553.5,59.0,795.75,2.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,5.0,1961.0,334.5,0.0,0.0,0.0,0.0,5.0,2007.0
50%,9478.5,69.0,991.5,3.0,1.0,0.0,2.0,0.0,0.0,0.0,...,1.0,6.0,1980.0,480.0,25.0,0.0,0.0,0.0,6.0,2008.0
75%,11601.5,80.0,1298.25,3.0,1.0,0.0,2.0,168.0,0.0,0.0,...,1.0,7.0,2002.0,576.0,68.0,0.0,0.0,0.0,8.0,2009.0
max,215245.0,313.0,6110.0,8.0,3.0,738.0,4.0,857.0,480.0,1.0,...,3.0,14.0,2010.0,1418.0,547.0,552.0,508.0,15500.0,12.0,2010.0


In [4]:
# Peek at 5 randomly chosen rows. The seed is fixed so that the same rows are
# shown every time the notebook is re-run.
housing_data.sample(5, random_state=31416)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
2,11250,68.0,920,3,1,0,2,0,0,0,...,1,6,2001.0,608,42,0,0,0,9,2008
1241,9849,83.0,1689,3,0,0,3,0,0,0,...,1,7,2007.0,954,56,0,0,0,6,2007
48,4456,33.0,736,2,0,0,0,0,0,0,...,3,8,,0,0,102,0,0,6,2009
572,13159,83.0,846,3,0,0,2,208,0,0,...,1,6,2009.0,650,114,0,0,0,7,2009
676,9600,60.0,1095,4,0,0,3,0,0,0,...,2,8,1920.0,779,0,90,0,0,5,2006


## One Hot encoding

Fitting the `OneHotEncoder`

As with any transformer, we have to:
1. Import it
2. Initialize it
3. Fit it to the data
4. Use it to transform the data

In [13]:
# import
from sklearn.preprocessing import OneHotEncoder

# initialize
# drop="first" leaves the first category of every feature out of the output;
# sparse_output=False returns a dense result, which is required for
# .set_output(transform='pandas') to give us a DataFrame.
my_onehot = OneHotEncoder(drop="first",sparse_output=False).set_output(transform='pandas')

# fit
# NOTE(review): X_cat_imputed is only created further down, in the
# "Replacing NaNs in categorical features" section, so on a fresh kernel this
# cell raises a NameError when the notebook is run from top to bottom.
my_onehot.fit(X_cat_imputed)

# transform
X_cat_imputed_onehot = my_onehot.transform(X_cat_imputed)

NOTE: If we leave `sparse_output=True`, the result will be a "sparse matrix": an object that Scikit-Learn creates when a matrix contains mostly zeros. In that case we would not be able to use `.set_output(transform='pandas')`.

In [14]:
# Preview the first rows of the one-hot encoded categorical features.
X_cat_imputed_onehot.head()

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,...,BsmtFinType1_Rec,BsmtFinType1_Unf,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_N_A,FireplaceQu_Po,FireplaceQu_TA
772,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
157,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
360,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
744,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
150,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Split the Data

In [6]:
# Separate the features from the target without mutating `housing_data`.
# (`housing_data.pop('Expensive')` would remove the column in place, so
# re-running this cell would raise a KeyError and `X` would silently alias
# the modified frame.)
X = housing_data.drop(columns="Expensive")
y = housing_data["Expensive"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)

### Replacing NaNs in categorical features

In our preprocessing pipeline for numerical features, we replaced NaNs with the column mean. Categorical values pose a problem: they don't have a "mean". Here, we will replace NaNs with a string that marks them: "N_A". It is not an elegant solution, but it will allow us to move forward.

#### Categorical features

In [10]:
# Keep only the non-numerical (categorical) columns of the training split.
X_train_cat = X_train.select_dtypes(exclude="number")

# Constant-value imputer: every missing entry becomes the marker string "N_A".
cat_imputer = SimpleImputer(strategy="constant", fill_value="N_A")
# Ask for a DataFrame (instead of a NumPy array) as the transform output.
cat_imputer.set_output(transform="pandas")

# Fit on the categorical training columns and fill them in one call.
X_cat_imputed = cat_imputer.fit_transform(X_train_cat)

X_cat_imputed.head()

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
772,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,Av,ALQ,TA,TA
157,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Ex,TA,No,Unf,Ex,Gd
360,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,Gd,TA,Av,GLQ,TA,TA
744,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,No,GLQ,Gd,TA
150,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,Av,BLQ,TA,N_A


#### Numerical features

In [11]:
# Keep only the numerical columns of the training split.
X_train_num = X_train.select_dtypes(include="number")

# Mean imputer: every missing entry becomes the mean of its column.
num_imputer = SimpleImputer(strategy="mean")
# Ask for a DataFrame (instead of a NumPy array) as the transform output.
num_imputer.set_output(transform="pandas")

# Learn the column means and fill the gaps in one call.
X_num_imputed = num_imputer.fit_transform(X_train_num)

X_num_imputed.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSSubClass,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
772,7819.0,94.0,1029.0,3.0,1.0,0.0,2.0,144.0,0.0,80.0,...,1.0,6.0,1976.0,672.0,0.0,0.0,0.0,0.0,3.0,2010.0
157,12003.0,92.0,774.0,4.0,1.0,0.0,3.0,0.0,0.0,60.0,...,1.0,8.0,2009.0,680.0,75.0,0.0,0.0,0.0,5.0,2010.0
360,7540.0,70.10352,888.0,2.0,1.0,0.0,2.0,0.0,192.0,85.0,...,1.0,5.0,1978.0,470.0,0.0,0.0,0.0,0.0,6.0,2007.0
744,5395.0,41.0,1337.0,2.0,1.0,0.0,2.0,96.0,0.0,120.0,...,1.0,5.0,1993.0,462.0,0.0,70.0,168.0,0.0,10.0,2008.0
150,10356.0,120.0,969.0,3.0,0.0,0.0,2.0,0.0,0.0,20.0,...,1.0,5.0,1975.0,440.0,0.0,0.0,0.0,0.0,1.0,2007.0


#### Concatenating all columns

In [12]:
# Put the imputed categorical and numerical columns side by side
# (rows are aligned on the shared index).
X_imputed = pd.concat(
    objs=[X_cat_imputed, X_num_imputed],
    axis="columns",
)

X_imputed.head()

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
772,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,...,1.0,6.0,1976.0,672.0,0.0,0.0,0.0,0.0,3.0,2010.0
157,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Ex,TA,...,1.0,8.0,2009.0,680.0,75.0,0.0,0.0,0.0,5.0,2010.0
360,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,Gd,TA,...,1.0,5.0,1978.0,470.0,0.0,0.0,0.0,0.0,6.0,2007.0
744,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,...,1.0,5.0,1993.0,462.0,0.0,70.0,168.0,0.0,10.0,2008.0
150,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,...,1.0,5.0,1975.0,440.0,0.0,0.0,0.0,0.0,1.0,2007.0


## Pipeline Encoding

In [16]:
# Column names of the categorical (non-numerical) and the numerical features.
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Numerical branch: fill missing values with the column mean.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))

# Categorical branch: mark missing values with "N_A", then one-hot encode.
# handle_unknown="ignore" encodes a category that was not present when the
# encoder was fitted as all zeros instead of raising an error, so the fitted
# pipeline can also transform rows from outside the training split.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown="ignore")
)

### 3.2. Using `ColumnTransformer` to build a pipeline with 2 branches (the `preprocessor`)

We simply tell the pipeline the following:

- One branch, called `"num_pipe"`, will apply the steps in the `numeric_pipe` to the columns named in `X_num_columns`
- The second branch, called `"cat_pipe"`, will apply the steps in the `categoric_pipe` to the columns named in `X_cat_columns`

In [17]:
from sklearn.compose import ColumnTransformer

# Two branches, each given as (name, transformer, columns):
# the numerical pipeline handles the numerical columns and the categorical
# pipeline handles the categorical ones.
preprocessor = ColumnTransformer([
    ("num_pipe", numeric_pipe, X_num_columns),
    ("cat_pipe", categoric_pipe, X_cat_columns),
])

## Creating the `full_pipeline` (`preprocessor` + Decision Tree)

In [18]:
# Chain the preprocessor with the model. The tree gets a fixed random_state
# so that fitting is deterministic and re-running the notebook reproduces the
# same model.
full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier(random_state=31416)).set_output(transform='pandas')

We can then fit this `full_pipeline` to the data:

Note: notice that we did not fit the `preprocessor` before — we only fit the pipeline once it has been fully assembled.

In [19]:
# Fit the preprocessing branches and the decision tree in a single call, using only the training split.
full_pipeline.fit(X_train, y_train)

This full pipeline can make predictions, like any other pipeline that ends with a model:

In [20]:
# Predictions for the same rows the pipeline was fitted on (the training split).
full_pipeline.predict(X_train)

array([0, 1, 0, ..., 0, 1, 0])

## Accessing steps of the pipeline

The estimators of a pipeline are stored as a list in the steps attribute, but can be accessed by index or name by indexing the Pipeline:

In [21]:
# the first step in our pipeline is the ColumnTransformer (our `preprocessor`)

# we can access it with an index
full_pipeline[0]

In [22]:
# or by its name
full_pipeline['columntransformer']

`named_steps` helps you navigate through the pipeline.   
When using it, autocompletion is available after every step.  
With `get_feature_names_out()` you can get the names of the features that a specific step outputs.

In [23]:
(
full_pipeline                     # Start with the full_pipeline
 .named_steps.columntransformer   # .named_steps gives access to "columntransformer", which is a ColumnTransformer
 .named_transformers_.cat_pipe    # .named_transformers_ gives access to "cat_pipe", which is a pipeline inside "columntransformer"
 .named_steps.simpleimputer       # The second .named_steps gives access to "simpleimputer", which is the desired step
 .get_feature_names_out()         # .get_feature_names_out() returns the names of the features coming out of that step
)

array(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu'],
      dtype=object)

We can also get learned attributes out of any specific step, e.g. the categories from the `onehotencoder` step:

In [24]:
# categories_ lists, for each categorical column, the categories found by the fitted OneHotEncoder.
full_pipeline.named_steps.columntransformer.named_transformers_.cat_pipe.named_steps.onehotencoder.categories_

[array(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
        'RRNn'], dtype=object),
 array(['GasA', 'GasW', 'Grav', 'OthW', 'Wall'], dtype=object),
 array(['Grvl', 'Pave'], dtype=object),
 array(['N', 'Y'], dtype=object),
 array(['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'N_A', 'TA'], dtype=object),
 array(['Fa', 'Gd', 'N_A', 'Po', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'N_A', 'No'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'N_A', 'Rec', 'Unf'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'N_A', 'Po', 'TA'], dtype=object)]