In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
# Matplotlib style sheet applied globally for the rest of the notebook.
PLOT_STYLE = "dark_background"
plt.style.use(PLOT_STYLE)

In [2]:
# Read only the columns this notebook works with: three string-valued
# house attributes plus the sale price.
selected_columns = ["Neighborhood", "Exterior1st", "Exterior2nd", "SalePrice"]
data = pd.read_csv("houseprice.csv", usecols=selected_columns)

# Preview the first rows to confirm the load.
data.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [3]:
# Print the number of distinct values in every column, one "name count" line
# per column, in the DataFrame's column order.
for name, n_distinct in data.nunique().items():
    print(name, n_distinct)

Neighborhood 25
Exterior1st 15
Exterior2nd 16
SalePrice 663


In [4]:
# Separate the three predictor columns from the target column.
feature_columns = ["Neighborhood", "Exterior1st", "Exterior2nd"]
features = data[feature_columns]
target = data["SalePrice"]

# Hold out 30% of the rows for testing; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=0
)

# Show the (rows, columns) of both feature partitions.
X_train.shape, X_test.shape

((1022, 3), (438, 3))

In [5]:
from feature_engine.encoding import OneHotEncoder

# Columns to one-hot encode with feature-engine; columns not listed here are
# not passed to the encoder as targets.
encoded_variables = ["Neighborhood", "Exterior1st"]

# Create dummies for five categories per listed variable and keep all of them
# (drop_last=False).
ohe = OneHotEncoder(
    variables=encoded_variables,
    top_categories=5,
    drop_last=False,
)
ohe.fit(X_train)

In [6]:
# Inspect what the fitted feature-engine encoder stored: for each encoded
# variable, the list of categories that will receive their own dummy column.
ohe.encoder_dict_

{'Neighborhood': ['NAmes', 'CollgCr', 'OldTown', 'Edwards', 'Sawyer'],
 'Exterior1st': ['VinylSd', 'HdBoard', 'Wd Sdng', 'MetalSd', 'Plywood']}

In [7]:
# Encode the training split with the fitted feature-engine encoder.
# In the result, Exterior2nd is left as-is (it was not listed in `variables`),
# and a row whose category is not among the stored ones gets 0 in every dummy
# column of that variable.
ohe.transform(X_train)

Unnamed: 0,Exterior2nd,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Sawyer,Exterior1st_VinylSd,Exterior1st_HdBoard,Exterior1st_Wd Sdng,Exterior1st_MetalSd,Exterior1st_Plywood
64,VinylSd,0,1,0,0,0,1,0,0,0,0
682,Wd Sdng,0,0,0,0,0,0,0,1,0,0
960,Plywood,0,0,0,0,0,0,0,1,0,0
1384,Wd Shng,0,0,0,1,0,0,0,0,0,0
1100,Wd Sdng,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
763,VinylSd,0,0,0,0,0,1,0,0,0,0
835,HdBoard,0,0,0,0,1,1,0,0,0,0
1216,VinylSd,0,0,0,0,1,1,0,0,0,0
559,VinylSd,0,0,0,0,0,1,0,0,0,0


In [8]:
# Import scikit-learn's encoder under an alias. It shares its class name with
# the feature-engine encoder this notebook imports as `OneHotEncoder`; a plain
# import would silently rebind that name for every cell executed afterwards.
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

# No column subset is given, so every column of X_train is encoded.
# `max_categories=5` caps the output at five columns per feature, and the
# aggregated "infrequent" column counts toward that cap: each feature ends up
# with four named categories plus one `<feature>_infrequent_sklearn` column.
# `handle_unknown="infrequent_if_exist"` sends categories unseen during fit to
# that same infrequent column when one exists.
# `sparse_output=False` with `set_output(transform="pandas")` makes
# `transform` return a dense pandas DataFrame with named columns.
ohe_sk = SklearnOneHotEncoder(
    handle_unknown="infrequent_if_exist",
    max_categories=5,
    sparse_output=False,
).set_output(transform="pandas")
ohe_sk.fit(X_train)

In [9]:
# List, for each input column in order, the categories the scikit-learn encoder
# grouped into its "infrequent" bucket instead of giving them their own column.
ohe_sk.infrequent_categories_

[array(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'Crawfor',
        'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NPkVill', 'NWAmes',
        'NoRidge', 'NridgHt', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst',
        'StoneBr', 'Timber', 'Veenker'], dtype=object),
 array(['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd',
        'ImStucc', 'Plywood', 'Stone', 'Stucco', 'WdShing'], dtype=object),
 array(['AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock', 'CmentBd',
        'ImStucc', 'Other', 'Plywood', 'Stone', 'Stucco', 'Wd Shng'],
       dtype=object)]

In [10]:
# Encode the training split with the fitted scikit-learn encoder. Each input
# column becomes four named dummy columns plus one `*_infrequent_sklearn`
# column, with float 0.0/1.0 values.
ohe_sk.transform(X_train)

Unnamed: 0,Neighborhood_CollgCr,Neighborhood_Edwards,Neighborhood_NAmes,Neighborhood_OldTown,Neighborhood_infrequent_sklearn,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_infrequent_sklearn,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_infrequent_sklearn
64,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
682,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
960,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1384,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1100,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
835,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1216,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
559,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
# Apply the encoder fitted on the training split to the held-out test split;
# the output has the same dummy columns as the training-split result.
ohe_sk.transform(X_test)

Unnamed: 0,Neighborhood_CollgCr,Neighborhood_Edwards,Neighborhood_NAmes,Neighborhood_OldTown,Neighborhood_infrequent_sklearn,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_infrequent_sklearn,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_infrequent_sklearn
529,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
491,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
459,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
279,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
655,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
445,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
654,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1280,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
