In [42]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

data = pd.read_csv("houseprice.csv",
                   usecols=["Neighborhood",
                            "Exterior1st",
                            "Exterior2nd",
                            "SalePrice"])

data.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [43]:
for col in data.columns:
    print(col, ": ", len(data[col].unique()), " labels")

Neighborhood :  25  labels
Exterior1st :  15  labels
Exterior2nd :  16  labels
SalePrice :  663  labels


In [44]:
data["Neighborhood"].unique()

array(['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst',
       'NWAmes', 'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes',
       'SawyerW', 'IDOTRR', 'MeadowV', 'Edwards', 'Timber', 'Gilbert',
       'StoneBr', 'ClearCr', 'NPkVill', 'Blmngtn', 'BrDale', 'SWISU',
       'Blueste'], dtype=object)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    data[["Neighborhood", "Exterior1st", "Exterior2nd"]],
    data["SalePrice"],
    test_size=0.3,
    random_state=0
)

X_train.shape, X_test.shape

((1022, 3), (438, 3))

In [46]:
pd.get_dummies(X_train, drop_first=True).shape

(1022, 53)

In [47]:
X_train["Neighborhood"].value_counts().sort_values(ascending=False).head(10)

Neighborhood
NAmes      151
CollgCr    105
OldTown     73
Edwards     71
Sawyer      61
Somerst     56
Gilbert     55
NWAmes      51
NridgHt     51
SawyerW     45
Name: count, dtype: int64

In [48]:
top10 = [x for x in X_train["Neighborhood"].value_counts().sort_values(ascending=False).head(10).index]
top10

['NAmes',
 'CollgCr',
 'OldTown',
 'Edwards',
 'Sawyer',
 'Somerst',
 'Gilbert',
 'NWAmes',
 'NridgHt',
 'SawyerW']

In [49]:
for label in top10:
    X_train["Neighborhood" + "_" + label] = np.where(
        X_train["Neighborhood"] == label, 1, 0
    )

    X_test["Neighborhood" + "_" + label] = np.where(
        X_test["Neighborhood"] == label, 1, 0
    )

# let's visualise the result
X_train[["Neighborhood"] + ["Neighborhood" + "_" + c for c in top10]].head(10)

Unnamed: 0,Neighborhood,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Sawyer,Neighborhood_Somerst,Neighborhood_Gilbert,Neighborhood_NWAmes,Neighborhood_NridgHt,Neighborhood_SawyerW
64,CollgCr,0,1,0,0,0,0,0,0,0,0
682,ClearCr,0,0,0,0,0,0,0,0,0,0
960,BrkSide,0,0,0,0,0,0,0,0,0,0
1384,Edwards,0,0,0,1,0,0,0,0,0,0
1100,SWISU,0,0,0,0,0,0,0,0,0,0
416,Sawyer,0,0,0,0,1,0,0,0,0,0
1034,Crawfor,0,0,0,0,0,0,0,0,0,0
853,NAmes,1,0,0,0,0,0,0,0,0,0
472,Edwards,0,0,0,1,0,0,0,0,0,0
1011,Edwards,0,0,0,1,0,0,0,0,0,0


In [50]:
def calculate_top_categories(df, variable, how_many=10):
    return [
        x
        for x in df[variable]
        .value_counts()
        .sort_values(ascending=False)
        .head(how_many)
        .index
    ]


def one_hot_encode(train, test, variable, top_x_labels):

    for label in top_x_labels:
        train[variable + "_" + label] = np.where(train[variable] == label, 1, 0)

        test[variable + "_" + label] = np.where(test[variable] == label, 1, 0)

In [51]:
for var in ["Exterior1st", "Exterior2nd"]:
    top_categories = calculate_top_categories(X_train, var, how_many=10)
    one_hot_encode(X_train, X_test, var, top_categories)


In [52]:
X_test[X_test["Exterior1st"] == "VinylSd"]

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Sawyer,Neighborhood_Somerst,Neighborhood_Gilbert,...,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_CmentBd,Exterior2nd_Wd Shng,Exterior2nd_BrkFace,Exterior2nd_AsbShng,Exterior2nd_Stucco
1403,Somerst,VinylSd,VinylSd,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1317,Somerst,VinylSd,VinylSd,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1003,NWAmes,VinylSd,VinylSd,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1300,Gilbert,VinylSd,VinylSd,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
89,CollgCr,VinylSd,VinylSd,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,NAmes,VinylSd,VinylSd,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
320,NridgHt,VinylSd,VinylSd,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
686,Somerst,VinylSd,VinylSd,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1280,CollgCr,VinylSd,VinylSd,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [28]:
np.where(
            X_test["Neighborhood"] == "VinylSd", 1, 0
        )

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [29]:
X_test

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Sawyer,Neighborhood_Somerst,Neighborhood_Gilbert,Neighborhood_NWAmes,Neighborhood_NridgHt,Neighborhood_SawyerW
529,Crawfor,Wd Sdng,Stone,0,0,0,0,0,0,0,0,0,0
491,NAmes,Wd Sdng,Wd Sdng,1,0,0,0,0,0,0,0,0,0
459,BrkSide,MetalSd,MetalSd,0,0,0,0,0,0,0,0,0,0
279,ClearCr,Plywood,Plywood,0,0,0,0,0,0,0,0,0,0
655,BrDale,HdBoard,ImStucc,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,ClearCr,Plywood,Plywood,0,0,0,0,0,0,0,0,0,0
445,Edwards,Wd Sdng,Wd Sdng,0,0,0,1,0,0,0,0,0,0
654,NoRidge,MetalSd,MetalSd,0,0,0,0,0,0,0,0,0,0
1280,CollgCr,VinylSd,VinylSd,0,1,0,0,0,0,0,0,0,0


In [55]:
# Using sklearn

from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv("houseprice.csv",
                   usecols=["Neighborhood",
                            "Exterior1st",
                            "Exterior2nd",
                            "SalePrice"])

data.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [56]:
ohe_enc = OneHotEncoder(
    handle_unknown="infrequent_if_exist",
    max_categories=5,
    sparse_output=False
)

ohe_enc.set_output(transform="pandas")

ohe_enc.fit(X_train)

In [57]:
ohe_enc.infrequent_categories_

[array(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'Crawfor',
        'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NPkVill', 'NWAmes',
        'NoRidge', 'NridgHt', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst',
        'StoneBr', 'Timber', 'Veenker'], dtype=object),
 array(['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd',
        'ImStucc', 'Plywood', 'Stone', 'Stucco', 'WdShing'], dtype=object),
 array(['AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock', 'CmentBd',
        'ImStucc', 'Other', 'Plywood', 'Stone', 'Stucco', 'Wd Shng'],
       dtype=object),
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [58]:
X_train = ohe_enc.transform(X_train)
X_test = ohe_enc.transform(X_test)

X_train.head()

Unnamed: 0,Neighborhood_CollgCr,Neighborhood_Edwards,Neighborhood_NAmes,Neighborhood_OldTown,Neighborhood_infrequent_sklearn,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_infrequent_sklearn,...,Exterior2nd_CmentBd_0,Exterior2nd_CmentBd_1,Exterior2nd_Wd Shng_0,Exterior2nd_Wd Shng_1,Exterior2nd_BrkFace_0,Exterior2nd_BrkFace_1,Exterior2nd_AsbShng_0,Exterior2nd_AsbShng_1,Exterior2nd_Stucco_0,Exterior2nd_Stucco_1
64,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
682,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
960,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1384,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1100,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
