In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Read just the three categorical predictors plus the target from the CSV.
columns_to_load = ["Neighborhood", "Exterior1st", "Exterior2nd", "SalePrice"]
data = pd.read_csv("houseprice.csv", usecols=columns_to_load)

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [3]:
# Report how many distinct values each column holds (the target included).
for name, n_unique in data.nunique().items():
    print(f"{name}  :  {n_unique} labels")

Neighborhood  :  25 labels
Exterior1st  :  15 labels
Exterior2nd  :  16 labels
SalePrice  :  663 labels


In [4]:
# Count missing values per column.
data.isna().sum()

Neighborhood    0
Exterior1st     0
Exterior2nd     0
SalePrice       0
dtype: int64

In [5]:
# Split predictors and target, holding out 30% of the rows for testing.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="SalePrice"),
    data["SalePrice"],
    test_size=0.3,
    random_state=0,
)

# Shapes of the resulting predictor sets.
X_train.shape, X_test.shape

((1022, 3), (438, 3))

In [6]:
# Names of all non-numeric predictor columns, i.e. the ones to encode.
cat_vars = X_train.select_dtypes(exclude="number").columns.tolist()
cat_vars

['Neighborhood', 'Exterior1st', 'Exterior2nd']

In [7]:
# Integer-encode every categorical column with scikit-learn's OrdinalEncoder.
# handle_unknown="use_encoded_value" maps categories that were not present at
# fit time to -1; with the default ("error"), transform() raises on them,
# which would break encoding of a held-out set containing an unseen label.
ord_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

ct = ColumnTransformer(
    [("oe", ord_enc, cat_vars)],
    remainder="passthrough",  # keep any column that is not listed in cat_vars
).set_output(transform="pandas")  # return DataFrames instead of NumPy arrays

In [8]:
# Fit the column transformer on the training predictors and display the
# encoded training frame.
ct.fit_transform(X_train)

Unnamed: 0,oe__Neighborhood,oe__Exterior1st,oe__Exterior2nd
64,5.0,12.0,13.0
682,4.0,13.0,14.0
960,3.0,13.0,10.0
1384,7.0,14.0,15.0
1100,18.0,13.0,14.0
...,...,...,...
763,15.0,12.0,13.0
835,19.0,12.0,6.0
1216,19.0,12.0,13.0
559,0.0,12.0,13.0


In [10]:
# Alias the import: a bare `OrdinalEncoder` here would rebind the
# scikit-learn class imported at the top of the notebook, so re-running the
# ColumnTransformer cell afterwards would build it with a different
# library's encoder.
from feature_engine.encoding import OrdinalEncoder as FeOrdinalEncoder

# feature_engine encoder with "arbitrary" integer assignment, restricted to
# two of the categorical columns; Exterior2nd is returned unchanged.
ord_enc = FeOrdinalEncoder(
    encoding_method="arbitrary",
    variables=["Neighborhood", "Exterior1st"],
)
ord_enc.fit_transform(X_train)

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,0,0,VinylSd
682,1,1,Wd Sdng
960,2,1,Plywood
1384,3,2,Wd Shng
1100,4,1,Wd Sdng
...,...,...,...
763,18,0,VinylSd
835,5,0,HdBoard
1216,5,0,VinylSd
559,22,0,VinylSd


In [11]:
# Inspect the mapping learned by the encoder fitted in the previous cell:
# one dict per encoded variable, from category label to integer code.
ord_enc.encoder_dict_

{'Neighborhood': {'CollgCr': 0,
  'ClearCr': 1,
  'BrkSide': 2,
  'Edwards': 3,
  'SWISU': 4,
  'Sawyer': 5,
  'Crawfor': 6,
  'NAmes': 7,
  'Mitchel': 8,
  'Timber': 9,
  'Gilbert': 10,
  'Somerst': 11,
  'MeadowV': 12,
  'OldTown': 13,
  'BrDale': 14,
  'NWAmes': 15,
  'NridgHt': 16,
  'SawyerW': 17,
  'NoRidge': 18,
  'IDOTRR': 19,
  'NPkVill': 20,
  'StoneBr': 21,
  'Blmngtn': 22,
  'Veenker': 23,
  'Blueste': 24},
 'Exterior1st': {'VinylSd': 0,
  'Wd Sdng': 1,
  'WdShing': 2,
  'HdBoard': 3,
  'MetalSd': 4,
  'AsphShn': 5,
  'BrkFace': 6,
  'Plywood': 7,
  'CemntBd': 8,
  'Stucco': 9,
  'BrkComm': 10,
  'AsbShng': 11,
  'ImStucc': 12,
  'CBlock': 13,
  'Stone': 14}}

In [12]:
# Alias the import: a bare `OrdinalEncoder` here would rebind the
# scikit-learn class imported at the top of the notebook, so re-running the
# ColumnTransformer cell afterwards would build it with a different
# library's encoder.
from category_encoders.ordinal import OrdinalEncoder as CeOrdinalEncoder

# category_encoders ordinal encoder restricted to two of the categorical
# columns; Exterior2nd is returned unchanged.
ord_enc = CeOrdinalEncoder(cols=["Neighborhood", "Exterior1st"])

ord_enc.fit_transform(X_train)

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,1,1,VinylSd
682,2,2,Wd Sdng
960,3,2,Plywood
1384,4,3,Wd Shng
1100,5,2,Wd Sdng
...,...,...,...
763,19,1,VinylSd
835,6,1,HdBoard
1216,6,1,VinylSd
559,23,1,VinylSd


In [13]:
# Apply the encoder fitted on X_train in the previous cell to the held-out
# test predictors (no re-fitting on test data).
ord_enc.transform(X_test)

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
529,7,2,Stone
491,8,2,Wd Sdng
459,3,5,MetalSd
279,2,8,Plywood
655,15,4,ImStucc
...,...,...,...
271,2,8,Plywood
445,4,2,Wd Sdng
654,19,5,MetalSd
1280,1,1,VinylSd
