In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
# Read just the three categorical predictors and the target from the CSV.
columns_to_load = ["Neighborhood", "Exterior1st", "Exterior2nd", "SalePrice"]
data = pd.read_csv("houseprice.csv", usecols=columns_to_load)

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [2]:
# Number of rows per Neighborhood value, as a plain {label: count} dict.
# Note: this is computed on the whole frame, before any train/test split.
neighborhood_value_counts = data["Neighborhood"].value_counts()
count_map = neighborhood_value_counts.to_dict()
count_map

{'NAmes': 225,
 'CollgCr': 150,
 'OldTown': 113,
 'Edwards': 100,
 'Somerst': 86,
 'Gilbert': 79,
 'NridgHt': 77,
 'Sawyer': 74,
 'NWAmes': 73,
 'SawyerW': 59,
 'BrkSide': 58,
 'Crawfor': 51,
 'Mitchel': 49,
 'NoRidge': 41,
 'Timber': 38,
 'IDOTRR': 37,
 'ClearCr': 28,
 'StoneBr': 25,
 'SWISU': 25,
 'MeadowV': 17,
 'Blmngtn': 17,
 'BrDale': 16,
 'Veenker': 11,
 'NPkVill': 9,
 'Blueste': 2}

In [4]:
# Swap every Neighborhood label for its entry in count_map (count encoding).
neighborhood_as_count = data["Neighborhood"].map(count_map)
neighborhood_as_count

0       150
1        11
2       150
3        51
4        41
       ... 
1455     79
1456     73
1457     51
1458    225
1459    100
Name: Neighborhood, Length: 1460, dtype: int64

In [5]:
# Share of rows per Neighborhood value: each count divided by the total
# number of rows in the frame, stored as a {label: fraction} dict.
total_rows = len(data)
frequency = data["Neighborhood"].value_counts().div(total_rows).to_dict()
frequency

{'NAmes': 0.1541095890410959,
 'CollgCr': 0.10273972602739725,
 'OldTown': 0.0773972602739726,
 'Edwards': 0.0684931506849315,
 'Somerst': 0.0589041095890411,
 'Gilbert': 0.05410958904109589,
 'NridgHt': 0.05273972602739726,
 'Sawyer': 0.050684931506849315,
 'NWAmes': 0.05,
 'SawyerW': 0.04041095890410959,
 'BrkSide': 0.03972602739726028,
 'Crawfor': 0.03493150684931507,
 'Mitchel': 0.03356164383561644,
 'NoRidge': 0.028082191780821917,
 'Timber': 0.026027397260273973,
 'IDOTRR': 0.025342465753424658,
 'ClearCr': 0.019178082191780823,
 'StoneBr': 0.017123287671232876,
 'SWISU': 0.017123287671232876,
 'MeadowV': 0.011643835616438357,
 'Blmngtn': 0.011643835616438357,
 'BrDale': 0.010958904109589041,
 'Veenker': 0.007534246575342466,
 'NPkVill': 0.0061643835616438354,
 'Blueste': 0.0013698630136986301}

In [6]:
# Swap every Neighborhood label for its entry in frequency (frequency encoding).
neighborhood_as_frequency = data["Neighborhood"].map(frequency)
neighborhood_as_frequency

0       0.102740
1       0.007534
2       0.102740
3       0.034932
4       0.028082
          ...   
1455    0.054110
1456    0.050000
1457    0.034932
1458    0.154110
1459    0.068493
Name: Neighborhood, Length: 1460, dtype: float64

In [7]:
# Separate the predictors from the SalePrice target.
predictors = data.drop(columns="SalePrice")
target = data["SalePrice"]

# Hold out 30% of the observations for testing; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=0,
)

# Show the shapes of the two predictor sets.
X_train.shape, X_test.shape

((1022, 3), (438, 3))

In [8]:
from feature_engine.encoding import CountFrequencyEncoder

# Count-encode only the Neighborhood column; the other columns are not
# listed in `variables`.
ct_enc = CountFrequencyEncoder(
    variables=["Neighborhood"],
    encoding_method="count",
)

# Fit on the training predictors and display the encoded frame.
ct_enc.fit_transform(X_train)

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,105,VinylSd,VinylSd
682,24,Wd Sdng,Wd Sdng
960,41,Wd Sdng,Plywood
1384,71,WdShing,Wd Shng
1100,18,Wd Sdng,Wd Sdng
...,...,...,...
763,30,VinylSd,VinylSd
835,61,VinylSd,HdBoard
1216,61,VinylSd,VinylSd
559,12,VinylSd,VinylSd


In [10]:
from category_encoders.count import CountEncoder

# Same encoding with category_encoders: restricted to Neighborhood,
# with normalize switched off.
cat_count = CountEncoder(normalize=False, cols=["Neighborhood"])

# Fit the encoder on the training predictors.
cat_count.fit(X_train)

In [12]:
# Encode the training predictors with the fitted cat_count encoder.
X_train_count_encoded = cat_count.transform(X_train)
X_train_count_encoded

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,105,VinylSd,VinylSd
682,24,Wd Sdng,Wd Sdng
960,41,Wd Sdng,Plywood
1384,71,WdShing,Wd Shng
1100,18,Wd Sdng,Wd Sdng
...,...,...,...
763,30,VinylSd,VinylSd
835,61,VinylSd,HdBoard
1216,61,VinylSd,VinylSd
559,12,VinylSd,VinylSd
