In [1]:
import category_encoders as ce
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

In [2]:
# Load the California housing dataset; as_frame=True exposes it as a pandas
# DataFrame on `data.frame`.
data = fetch_california_housing(as_frame=True)
# Work on a copy so that columns assigned to `df` in later cells do not
# mutate the frame held by the loaded `data` object.
df = data.frame.copy()

In [3]:
# Derived categorical column -> (numeric source column, labels for its bins).
# pd.cut with an integer `bins` splits the source range into that many
# equal-width intervals and applies the labels from the lowest interval up.
bin_specs = {
    'region': ('Longitude', ['West', 'Central', 'East']),
    'density': ('AveOccup', ['Low', 'Medium', 'High']),
}
# NOTE(review): equal-width bins can be very unbalanced on skewed columns --
# every `density` value shown in the outputs below is 'Low'. Consider pd.qcut
# if roughly equal-sized groups are intended; confirm before changing.
for new_col, (source_col, bin_labels) in bin_specs.items():
    df[new_col] = pd.cut(df[source_col], bins=len(bin_labels), labels=bin_labels)

In [4]:
# Keep only the categorical features plus the target variable.
cat_columns = ['region', 'density']
selected_columns = [*cat_columns, 'MedHouseVal']
# Drop every row that has a missing value in any of the selected columns.
df = df[selected_columns].dropna()

In [5]:
# Separate the predictors from the target, then hold out 20% of the rows for
# testing; the fixed random_state keeps the split reproducible.
features = df.drop(columns=['MedHouseVal'])
target = df['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

### 1️⃣ One-Hot Encoding (OHE) ###

- OHE створює окрему колонку для кожного унікального значення категоріальної змінної (dummy-кодування).
- Підходить, коли кількість категорій невелика, інакше матриця стає занадто великою.

In [6]:
# One indicator column per category; use_cat_names=True names the columns
# after the category values (e.g. region_West) as seen in the output below.
ohe_encoder = ce.OneHotEncoder(
    cols=cat_columns,
    use_cat_names=True,
)
# Fit on the training split only, then apply the fitted encoder to the test
# split (the tuple is evaluated left to right, so fitting happens first).
X_train_ohe, X_test_ohe = (
    ohe_encoder.fit_transform(X_train),
    ohe_encoder.transform(X_test),
)
X_train_ohe.head()

Unnamed: 0,region_West,region_Central,region_East,density_Low,density_Medium,density_High
14196,0,0,1,1,0,0
8267,0,1,0,1,0,0
17445,0,1,0,1,0,0
14265,0,0,1,1,0,0
2271,0,1,0,1,0,0


### 2️⃣ Target Encoding ###

- Замінює категорію середнім значенням цільової змінної для цієї категорії.
- Використовується для моделей, чутливих до порядку чисел (наприклад, лінійна регресія).
- Може викликати виток даних (data leakage), тому важливо правильно крос-валідувати.

In [7]:
# Target encoding is supervised: the encoder is fitted with y_train, while the
# test split is transformed without being given its own targets.
target_encoder = ce.TargetEncoder(cols=cat_columns)
X_train_te, X_test_te = (
    target_encoder.fit_transform(X_train, y_train),
    target_encoder.transform(X_test),
)
X_train_te.head()

Unnamed: 0,region,density
14196,1.64741,2.071987
8267,2.137346,2.071987
17445,2.137346,2.071987
14265,1.64741,2.071987
2271,2.137346,2.071987


### 3️⃣ Ordinal Encoding ###

- Замінює кожну категорію унікальним числом (1, 2, 3...).
- Добре працює для моделей, які можуть сприймати порядок (наприклад, дерева рішень).
- Якщо порядок категорій не має сенсу, модель може неправильно інтерпретувати відстані між значеннями.

In [8]:
# Map every category of the listed columns to an integer code.
ordinal_encoder = ce.OrdinalEncoder(
    cols=cat_columns,
)

# Learn the mapping from the training split ...
X_train_oe = ordinal_encoder.fit_transform(X_train)
# ... and reuse that same mapping for the test split.
X_test_oe = ordinal_encoder.transform(X_test)

X_train_oe.head(5)

Unnamed: 0,region,density
14196,3,1
8267,2,1
17445,2,1
14265,3,1
2271,2,1


### 4️⃣ Binary Encoding ###

- Кодує кожну категорію у двійковий код, кожен біт розміщується у окремій колонці.
- Зменшує кількість колонок у порівнянні з One-Hot Encoding.
- Добре працює при великій кількості унікальних категорій.

In [9]:
# Binary encoding spreads each categorical column over a few bit columns
# (region_0, region_1, ... in the output below).
binary_encoder = ce.BinaryEncoder(cols=cat_columns)
X_train_be, X_test_be = (
    binary_encoder.fit_transform(X_train),  # fit on the training split only
    binary_encoder.transform(X_test),       # apply the fitted encoding to test
)
X_train_be.head()

Unnamed: 0,region_0,region_1,density_0,density_1
14196,1,1,0,1
8267,1,0,0,1
17445,1,0,0,1
14265,1,1,0,1
2271,1,0,0,1


### 5️⃣ Hashing Encoder ###

- ✅ Коли у датасеті багато унікальних категорій (наприклад, ID користувачів, домени сайтів).
- ✅ Коли немає потреби у зворотному декодуванні (бо хеш-функція необоротна).
- ✅ Коли важливий розмір даних (це компактніше за One-Hot Encoding).
- ✅ Використовується у градієнтному бустингу (XGBoost, CatBoost) та нейромережах.

- 🚫 Коли НЕ варто використовувати:
- ❌ Якщо потрібно інтерпретувати значення (бо неможливо відновити оригінальну категорію).
- ❌ Якщо даних мало, а колізії можуть суттєво спотворити інформацію.

In [12]:
# Hash the categorical columns into a fixed number of output columns
# (n_components=8 gives col_0 ... col_7 in the output below).
# The column list comes from the shared `cat_columns`, as in every other
# encoder cell, instead of a second hard-coded copy that could drift.
hashing_enc = ce.HashingEncoder(cols=cat_columns, n_components=8)
# Fit on the training split only, then apply the same hashing to the test split.
X_train_hsh = hashing_enc.fit_transform(X_train)
X_test_hsh = hashing_enc.transform(X_test)
X_train_hsh.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
14196,0,0,0,0,0,0,1,1
8267,0,0,1,0,0,0,0,1
17445,0,0,1,0,0,0,0,1
14265,0,0,0,0,0,0,1,1
2271,0,0,1,0,0,0,0,1
