In [3]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [4]:
import pandas as pd
import numpy as np
import category_encoders as ce

In [11]:
# Demo frame: three cities, each with ten consecutive pincodes.
# Maps the label stored in the 'state' column to the first pincode of its run.
city_blocks = {'Delhi': 110001, 'Mumbai': 400001, 'Kolkata': 700001}
PINCODES_PER_CITY = 10

data = {
    'pincode': [
        first + offset
        for first in city_blocks.values()
        for offset in range(PINCODES_PER_CITY)
    ],
    'state': [
        city
        for city in city_blocks
        for _ in range(PINCODES_PER_CITY)
    ],
}

df = pd.DataFrame(data)
print(df)

    pincode    state
0    110001    Delhi
1    110002    Delhi
2    110003    Delhi
3    110004    Delhi
4    110005    Delhi
5    110006    Delhi
6    110007    Delhi
7    110008    Delhi
8    110009    Delhi
9    110010    Delhi
10   400001   Mumbai
11   400002   Mumbai
12   400003   Mumbai
13   400004   Mumbai
14   400005   Mumbai
15   400006   Mumbai
16   400007   Mumbai
17   400008   Mumbai
18   400009   Mumbai
19   400010   Mumbai
20   700001  Kolkata
21   700002  Kolkata
22   700003  Kolkata
23   700004  Kolkata
24   700005  Kolkata
25   700006  Kolkata
26   700007  Kolkata
27   700008  Kolkata
28   700009  Kolkata
29   700010  Kolkata


In [12]:
# Frequency Encoding

# Every category label is swapped for the number of rows in which it occurs,
# so the new column records how common each category is in the dataset.
state_counts = df['state'].value_counts()
frequency_map = state_counts.to_dict()  # {label: row count}
df['state_freq_encoded'] = df['state'].map(frequency_map)
print(df)

    pincode    state  state_freq_encoded
0    110001    Delhi                  10
1    110002    Delhi                  10
2    110003    Delhi                  10
3    110004    Delhi                  10
4    110005    Delhi                  10
5    110006    Delhi                  10
6    110007    Delhi                  10
7    110008    Delhi                  10
8    110009    Delhi                  10
9    110010    Delhi                  10
10   400001   Mumbai                  10
11   400002   Mumbai                  10
12   400003   Mumbai                  10
13   400004   Mumbai                  10
14   400005   Mumbai                  10
15   400006   Mumbai                  10
16   400007   Mumbai                  10
17   400008   Mumbai                  10
18   400009   Mumbai                  10
19   400010   Mumbai                  10
20   700001  Kolkata                  10
21   700002  Kolkata                  10
22   700003  Kolkata                  10
23   700004  Kolkata                  10

In [13]:
# Binary Encoding
# BinaryEncoder replaces the 'state' column with 0/1 columns named state_<i>.
# `cols` is given as a list and the input as a one-column DataFrame, which is
# the documented calling convention for category_encoders.
encoder = ce.BinaryEncoder(cols=['state'])
df_encoded = encoder.fit_transform(df[['state']])

# Remove encoded columns left behind by an earlier run of this cell so that
# re-executing it does not append a second set of state_<i> columns.
df = df.drop(columns=df_encoded.columns, errors='ignore')
df = pd.concat([df, df_encoded], axis=1)
print(df)

    pincode    state  state_freq_encoded  state_0  state_1
0    110001    Delhi                  10        0        1
1    110002    Delhi                  10        0        1
2    110003    Delhi                  10        0        1
3    110004    Delhi                  10        0        1
4    110005    Delhi                  10        0        1
5    110006    Delhi                  10        0        1
6    110007    Delhi                  10        0        1
7    110008    Delhi                  10        0        1
8    110009    Delhi                  10        0        1
9    110010    Delhi                  10        0        1
10   400001   Mumbai                  10        1        0
11   400002   Mumbai                  10        1        0
12   400003   Mumbai                  10        1        0
13   400004   Mumbai                  10        1        0
14   400005   Mumbai                  10        1        0
15   400006   Mumbai                  10        1        0

In [9]:
# Target Encoding
# Each category is replaced by the mean of the target over the rows that
# carry that category.

# (category, target) observations, one tuple per row.
samples = [
    ('A', 1), ('B', 0), ('C', 1), ('A', 0), ('B', 1),
    ('B', 0), ('C', 1), ('A', 0), ('A', 1), ('B', 0),
]
categories, targets = zip(*samples)
data = {'category': list(categories), 'target': list(targets)}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Per-category average of the target column, indexed by category label.
mean_target = df['target'].groupby(df['category']).mean()
print("\nMean target value for each category:")
print(mean_target)

# Attach the per-category mean to every row by looking it up on 'category'.
df = df.join(mean_target.rename('category_target_encoded'), on='category')
print("\nDataFrame after Target Encoding:")
print(df)


Original DataFrame:
  category  target
0        A       1
1        B       0
2        C       1
3        A       0
4        B       1
5        B       0
6        C       1
7        A       0
8        A       1
9        B       0

Mean target value for each category:
category
A    0.50
B    0.25
C    1.00
Name: target, dtype: float64

DataFrame after Target Encoding:
  category  target  category_target_encoded
0        A       1                     0.50
1        B       0                     0.25
2        C       1                     1.00
3        A       0                     0.50
4        B       1                     0.25
5        B       0                     0.25
6        C       1                     1.00
7        A       0                     0.50
8        A       1                     0.50
9        B       0                     0.25
