### Handling categorical data

1. One Hot Encoding.
2. One-hot encoding with many categories in a feature.
3. Custom Binary Encoding.
4. Label Encoding.

In [61]:
import numpy as np
import pandas as pd
# Seed passed to np.random.seed() before each random draw so the example data is reproducible.
SEED = 32

In [62]:
# Pool of single-letter categories that the toy frame is sampled from.
lst = list('abcdefg')
h, v = 5, 4  # number of rows, number of columns
n = len(lst)

# Seed the global NumPy RNG so the sampled indices (and hence the frame) are repeatable.
np.random.seed(SEED)
idx = np.random.randint(n, size=(h, v))

# Column names col0..col{v-1}; each cell of `idx` is mapped to its category letter.
feats = ['col' + str(i) for i in range(v)]
df = pd.DataFrame(np.take(lst, idx), columns=feats)
df

Unnamed: 0,col0,col1,col2,col3
0,d,f,g,e
1,g,a,d,e
2,g,b,d,f
3,c,b,e,d
4,b,d,b,c


In [63]:
# Pin the indicator dtype explicitly: pandas >= 2.0 defaults to bool (True/False),
# while earlier versions produced uint8 0/1 columns. Passing dtype keeps the 0/1
# output stable across pandas versions.
pd.get_dummies(df, dtype=np.uint8)

Unnamed: 0,col0_b,col0_c,col0_d,col0_g,col1_a,col1_b,col1_d,col1_f,col2_b,col2_d,col2_e,col2_g,col3_c,col3_d,col3_e,col3_f
0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0
2,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1
3,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0
4,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0


In [64]:
from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed
# in 1.4, so `OneHotEncoder(sparse=False)` fails on current releases. Using the default
# (sparse) encoder and densifying the result works on both old and new versions and
# yields the same dense float array.
ohe = OneHotEncoder()
dt_ohe = ohe.fit_transform(df).toarray()
dt_ohe

array([[0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.]])

#### One-hot encoding with many categories in a feature

In [72]:
# 55 single-character labels: the characters with code points 65..119.
labels = pd.Series([chr(code) for code in range(65, 120)])

# Draw 30 random positions into `labels` (with replacement), reproducibly.
np.random.seed(SEED)
idx = np.random.randint(len(labels), size=30)

# High-cardinality example feature; the index keeps the position each label was drawn from.
feat = labels.iloc[idx]
feat

23    X
43    l
5     F
54    w
24    Y
19    T
7     H
25    Z
3     D
37    f
42    k
9     J
4     E
11    L
17    R
3     D
1     B
34    c
35    d
24    Y
42    k
18    S
10    K
20    U
50    s
36    e
36    e
5     F
38    g
13    N
dtype: object

In [73]:
# Labels of the 10 most frequent categories in `feat`.
# NOTE(review): value_counts() already sorts by descending count by default, so the
# explicit sort_values looks redundant -- kept so the ordering of ties is unchanged.
top_10 = (
    feat.value_counts()
    .sort_values(ascending=False)
    .nlargest(10)
    .index
)
top_10

Index(['F', 'e', 'D', 'k', 'Y', 'R', 'w', 'T', 'H', 'Z'], dtype='object')

In [74]:
# One 0/1 indicator column per top-10 label; rows whose label is outside the
# top 10 end up as all zeros. Columns are collected first and the frame is built once.
indicator_cols = {}
for label in top_10:
    indicator_cols[label] = np.where(feat == label, 1, 0)

dft = pd.DataFrame(indicator_cols)
dft

Unnamed: 0,F,e,D,k,Y,R,w,T,H,Z
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,1,0
7,0,0,0,0,0,0,0,0,0,1
8,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0


#### Custom Binary Encoding
For the sake of discussion, maybe all we care about is whether or not the particular feature is 'g'.

In [68]:
# Binary-encode one column: 1 where the category is exactly 'g', 0 otherwise.
feat = 'col0'
dft = df.copy()
# Use exact equality rather than str.contains('g'): contains() is a regex substring
# match (it would also flag any category that merely contains "g"), and it returns
# NaN for missing values, which np.where treats as truthy and would encode as 1.
dft[feat] = np.where(df[feat].eq('g'), 1, 0)
# Show the original frame first, then the encoded copy, for side-by-side comparison.
display(df)
dft

Unnamed: 0,col0,col1,col2,col3
0,d,f,g,e
1,g,a,d,e
2,g,b,d,f
3,c,b,e,d
4,b,d,b,c


Unnamed: 0,col0,col1,col2,col3
0,0,f,g,e
1,1,a,d,e
2,1,b,d,f
3,0,b,e,d
4,0,d,b,c


#### Label Encoding

In [69]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [70]:
# Integer-encode every feature column independently with its own LabelEncoder.
df1 = df.copy()

for feat in feats:
    # Fit on the untouched source column, then write the integer codes into the copy.
    le = LabelEncoder().fit(df[feat])
    df1[feat] = le.transform(df[feat])
df1

Unnamed: 0,col0,col1,col2,col3
0,2,3,3,2
1,3,0,1,2
2,3,1,1,3
3,1,1,2,1
4,0,2,0,0


In [71]:
# Encode all columns at once with OrdinalEncoder and check that the result matches
# the per-column LabelEncoder output in df1.
oe = OrdinalEncoder()
# fit_transform returns a bare array, so re-attach both the column names and df's
# index; otherwise the element-wise comparison below would fail to align whenever
# df does not have the default 0..n-1 index.
df2 = pd.DataFrame(oe.fit_transform(df), columns=feats, index=df.index)
# True when every encoded value agrees (df1 holds ints, df2 floats; == compares by value).
np.all(df1 == df2)

True