## **Handle Categorical Data**

In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
!pip install category_encoders



In [None]:
import category_encoders as ce

In [None]:
# Toy dataset: five observations of three categorical features,
# written row by row as (gender, class, city).
data = pd.DataFrame.from_records(
    [
        ('Male', 'A', 'Delhi'),
        ('Female', 'B', 'Greece'),
        ('Male', 'C', 'Delhi'),
        ('Female', 'D', 'Delhi'),
        ('Female', 'A', 'Greece'),
    ],
    columns=['gender', 'class', 'city'],
)

In [None]:
data

Unnamed: 0,gender,class,city
0,Male,A,Delhi
1,Female,B,Greece
2,Male,C,Delhi
3,Female,D,Delhi
4,Female,A,Greece


In [None]:
#one Hot Encoder
one_hot_encoder = ce.OneHotEncoder(cols = ['gender', 'city'])

#use fit_transform to fit one hot encoder on the data and transform it
new_data = one_hot_encoder.fit_transform(data)
new_data

Unnamed: 0,gender_1,gender_2,class,city_1,city_2
0,1,0,A,1,0
1,0,1,B,0,1
2,1,0,C,1,0
3,0,1,D,1,0
4,0,1,A,0,1


In [None]:
binary_encoder = ce.BinaryEncoder(cols = ['class'])

#transform data
data_binary = binary_encoder.fit_transform(new_data)

data_binary

Unnamed: 0,gender_1,gender_2,class_0,class_1,class_2,city_1,city_2
0,1,0,0,0,1,1,0
1,0,1,0,1,0,0,1
2,1,0,0,1,1,1,0
3,0,1,1,0,0,1,0
4,0,1,0,0,1,0,1


In [None]:
# Second toy dataset: six students, written row by row as
# (gender, school, courses, city).
data = pd.DataFrame.from_records(
    [
        ('Male', 'school1', 'AI', 'Delhi'),
        ('Female', 'school1', 'AI', 'Greece'),
        ('Male', 'school2', 'Web', 'Delhi'),
        ('Female', 'school2', 'Flutter', 'Delhi'),
        ('Female', 'school3', 'Web', 'Greece'),
        ('Male', 'school3', 'Flutter', 'Delhi'),
    ],
    columns=['gender', 'school', 'courses', 'city'],
)

In [None]:
data

Unnamed: 0,gender,school,courses,city
0,Male,school1,AI,Delhi
1,Female,school1,AI,Greece
2,Male,school2,Web,Delhi
3,Female,school2,Flutter,Delhi
4,Female,school3,Web,Greece
5,Male,school3,Flutter,Delhi


In [None]:
# One-hot encoding for the two-valued columns; 'school' and 'courses'
# are left as they are for the binary encoder in the next step.
nominal_cols = ['gender', 'city']
one_hot_encoder = ce.OneHotEncoder(cols=nominal_cols)

# Fit and transform in one call.
encoded_data = one_hot_encoder.fit_transform(data)
encoded_data

Unnamed: 0,gender_1,gender_2,school,courses,city_1,city_2
0,1,0,school1,AI,1,0
1,0,1,school1,AI,0,1
2,1,0,school2,Web,1,0
3,0,1,school2,Flutter,1,0
4,0,1,school3,Web,0,1
5,1,0,school3,Flutter,1,0


In [None]:
#Binary Encoder
binary_encoder = ce.BinaryEncoder(cols = ['school', 'courses'])
#transform data
encoded_data = binary_encoder.fit_transform(encoded_data)

encoded_data

Unnamed: 0,gender_1,gender_2,school_0,school_1,courses_0,courses_1,city_1,city_2
0,1,0,0,1,0,1,1,0
1,0,1,0,1,0,1,0,1
2,1,0,1,0,1,0,1,0
3,0,1,1,0,1,1,1,0
4,0,1,1,1,1,0,0,1
5,1,0,1,1,1,1,1,0


In [None]:
#Label Encoding
from sklearn.preprocessing import LabelEncoder

In [None]:
# Instantiate the LabelEncoder imported from sklearn.preprocessing.
le = LabelEncoder()

In [None]:
# Fit the encoder on the 'courses' column and get one integer code per row
# (the output below shows AI -> 0, Flutter -> 1, Web -> 2).
# NOTE(review): scikit-learn documents LabelEncoder as meant for target labels (y);
# for input features its OrdinalEncoder is the documented choice — confirm this usage is intended.
encoded_courses = le.fit_transform(data['courses'])

In [None]:
encoded_courses

array([0, 0, 2, 1, 2, 1])

In [None]:
data

Unnamed: 0,gender,school,courses,city
0,Male,school1,AI,Delhi
1,Female,school1,AI,Greece
2,Male,school2,Web,Delhi
3,Female,school2,Flutter,Delhi
4,Female,school3,Web,Greece
5,Male,school3,Flutter,Delhi


## **Ordinal Encoder**

In [None]:
# Nine observations cycling through the three height categories.
data = pd.DataFrame({'height': ['meduim', 'short', 'tall'] * 3})
data

Unnamed: 0,height
0,meduim
1,short
2,tall
3,meduim
4,short
5,tall
6,meduim
7,short
8,tall


In [None]:
# Ordinal encoder with an explicit category -> integer mapping for 'height'
# (tall = 1, meduim = 2, short = 3), returning a DataFrame.
# The entry for missing values uses the None object, as in the
# category_encoders documentation example; the string 'None' would only
# match a cell containing the literal text "None".
ordial_encoder = ce.OrdinalEncoder(
    cols=['height'],
    return_df=True,
    mapping=[{
        'col': 'height',
        'mapping': {
            None: 0,
            'tall': 1,
            'meduim': 2,
            'short': 3,
        },
    }],
)

In [None]:
# Encode only the 'height' column and assign the resulting Series.
# Passing the whole frame and assigning the returned DataFrame to a single
# column only works while `data` has exactly one column; selecting 'height'
# explicitly keeps this cell re-runnable after 'encoding' has been added.
data['encoding'] = ordial_encoder.fit_transform(data[['height']])['height']
data

Unnamed: 0,height,encoding
0,meduim,2
1,short,3
2,tall,1
3,meduim,2
4,short,3
5,tall,1
6,meduim,2
7,short,3
8,tall,1


In [None]:
# Remove the raw 'height' column in place, keeping only the encoded values.
data.drop(columns=['height'], inplace=True)
data

Unnamed: 0,encoding
0,2
1,3
2,1
3,2
4,3
5,1
6,2
7,3
8,1


## **Target Encoding**

In [None]:
# Each student name appears twice, each time with a different mark.
df = pd.DataFrame(
    list(zip(['alex', 'john', 'mary'] * 2, [100, 240, 307, 650, 170, 480])),
    columns=['name', 'marks'],
)
df

Unnamed: 0,name,marks
0,alex,100
1,john,240
2,mary,307
3,alex,650
4,john,170
5,mary,480


In [None]:
#apply target encoder
target_encoder = ce.TargetEncoder(cols = 'name')

#fit and transform target encoder
df=target_encoder.fit_transform(df['name'], df['marks'])

In [None]:
df

Unnamed: 0,name
0,331.663479
1,307.548798
2,334.287723
3,331.663479
4,307.548798
5,334.287723


## **Hash Encoding**

In [None]:
# Eight observations of a single categorical 'color' feature (four distinct values).
data = pd.Series(
    ['blue', 'blue', 'green', 'black', 'blue', 'yellow', 'black', 'green'],
    name='color',
).to_frame()
data

Unnamed: 0,color
0,blue
1,blue
2,green
3,black
4,blue
5,yellow
6,black
7,green


In [None]:
# Hash encoding: hash each 'color' value into one of n_components = 5 output columns.
# The original note names md5 as the hash; no hash_method is passed here, so
# that relies on the encoder's default — TODO confirm for the installed version.
hash_encoder = ce.HashingEncoder(cols = ['color'], n_components = 5)

In [None]:
# Fit the hashing encoder and transform the data in one call; the result is
# only displayed, not stored.
# In the output below, 'green' and 'yellow' both land in col_4: distinct
# categories can collide in the same hashed column.
hash_encoder.fit_transform(data)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,0,0,1,0
4,1,0,0,0,0
5,0,0,0,0,1
6,0,0,0,1,0
7,0,0,0,0,1
