# Handling Categorical Features

## 1. One Hot Encoding

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
# Read only the 'Sex' column of the Titanic data and count each category.
sex_only = ["Sex"]
data = pd.read_csv("titanic.csv", usecols=sex_only)
data["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [26]:
# One-hot encode 'Sex'; drop_first=True keeps a single indicator column
# (Sex_male), since the dropped level is implied when the indicator is 0.
# dtype is pinned because pandas >= 2.0 defaults to boolean indicators, which
# would display True/False instead of the 0/1 values shown below.
pd.get_dummies(data, drop_first=True, dtype=np.uint8).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


### One Hot Encoding for a feature with many categories (most frequent labels only)

In [27]:
# Load the categorical columns X0 through X6 of the Mercedes data.
mercedes_columns = [f"X{idx}" for idx in range(7)]
data = pd.read_csv("mercedes.csv", usecols=mercedes_columns)
data

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d
...,...,...,...,...,...,...,...
4204,ak,s,as,c,d,aa,d
4205,j,o,t,d,d,aa,h
4206,ak,v,r,a,d,aa,g
4207,al,r,e,f,d,aa,l


In [28]:
# Print the number of distinct values in every column, one count per line.
for column_name in data.columns:
    distinct_values = data[column_name].unique()
    print(len(distinct_values))

47
27
44
7
4
29
12


In [29]:
# Collect the ten most frequent labels of X1 as a plain Python list.
x1_counts = data["X1"].value_counts().sort_values(ascending=False)
top_ten = x1_counts.head(10).index.to_list()
top_ten

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [30]:
# Add one 0/1 indicator column per label in top_ten; rows whose X1 value is
# not in the list end up with 0 in every indicator column.
for label in top_ten:
    is_label = data["X1"] == label
    data[label] = np.where(is_label, 1, 0)

In [31]:
# Show the indicator columns side by side with the original X1 labels.
# The selection is built as a new list instead of appending to top_ten:
# mutating top_ten would add another 'X1' on every re-run of this cell, and a
# later re-run of the indicator loop would then overwrite data['X1'] itself.
data[top_ten + ['X1']]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


## 2. Ordinal Encoding

In [32]:
import datetime

In [34]:
# Current local date and time (naive, no tzinfo); used as the reference point
# for the example dates generated below.
today = datetime.datetime.now()

In [36]:
# The 15 days preceding `today`, most recent first (1 day ago ... 15 days ago).
l = [today - datetime.timedelta(days=days_back) for days_back in range(1, 16)]
l

[datetime.datetime(2021, 10, 27, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 26, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 25, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 24, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 23, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 22, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 21, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 20, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 19, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 18, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 17, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 16, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 15, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 14, 18, 26, 39, 11988),
 datetime.datetime(2021, 10, 13, 18, 26, 39, 11988)]

In [37]:
# Put the generated datetimes into a single-column frame named 'Day'.
data = pd.DataFrame({'Day': l})
data

Unnamed: 0,Day
0,2021-10-27 18:26:39.011988
1,2021-10-26 18:26:39.011988
2,2021-10-25 18:26:39.011988
3,2021-10-24 18:26:39.011988
4,2021-10-23 18:26:39.011988
5,2021-10-22 18:26:39.011988
6,2021-10-21 18:26:39.011988
7,2021-10-20 18:26:39.011988
8,2021-10-19 18:26:39.011988
9,2021-10-18 18:26:39.011988


In [43]:
# Derive the weekday name (e.g. 'Monday') from each timestamp in 'Day'.
day_names = data["Day"].dt.day_name()
data["weekday"] = day_names
data.head()

Unnamed: 0,Day,weekday
0,2021-10-27 18:26:39.011988,Wednesday
1,2021-10-26 18:26:39.011988,Tuesday
2,2021-10-25 18:26:39.011988,Monday
3,2021-10-24 18:26:39.011988,Sunday
4,2021-10-23 18:26:39.011988,Saturday


In [44]:
# Ordinal codes for the days of the week: Monday=1 ... Sunday=7.
# The mapping is deliberately NOT named `dict`: rebinding that name would
# shadow the built-in type and break any later `dict(...)` call in this kernel.
weekday_codes = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7,
}

# Weekday names missing from the mapping would become NaN in 'Encoding'.
data['Encoding'] = data['weekday'].map(weekday_codes)
data

Unnamed: 0,Day,weekday,Encoding
0,2021-10-27 18:26:39.011988,Wednesday,3
1,2021-10-26 18:26:39.011988,Tuesday,2
2,2021-10-25 18:26:39.011988,Monday,1
3,2021-10-24 18:26:39.011988,Sunday,7
4,2021-10-23 18:26:39.011988,Saturday,6
5,2021-10-22 18:26:39.011988,Friday,5
6,2021-10-21 18:26:39.011988,Thursday,4
7,2021-10-20 18:26:39.011988,Wednesday,3
8,2021-10-19 18:26:39.011988,Tuesday,2
9,2021-10-18 18:26:39.011988,Monday,1


## 3. Count or Frequency Encoding

In [46]:
# Download the UCI Adult data. The file has no header row, so header=None makes
# pandas label the columns with integers.
adult_data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
train_set = pd.read_csv(adult_data_url, header=None, index_col=None)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [47]:
# Keep only the categorical columns (by position in the header-less file) and
# give them readable names.
columns = [1, 3, 5, 6, 7, 8, 9, 13]
# .copy() makes train_set an independent frame. Without it, the later in-place
# assignment to train_set['Country'] writes into a selection of the original
# frame and raises SettingWithCopyWarning on classic pandas.
train_set = train_set[columns].copy()
train_set.columns = [
    'Employment',
    'Degree',
    'Status',
    'Designation',
    'family_job',
    'Race',
    'Sex',
    'Country',
]
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [48]:
# Report how many distinct labels each categorical feature has.
for feature in train_set.columns:
    label_count = len(train_set[feature].unique())
    print(f"{feature} : {label_count} labels")

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [52]:
# Map every country label to the number of rows in which it occurs.
country_counts = train_set["Country"].value_counts()
c_map = country_counts.to_dict()
c_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' Greece': 29,
 ' France': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [53]:
# Count/frequency encoding: replace each country label with its row count
# from c_map. The 'Country' column is overwritten in place.
country_frequency = train_set["Country"].map(c_map)
train_set["Country"] = country_frequency
train_set

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,29170
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,29170
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,29170
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,29170
