## Handling categorical features 

### One Hot Encoding

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('titanic.csv', usecols=['Sex'])

In [3]:
df.head()


Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [4]:
# One-hot encode the categorical column as 0/1 integers; drop_first removes the
# first level so the remaining dummy column is not redundant.
df = pd.get_dummies(df, drop_first=True, dtype=int)

In [5]:
df.head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [6]:
df1 = pd.read_csv('titanic.csv', usecols=['Embarked'])

In [7]:
# One-hot encode the embarkation port as 0/1 integers; with drop_first the first
# level is represented implicitly by all remaining dummies being 0.
df1 = pd.get_dummies(df1, drop_first=True, dtype=int)

In [8]:
df1.head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [9]:
df=pd.read_csv('mercedes.csv',usecols=["X0","X1","X2","X3","X4","X5","X6"])
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [10]:
# Print the number of distinct labels in each column (missing values are not counted).
for cat in df.columns:
    print(df[cat].nunique())

47
27
44
7
4
29
12


In [11]:
df.X2.isnull().sum()

0

In [12]:
df.X2.value_counts().head(10).index

Index(['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e'], dtype='object', name='X2')

In [13]:
# Labels of the ten most frequent X2 categories, most common first, as a plain list.
list_10 = df['X2'].value_counts().head(10).index.tolist()

In [14]:
list_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [15]:
# One indicator column per frequent X2 label: 1 where the row carries that label, 0 otherwise.
for categories in list_10:
    df[categories] = (df['X2'] == categories).astype(int)

In [16]:
# Add the source column so it can be displayed next to its indicator columns.
# The membership guard keeps this cell idempotent: re-running it must not append
# 'X2' a second time, which would duplicate the column in the df[list_10] selection.
if 'X2' not in list_10:
    list_10.append('X2')

In [17]:
df[list_10]

Unnamed: 0,as,ae,ai,m,ak,r,n,s,f,e,X2
0,0,0,0,0,0,0,0,0,0,0,at
1,0,0,0,0,0,0,0,0,0,0,av
2,0,0,0,0,0,0,1,0,0,0,n
3,0,0,0,0,0,0,1,0,0,0,n
4,0,0,0,0,0,0,1,0,0,0,n
...,...,...,...,...,...,...,...,...,...,...,...
4204,1,0,0,0,0,0,0,0,0,0,as
4205,0,0,0,0,0,0,0,0,0,0,t
4206,0,0,0,0,0,1,0,0,0,0,r
4207,0,0,0,0,0,0,0,0,0,1,e


### Ordinal Number Encoding

In [18]:
import datetime

In [19]:
today_date = datetime.datetime.today()

In [20]:
today_date

datetime.datetime(2024, 5, 11, 22, 59, 25, 55362)

In [21]:
today_date-datetime.timedelta(3)

datetime.datetime(2024, 5, 8, 22, 59, 25, 55362)

In [22]:
# Fifteen consecutive timestamps, newest first: today_date itself, then one day earlier each step.
days = [today_date - datetime.timedelta(days=offset) for offset in range(15)]

In [23]:
days

[datetime.datetime(2024, 5, 11, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 10, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 9, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 8, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 7, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 6, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 5, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 4, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 3, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 2, 22, 59, 25, 55362),
 datetime.datetime(2024, 5, 1, 22, 59, 25, 55362),
 datetime.datetime(2024, 4, 30, 22, 59, 25, 55362),
 datetime.datetime(2024, 4, 29, 22, 59, 25, 55362),
 datetime.datetime(2024, 4, 28, 22, 59, 25, 55362),
 datetime.datetime(2024, 4, 27, 22, 59, 25, 55362)]

In [24]:
df = pd.DataFrame(days, columns=['Day'])

In [25]:
df.head()

Unnamed: 0,Day
0,2024-05-11 22:59:25.055362
1,2024-05-10 22:59:25.055362
2,2024-05-09 22:59:25.055362
3,2024-05-08 22:59:25.055362
4,2024-05-07 22:59:25.055362


In [26]:
df['WeekDay'] = df['Day'].dt.day_name()

In [27]:
df.head()

Unnamed: 0,Day,WeekDay
0,2024-05-11 22:59:25.055362,Saturday
1,2024-05-10 22:59:25.055362,Friday
2,2024-05-09 22:59:25.055362,Thursday
3,2024-05-08 22:59:25.055362,Wednesday
4,2024-05-07 22:59:25.055362,Tuesday


In [28]:
df.WeekDay.unique()

array(['Saturday', 'Friday', 'Thursday', 'Wednesday', 'Tuesday', 'Monday',
       'Sunday'], dtype=object)

In [29]:
# Ordinal code for each weekday name: Monday -> 1 through Sunday -> 7.
encoded_dictionary = {
    day_name: position
    for position, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}

In [30]:
df['Weekday_ordinal_encod'] = df['WeekDay'].map(encoded_dictionary)

In [31]:
df.head()

Unnamed: 0,Day,WeekDay,Weekday_ordinal_encod
0,2024-05-11 22:59:25.055362,Saturday,6
1,2024-05-10 22:59:25.055362,Friday,5
2,2024-05-09 22:59:25.055362,Thursday,4
3,2024-05-08 22:59:25.055362,Wednesday,3
4,2024-05-07 22:59:25.055362,Tuesday,2


### Count Or Frequency Encoding

In [32]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [33]:
# Keep only the categorical columns. The explicit copy makes `df` an independent
# frame, so the column renames, per-column overwrites and in-place dropna applied
# to it afterwards act on its own data and cannot raise SettingWithCopyWarning.
df = df[['1','3','5','6','7','8','9','13']].copy()
df.head()

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [34]:
df.columns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "country"]

In [35]:
df.head()

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [36]:
def impute_frquency(df, variable):
    """Overwrite ``df[variable]`` with the occurrence count of each of its values.

    The frame is modified in place and nothing is returned. Values that have no
    entry in the count table are mapped to NaN.
    """
    column = df[variable]
    df[variable] = column.map(column.value_counts().to_dict())

In [37]:
# Frequency-encode every column of the frame; all of its columns are the
# categorical features named in the previous step.
for feature in list(df.columns):
    impute_frquency(df, feature)

In [38]:
# Discard every row that has a missing value in at least one column.
df = df.dropna()

In [39]:
df.head()

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,country
0,1298.0,5355.0,10683.0,3770.0,8305.0,27816.0,21790.0,29170.0
1,2541.0,5355.0,14976.0,4066.0,13193.0,27816.0,21790.0,29170.0
2,22696.0,10501.0,4443.0,1370.0,8305.0,27816.0,21790.0,29170.0
3,22696.0,1175.0,14976.0,1370.0,13193.0,3124.0,21790.0,29170.0
4,22696.0,5355.0,14976.0,4140.0,1568.0,3124.0,10771.0,95.0


In [40]:
df.isnull().sum()

workclass         0
education         0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
country           0
dtype: int64

#### Advantages
- Easy to use
- Does not increase the feature space

#### Disadvantages
- Categories that occur with the same frequency receive the same encoded value, so the model cannot tell them apart

### Target Guided Ordinal Number Encoding

In [41]:
df=pd.read_csv('titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [42]:
df['Cabin'] = df['Cabin'].fillna('Missing').str[0]

In [43]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [44]:
### Check the unique cabin values, then look at the mean survival rate for each

In [45]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [46]:
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [47]:
# Cabin letters ordered from the lowest to the highest mean survival rate.
survival_by_cabin = df.groupby(['Cabin'])['Survived'].mean()
ord_labels = survival_by_cabin.sort_values().index
ord_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [48]:
# Map each label in ord_labels to its position in that sequence, starting at 0.
ordinal_labels = dict(zip(ord_labels, range(len(ord_labels))))
ordinal_labels

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [49]:
df['tgt_ord_encod'] = df['Cabin'].map(ordinal_labels)

In [50]:
df.head()

Unnamed: 0,Survived,Cabin,tgt_ord_encod
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### Mean Encoding

In [51]:
df.head()

Unnamed: 0,Survived,Cabin,tgt_ord_encod
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


In [52]:
mean_ord = df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [53]:
df['mean_encoding'] = df['Cabin'].map(mean_ord)

In [54]:
df.head()

Unnamed: 0,Survived,Cabin,tgt_ord_encod,mean_encoding
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


### Probability Ratio Encoding

In [55]:
df = pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [56]:
# Assign the filled column back rather than calling fillna(inplace=True) on the
# df['Cabin'] selection: that form is chained assignment, which emits a
# FutureWarning in pandas 2.2 and silently leaves df unchanged under Copy-on-Write.
df['Cabin'] = df['Cabin'].fillna('Missing')

In [57]:
df['Cabin'] = df['Cabin'].str[0]

In [58]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [60]:
df_ = df.groupby(['Cabin'])['Survived'].mean()

In [61]:
# Turn the per-cabin survival-rate series into a one-column frame so further
# columns can be added alongside it.
df_prob = df_.to_frame()
df_prob.head()

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75


In [62]:
df_prob['Died'] = 1 - df_prob['Survived']

In [64]:
df_prob.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [65]:
# Ratio of the survival probability to the death probability for each cabin letter.
# A category in which every passenger survived has Died == 0 and would produce an
# infinite ratio; flooring the denominator at a tiny positive value keeps the
# encoding finite while leaving every non-zero denominator exactly as it is.
df_prob['probability_ratio'] = df_prob['Survived'] / df_prob['Died'].clip(lower=1e-6)

In [67]:
df_prob.head()

Unnamed: 0_level_0,Survived,Died,probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [68]:
prob_encode = df_prob['probability_ratio'].to_dict()

In [69]:
prob_encode

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [70]:
df['Cabin_prob_encode'] = df['Cabin'].map(prob_encode)

In [71]:
df.head()

Unnamed: 0,Survived,Cabin,Cabin_prob_encode
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
