### handle categorical features

##### one hot encoding

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read only the 'sex' column; it is the single feature encoded in this example.
df = pd.read_csv("titanic.csv", usecols=["sex"])
df.head()

Unnamed: 0,sex
0,male
1,female
2,female
3,female
4,male


In [4]:
# drop_first=True keeps k-1 indicator columns for k categories.
# The dtype is pinned because pandas >= 2.0 returns booleans by default, which
# would display True/False instead of the 0/1 indicators shown here (uint8 was
# the default in earlier pandas releases).
pd.get_dummies(df, drop_first=True, dtype="uint8").head()

Unnamed: 0,sex_male
0,1
1,0
2,0
3,0
4,1


In [5]:
# Reload the file keeping only the 'embarked' column, then list its distinct values.
df = pd.read_csv("titanic.csv", usecols=["embarked"])
df["embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [6]:
# Discard rows containing missing values so NaN does not show up as a category.
df = df.dropna()

In [7]:
df['embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [8]:
# drop_first=True keeps k-1 indicator columns for k categories.
# The dtype is pinned because pandas >= 2.0 returns booleans by default, which
# would display True/False instead of the 0/1 indicators shown here (uint8 was
# the default in earlier pandas releases).
pd.get_dummies(df, drop_first=True, dtype="uint8").head()

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


##### one hot encoding with many categories in a feature

In [9]:
# Read the X0..X6 columns of the Mercedes-Benz training file.
df = pd.read_csv(
    "mercedes_benz/train.csv",
    usecols=[f"X{idx}" for idx in range(7)],
)
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [10]:
# Number of distinct labels in X0 (a missing value, if any, counts as one label).
df["X0"].nunique(dropna=False)

47

In [11]:
# Print the number of distinct labels of every column, one count per line.
for column in df.columns:
    print(df[column].nunique(dropna=False))

47
27
44
7
4
29
12


In [12]:
# The ten most frequent X0 labels together with their row counts.
(
    df["X0"]
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
Name: X0, dtype: int64

In [13]:
# Labels of the ten most frequent X0 categories, as a plain Python list.
lst_10 = (
    df["X0"]
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)

In [14]:
lst_10

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']

In [15]:
# One 0/1 indicator column per frequent label; rows holding any other X0 label
# are 0 in all of these columns.
for label in lst_10:
    df[label] = (df["X0"] == label).astype(int)

In [16]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,z,ak,y,ay,t,x,o,f,n,w
0,k,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,0,0,0,0,0,0


In [17]:
lst_10.append('X0')

In [18]:
df[lst_10]

Unnamed: 0,z,ak,y,ay,t,x,o,f,n,w,X0
0,0,0,0,0,0,0,0,0,0,0,k
1,0,0,0,0,0,0,0,0,0,0,k
2,0,0,0,0,0,0,0,0,0,0,az
3,0,0,0,0,0,0,0,0,0,0,az
4,0,0,0,0,0,0,0,0,0,0,az
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,ak
4205,0,0,0,0,0,0,0,0,0,0,j
4206,0,1,0,0,0,0,0,0,0,0,ak
4207,0,0,0,0,0,0,0,0,0,0,al


#### ordinal Encoding

In [19]:
import datetime

In [20]:
# Naive local timestamp of "now"; the sample dates below are derived from it.
today_time = datetime.datetime.now()

In [21]:
today_time

datetime.datetime(2023, 2, 27, 1, 20, 10, 903948)

In [22]:
# The reference timestamp followed by the 14 preceding days (newest first).
days = [today_time - datetime.timedelta(days=offset) for offset in range(15)]

In [23]:
# Single-column frame holding the generated timestamps.
df = pd.DataFrame({"Day": days})

In [24]:
df.head()

Unnamed: 0,Day
0,2023-02-27 01:20:10.903948
1,2023-02-26 01:20:10.903948
2,2023-02-25 01:20:10.903948
3,2023-02-24 01:20:10.903948
4,2023-02-23 01:20:10.903948


In [25]:
# day_name() returns English weekday names by default, whereas strftime('%A')
# follows the active locale and could yield names that miss the English keys
# of the weekday-to-ordinal mapping applied next.
df['Weekday'] = df['Day'].dt.day_name()
df.head()

Unnamed: 0,Day,Weekday
0,2023-02-27 01:20:10.903948,Monday
1,2023-02-26 01:20:10.903948,Sunday
2,2023-02-25 01:20:10.903948,Saturday
3,2023-02-24 01:20:10.903948,Friday
4,2023-02-23 01:20:10.903948,Thursday


In [26]:

dict = {'Sunday':1,'Saturday':2,'Friday':3,'Thursday':4,'Wednesday':5,'Tuesday':6,'Monday':7}

In [27]:
df['day_ordinal_map'] = df['Weekday'].map(dict)

In [28]:
df.head()

Unnamed: 0,Day,Weekday,day_ordinal_map
0,2023-02-27 01:20:10.903948,Monday,7
1,2023-02-26 01:20:10.903948,Sunday,1
2,2023-02-25 01:20:10.903948,Saturday,2
3,2023-02-24 01:20:10.903948,Friday,3
4,2023-02-23 01:20:10.903948,Thursday,4


#### count or frequency encoding

In [29]:
# Fetch the UCI Adult data; it is read with header=None, so the columns get
# integer labels.
train_set = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    index_col=None,
)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [30]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       32561 non-null  int64 
 1   1       32561 non-null  object
 2   2       32561 non-null  int64 
 3   3       32561 non-null  object
 4   4       32561 non-null  int64 
 5   5       32561 non-null  object
 6   6       32561 non-null  object
 7   7       32561 non-null  object
 8   8       32561 non-null  object
 9   9       32561 non-null  object
 10  10      32561 non-null  int64 
 11  11      32561 non-null  int64 
 12  12      32561 non-null  int64 
 13  13      32561 non-null  object
 14  14      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [31]:
# Labels of the columns stored with object dtype (the categorical candidates).
columns = [label for label, dtype in train_set.dtypes.items() if dtype == 'O']
columns

[1, 3, 5, 6, 7, 8, 9, 13, 14]

In [32]:
# Keep only the object-dtype columns selected above.
train_set = train_set.loc[:, columns]

In [33]:
train_set.head()

Unnamed: 0,1,3,5,6,7,8,9,13,14
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [34]:
# Remove column 14 (it holds values such as '<=50K') from the features to encode.
train_set = train_set.drop(columns=[14])

In [35]:
train_set.head(20)

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States


In [36]:
# Give the eight remaining columns readable names (positional assignment).
train_set.columns = [
    'employement',
    'degree',
    'status',
    'designation',
    'family_job',
    'race',
    'sex',
    'country',
]

In [37]:
train_set.head()

Unnamed: 0,employement,degree,status,designation,family_job,race,sex,country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [38]:
# Report how many distinct labels each feature has.
for name, n_labels in train_set.nunique(dropna=False).items():
    print(f"{name} : {n_labels} labels")

employement : 9 labels
degree : 16 labels
status : 7 labels
designation : 15 labels
family_job : 6 labels
race : 5 labels
sex : 2 labels
country : 42 labels


In [39]:
# Mapping: country label -> number of rows carrying that label.
country_map = {
    country: count
    for country, count in train_set['country'].value_counts().items()
}

In [40]:
# Replace every country label with its frequency.
# NOTE: this overwrites the original labels, so running the cell a second time
# maps the counts through country_map again and yields NaN.
train_set = train_set.assign(country=train_set['country'].map(country_map))

In [41]:
train_set.head(20)

Unnamed: 0,employement,degree,status,designation,family_job,race,sex,country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


#### Advantages
1. Easy to use
2. Does not increase the feature space

#### Disadvantages
1. Categories that occur with the same frequency receive the same encoded value, so they can no longer be told apart


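#### Target guided ordinal Encoding
1. Order the labels according to the mean of the target
2. Replace each label with its rank in that ordering (mean encoding, shown further below, instead replaces it with the probability of the target being 1)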

In [45]:
# Read the target ('survived') together with the categorical 'deck' feature.
df = pd.read_csv("titanic.csv", usecols=["survived", "deck"])
df.head()

Unnamed: 0,survived,deck
0,0,
1,1,C
2,1,
3,1,C
4,0,


In [46]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['deck']. The in-place form operates on an intermediate Series, raises a
# chained-assignment FutureWarning in recent pandas, and leaves df unchanged
# once Copy-on-Write is enabled.
df['deck'] = df['deck'].fillna('Missing')

In [48]:
# Keep only the first character of each label, so 'Missing' becomes 'M'.
df['deck'] = df['deck'].astype(str).str.get(0)

In [49]:
df.head()

Unnamed: 0,survived,deck
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [50]:
df.deck.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F'], dtype=object)

In [51]:
# Mean of the target per deck label, i.e. the survival rate of each deck.
df.groupby("deck")["survived"].mean()

deck
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299419
Name: survived, dtype: float64

In [53]:
# Deck labels ordered by ascending mean of the target.
ordinal_labels = (
    df.groupby("deck")["survived"]
    .mean()
    .sort_values()
    .index
)
ordinal_labels


Index(['M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='deck')

In [56]:
# Mapping: deck label -> its 0-based position in the ordering above.
ordinal_labels2 = {deck: rank for rank, deck in enumerate(ordinal_labels)}
ordinal_labels2

{'M': 0, 'A': 1, 'G': 2, 'C': 3, 'F': 4, 'B': 5, 'E': 6, 'D': 7}

In [57]:
# Add the rank-encoded deck as a new column.
df = df.assign(deck_ordinal_labels=df["deck"].map(ordinal_labels2))
df.head()

Unnamed: 0,survived,deck,deck_ordinal_labels
0,0,M,0
1,1,C,3
2,1,M,0
3,1,C,3
4,0,M,0


#### Mean Encoding

In [59]:
# Mapping: deck label -> mean of the target for that label.
ordinal_mean = (
    df.groupby("deck")["survived"]
    .mean()
    .to_dict()
)

In [60]:
# Add the mean-encoded deck as a new column.
df = df.assign(deck_mean_labels=df["deck"].map(ordinal_mean))
df.head()

Unnamed: 0,survived,deck,deck_ordinal_labels,deck_mean_labels
0,0,M,0,0.299419
1,1,C,3,0.59322
2,1,M,0,0.299419
3,1,C,3,0.59322
4,0,M,0,0.299419


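#### Probability ratio Encoding
1. Compute the probability of survival for each category of the categorical feature (cabin deck): pr(Survived)
2. Compute the probability of not surviving: 1 - pr(Survived)
3. Compute the ratio pr(Survived) / pr(Not Survived)
4. Build a dictionary that maps each category to its probability ratio
5. Replace the categories of the feature with the mapped ratios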

In [61]:
# Start again from the raw file: the target ('survived') and the 'deck' feature.
df = pd.read_csv("titanic.csv", usecols=["survived", "deck"])
df.head()

Unnamed: 0,survived,deck
0,0,
1,1,C
2,1,
3,1,C
4,0,


In [62]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['deck']. The in-place form operates on an intermediate Series, raises a
# chained-assignment FutureWarning in recent pandas, and leaves df unchanged
# once Copy-on-Write is enabled.
df['deck'] = df['deck'].fillna('Missing')

In [63]:
df.head()

Unnamed: 0,survived,deck
0,0,Missing
1,1,C
2,1,Missing
3,1,C
4,0,Missing


In [64]:
# Keep only the first character of each label, so 'Missing' becomes 'M'.
df['deck'] = df['deck'].astype(str).str.get(0)

In [65]:
df.deck.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F'], dtype=object)

In [67]:
# Survival probability per deck label: the mean of the 0/1 target.
prob_df = df.groupby("deck")["survived"].mean()

In [68]:
# Wrap the per-deck probabilities in a DataFrame so further columns can be added.
prob_df = pd.DataFrame(data=prob_df)

In [69]:
prob_df

Unnamed: 0_level_0,survived
deck,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299419


In [70]:
# Complementary probability: 1 - pr(survived).
prob_df = prob_df.assign(dead=1 - prob_df["survived"])

In [71]:
prob_df.head()

Unnamed: 0_level_0,survived,dead
deck,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [72]:
# Ratio pr(survived) / pr(dead) for each deck label.
prob_df = prob_df.assign(
    Probability_ratio=prob_df["survived"] / prob_df["dead"]
)

In [73]:
prob_df.head()

Unnamed: 0_level_0,survived,dead,Probability_ratio
deck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [74]:
# Mapping: deck label -> probability ratio.
probability_encoded = prob_df.loc[:, "Probability_ratio"].to_dict()

In [75]:
# Add the probability-ratio encoding of deck as a new column.
df = df.assign(deck_encoded=df["deck"].map(probability_encoded))

In [76]:
df.head()

Unnamed: 0,survived,deck,deck_encoded
0,0,M,0.427386
1,1,C,1.458333
2,1,M,0.427386
3,1,C,1.458333
4,0,M,0.427386
