# 1. One Hot Encoding

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('titanic_train_dataset.csv', usecols = ['Sex'])

In [3]:
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Sex     891 non-null    object
dtypes: object(1)
memory usage: 7.1+ KB


In [5]:
#finding out no. of unique values in the 'Sex' categorical feature 
df['Sex'].unique()

array(['male', 'female'], dtype=object)

- Hence this categorical feature has 2 categories.

In [6]:
# dtype=int keeps the dummy columns as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False booleans).
pd.get_dummies(df, dtype=int)

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


- Here one feature is enough to describe the data: Sex_male = 1 means male and Sex_male = 0 means female. Hence we will drop one of the two dummy features.

In [7]:
# drop_first=True removes the first dummy column (Sex_female); dtype=int keeps the
# remaining column as 0/1 integers on every pandas version (pandas >= 2.0 defaults to bool).
pd.get_dummies(df, drop_first=True, dtype=int)

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [8]:
#exploring another categorical feature
df = pd.read_csv('titanic_train_dataset.csv', usecols = ['Embarked'])

In [9]:
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Embarked  889 non-null    object
dtypes: object(1)
memory usage: 7.1+ KB


In [11]:
#finding out no. of unique values in the categorical feature 'Embarked'
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

- There are 3 unique values plus NaN in the 'Embarked' feature. Hence we are going to drop the rows that contain NaN before encoding.

In [12]:
# Drop the rows whose 'Embarked' value is missing. Rebinding df (instead of inplace=True)
# keeps the step explicit and chainable.
df = df.dropna(subset=['Embarked'])

In [13]:
df['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [14]:
# dtype=int keeps the dummy columns as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False booleans).
pd.get_dummies(df, dtype=int)

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


- Here 2 features are enough to describe the data: a row with 0 in both Embarked_Q and Embarked_S must be 'C'. Hence we can drop one feature.

In [15]:
# drop_first=True removes the first dummy column (Embarked_C); dtype=int keeps the
# remaining columns as 0/1 integers on every pandas version (pandas >= 2.0 defaults to bool).
pd.get_dummies(df, drop_first=True, dtype=int)

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


- One disadvantage of the one hot encoding technique is that it adds more and more features to the data frame.
- If there are k unique values in a categorical feature, one hot encoding needs only k-1 new features to represent it.

__What if you have many many categories, can you perform one hot encoding for those?__

### One hot encoding with many categories in a feature

In [16]:
df = pd.read_csv('Mercedes_Benz_Greener_Manufacturing.csv', usecols = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'])
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [17]:
#finding out unique values/ categories in feature 'X0'
df['X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [18]:
# Number of distinct values in 'X0'; dropna=False counts NaN as a value, as unique() does.
df['X0'].nunique(dropna=False)

47

- So there are 47 unique categories in the feature 'X0'. If we apply one hot encoding, 47 new features will get added to the dataframe.

In [19]:
# Print the number of distinct values of every column, one count per line.
for column_name in df:
    print(df[column_name].nunique(dropna=False))

47
27
44
7
4
29
12


- These are the numbers of categories in each feature. Applying one hot encoding to all of them would add a very large number of columns. One way to handle this problem is the __Ensemble Selection Technique__.
- In this technique, only the 10 most frequent categories of each feature are selected, and one hot encoding is applied to just those categories.

In [20]:
df.X1.value_counts()

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
y      23
f      23
j      22
n      19
k      17
p       9
g       6
q       3
d       3
ab      3
Name: X1, dtype: int64

- Here you can see that the categories after 'w' occur very rarely in the feature. The top 10 categories account for most of the rows.

In [21]:
# value_counts() already returns the counts in descending order, so the 10 most
# frequent categories are simply its first 10 rows; no extra sort is needed.
df['X1'].value_counts().head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

- So we will take just these top 10 categories and apply one hot encoding to them. The remaining categories of this feature will not get their own columns.

In [22]:
# Labels of the 10 most frequent X1 categories as a plain Python list.
# value_counts() is already sorted by descending count, so no re-sort is required.
lst_10 = df['X1'].value_counts().head(10).index.tolist()

In [23]:
#so the 10 most frequent categories of the X1 feature are
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [25]:
import numpy as np

# Add one 0/1 indicator column per frequent X1 category, named after the category.
# 'X1' itself is skipped: a later cell appends it to lst_10, and re-running this loop
# would otherwise overwrite the source column with zeros.
for categories in lst_10:
    if categories == 'X1':
        continue
    df[categories] = np.where(df['X1'] == categories, 1, 0)

In [26]:
lst_10.append('X1')

In [27]:
df[lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


In [28]:
df

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,ak,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0
4205,j,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1
4206,ak,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0
4207,al,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0


- So finally we increased the dimension of the dataset by 10.

# 2. Ordinal Number Encoding
- Ordinal number encoding applies when the categories have a natural order, so that each category can be assigned a rank.

In [29]:
import datetime

today_datetime = datetime.datetime.today()
today_datetime

datetime.datetime(2021, 6, 25, 7, 57, 7, 327145)

In [30]:
today_datetime - datetime.timedelta(2)

datetime.datetime(2021, 6, 23, 7, 57, 7, 327145)

In [32]:
# Build the last 15 dates, newest first; an offset of 0 days is today itself.
days = [today_datetime - datetime.timedelta(days=offset) for offset in range(15)]
days

[datetime.datetime(2021, 6, 25, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 24, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 23, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 22, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 21, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 20, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 19, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 18, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 17, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 16, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 15, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 14, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 13, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 12, 7, 57, 7, 327145),
 datetime.datetime(2021, 6, 11, 7, 57, 7, 327145)]

In [34]:
import pandas as pd

#creating data frame of the last 15 days data that we have
df = pd.DataFrame(days)
df

Unnamed: 0,0
0,2021-06-25 07:57:07.327145
1,2021-06-24 07:57:07.327145
2,2021-06-23 07:57:07.327145
3,2021-06-22 07:57:07.327145
4,2021-06-21 07:57:07.327145
5,2021-06-20 07:57:07.327145
6,2021-06-19 07:57:07.327145
7,2021-06-18 07:57:07.327145
8,2021-06-17 07:57:07.327145
9,2021-06-16 07:57:07.327145


In [37]:
#providing a column name
df.columns = ['Day']
df.head()

Unnamed: 0,Day
0,2021-06-25 07:57:07.327145
1,2021-06-24 07:57:07.327145
2,2021-06-23 07:57:07.327145
3,2021-06-22 07:57:07.327145
4,2021-06-21 07:57:07.327145


In [45]:
#getting name of the days & capturing it in a new column
# day_name() returns English names by default regardless of the system locale, whereas
# strftime('%A') follows the locale and could produce names that the English-keyed
# weekday mapping used later does not contain.
df['weekday'] = df['Day'].dt.day_name()
df['weekday']

0        Friday
1      Thursday
2     Wednesday
3       Tuesday
4        Monday
5        Sunday
6      Saturday
7        Friday
8      Thursday
9     Wednesday
10      Tuesday
11       Monday
12       Sunday
13     Saturday
14       Friday
Name: weekday, dtype: object

In [46]:
df

Unnamed: 0,Day,weekday
0,2021-06-25 07:57:07.327145,Friday
1,2021-06-24 07:57:07.327145,Thursday
2,2021-06-23 07:57:07.327145,Wednesday
3,2021-06-22 07:57:07.327145,Tuesday
4,2021-06-21 07:57:07.327145,Monday
5,2021-06-20 07:57:07.327145,Sunday
6,2021-06-19 07:57:07.327145,Saturday
7,2021-06-18 07:57:07.327145,Friday
8,2021-06-17 07:57:07.327145,Thursday
9,2021-06-16 07:57:07.327145,Wednesday


In [47]:
# Weekday name -> rank, Monday = 1 through Sunday = 7.
# NOTE: this rebinds the built-in name `dict` for the rest of the kernel session; the
# name is kept because the next cell passes it to .map().
dict = {
    day_name: rank
    for rank, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}
dict

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [49]:
df['weekday_ordinal'] = df['weekday'].map(dict)
df

Unnamed: 0,Day,weekday,weekday_ordinal
0,2021-06-25 07:57:07.327145,Friday,5
1,2021-06-24 07:57:07.327145,Thursday,4
2,2021-06-23 07:57:07.327145,Wednesday,3
3,2021-06-22 07:57:07.327145,Tuesday,2
4,2021-06-21 07:57:07.327145,Monday,1
5,2021-06-20 07:57:07.327145,Sunday,7
6,2021-06-19 07:57:07.327145,Saturday,6
7,2021-06-18 07:57:07.327145,Friday,5
8,2021-06-17 07:57:07.327145,Thursday,4
9,2021-06-16 07:57:07.327145,Wednesday,3


# 3. Count or Frequency Encoding

In [109]:
# Load the UCI Adult data set. The file has no header row, so pandas numbers the
# columns itself.
adult_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
train_set = pd.read_csv(adult_url, header=None, index_col=None)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [110]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       32561 non-null  int64 
 1   1       32561 non-null  object
 2   2       32561 non-null  int64 
 3   3       32561 non-null  object
 4   4       32561 non-null  int64 
 5   5       32561 non-null  object
 6   6       32561 non-null  object
 7   7       32561 non-null  object
 8   8       32561 non-null  object
 9   9       32561 non-null  object
 10  10      32561 non-null  int64 
 11  11      32561 non-null  int64 
 12  12      32561 non-null  int64 
 13  13      32561 non-null  object
 14  14      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [111]:
#categories in feature 1
train_set[1].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [112]:
#categories in feature 14
train_set[14].unique()

array([' <=50K', ' >50K'], dtype=object)

In [113]:
#getting only the categorical features
columns = [1, 3, 5, 6, 7, 8, 9, 13]
# .copy() makes train_set an independent frame rather than a slice of the loaded data,
# so later column assignments do not raise SettingWithCopyWarning.
train_set = train_set[columns].copy()

#naming the columns
train_set.columns = ['Employment', 'Degree', 'Status', 'Designation', 'Family_job', 'Race', 'Sex', 'Country']
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [114]:
# Report how many distinct labels each categorical feature contains.
for feature_name in train_set.columns:
    label_count = train_set[feature_name].nunique(dropna=False)
    print(f'{feature_name} : {label_count} labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


- There are 42 categories in 'Country' feature. So we will apply count or frequency encoding on it.

In [117]:
#value of each category in Country feature
country_map = train_set['Country'].value_counts().to_dict()
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' Greece': 29,
 ' France': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Trinadad&Tobago': 19,
 ' Cambodia': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

- Now we will replace each Country category with its frequency. This is what Count or Frequency Encoding means.

In [118]:
# Replace every Country label with its row count from country_map.
# assign() returns a new DataFrame, so nothing is written into train_set in place;
# train_set was built as a column subset, and assigning into it directly can raise
# SettingWithCopyWarning.
train_set = train_set.assign(Country=train_set['Country'].map(country_map))
train_set.head(20)

Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


__Advantages :__  
- Easy to use.
- Not increasing feature space

__Disadvantages :__  
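- Categories that have the same frequency receive the same encoded value (e.g. ' Greece' and ' France' both occur 29 times), so the model can no longer tell them apart.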

# 4. Target Guided Ordinal Encoding
- Ordering the labels according to the target.
- Replace each label with its rank after sorting the labels by the mean of the target (here, the survival rate of each Cabin letter).

In [119]:
df = pd.read_csv('titanic_train_dataset.csv', usecols = ['Cabin', 'Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [121]:
#filling NaN of 'Cabin' feature with 'Missing'
# Assign the result back instead of calling fillna(inplace=True) on df['Cabin']:
# the in-place call acts on a temporary Series, which newer pandas versions warn about
# and which leaves df unchanged when Copy-on-Write is enabled.
df['Cabin'] = df['Cabin'].fillna('Missing')
df.head(10)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [122]:
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [128]:
#taking the first letter of each Category of 'Cabin' feature
df['Cabin'] = df['Cabin'].astype(str).str[0]
df['Cabin']

0      M
1      C
2      M
3      C
4      M
      ..
886    M
887    B
888    M
889    C
890    M
Name: Cabin, Length: 891, dtype: object

In [127]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [129]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [131]:
#fraction of passengers who survived, per Cabin letter, sorted in ascending order
df.groupby(['Cabin'])['Survived'].mean().sort_values()

Cabin
T    0.000000
M    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [132]:
ordinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [134]:
# Map each Cabin letter to its position in ordinal_labels, starting from 0.
ordinal_lebels2 = {cabin_letter: rank for rank, cabin_letter in enumerate(ordinal_labels)}
ordinal_lebels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [135]:
df['Cabin_ordinal_labels'] = df['Cabin'].map(ordinal_lebels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


- Now we can drop the Cabin feature as we have the values of this feature in integer form in Cabin_ordinal_labels feature.

# 5. Mean Encoding

In [136]:
df = pd.read_csv('titanic_train_dataset.csv', usecols = ['Cabin', 'Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [137]:
#filling NaN of 'Cabin' feature with 'Missing'
# Assign the result back instead of calling fillna(inplace=True) on df['Cabin']:
# the in-place call acts on a temporary Series, which newer pandas versions warn about
# and which leaves df unchanged when Copy-on-Write is enabled.
df['Cabin'] = df['Cabin'].fillna('Missing')
df.head(10)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [138]:
#taking the first letter of each Category of 'Cabin' feature
df['Cabin'] = df['Cabin'].astype(str).str[0]
df['Cabin']

0      M
1      C
2      M
3      C
4      M
      ..
886    M
887    B
888    M
889    C
890    M
Name: Cabin, Length: 891, dtype: object

In [139]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [141]:
#fraction of passengers who survived, per Cabin letter, as a dictionary
df.groupby(['Cabin'])['Survived'].mean().to_dict()

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

- So according to Mean Encoding, wherever there will be A, we will replace that by 0.4666666666666667 and similar for others.

In [142]:
mean_ordinal = df.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [143]:
df['mean_ordinal_encode'] = df['Cabin'].map(mean_ordinal)
df

Unnamed: 0,Survived,Cabin,mean_ordinal_encode
0,0,M,0.299854
1,1,C,0.593220
2,1,M,0.299854
3,1,C,0.593220
4,0,M,0.299854
...,...,...,...
886,0,M,0.299854
887,1,B,0.744681
888,0,M,0.299854
889,1,C,0.593220


- Now we have the replacement values for the categories of the 'Cabin' feature in the 'mean_ordinal_encode' feature.

# 6. Probability Ratio Encoding
Steps:  
- Probability of Survived based on Cabin
- Probability of Not Survived
- Probability of Survived / Probability of Not Survived
- Dictionary to map cabin with probability 
- Replace with the categorical feature

In [144]:
df = pd.read_csv('titanic_train_dataset.csv', usecols = ['Cabin', 'Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [145]:
# Fill missing 'Cabin' values with 'Missing'. The result is assigned back instead of
# calling fillna(inplace=True) on df['Cabin']: the in-place call acts on a temporary
# Series, which newer pandas versions warn about and which leaves df unchanged when
# Copy-on-Write is enabled.
df['Cabin'] = df['Cabin'].fillna('Missing')
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [146]:
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [147]:
df['Cabin'] = df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [148]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [151]:
#% of survived based on cabin
prob_df = df.groupby(['Cabin'])['Survived'].mean()
prob_df

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [152]:
prob_df = pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [153]:
#probability of not surviving (dying), per Cabin letter
prob_df['Died'] = 1 - prob_df['Survived']

In [154]:
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [155]:
# Probability ratio per Cabin letter: P(survived) / P(died).
# NOTE: a group whose 'Died' value is 0 would get an infinite ratio.
prob_df['Probability_ratio'] = prob_df['Survived'].div(prob_df['Died'])

In [156]:
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [161]:
probability_encoded = prob_df['Probability_ratio'].to_dict()
probability_encoded

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [160]:
df['Cabin_encoded'] = df['Cabin'].map(probability_encoded)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274


- Now 'Cabin' feature can be removed & 'Cabin_encoded' will be used for model creation.