# Target guided ordinal encoding
- ordering the labels according to the target
- Replace the labels by the joint probability of being 1 or 0

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Show every column and every row when a frame or series is displayed.
# Options are set through the documented top-level `pd.set_option`;
# `pd.pandas` is not part of the documented pandas API.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
# Load only the two columns used in this notebook: the Cabin feature and the Survived target.
df = pd.read_csv("titanic.csv", usecols=["Cabin", "Survived"])

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [4]:
# Number of missing values in each column
df.isna().sum()

Survived      0
Cabin       687
dtype: int64

In [5]:
# Give missing cabins their own explicit category.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the df["Cabin"] selection: that form is chained assignment, which warns
# on pandas 2.x and leaves df unchanged when Copy-on-Write is enabled.
df["Cabin"] = df["Cabin"].fillna("Missing")

In [6]:
# Check that formerly missing cabins now read "Missing".
df.head(10)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [7]:
# Frequency of each raw cabin value, including the "Missing" placeholder.
df["Cabin"].value_counts()

Missing            687
B96 B98              4
G6                   4
C23 C25 C27          4
F33                  3
E101                 3
D                    3
C22 C26              3
F2                   3
D33                  2
D36                  2
E33                  2
E8                   2
C83                  2
C78                  2
C65                  2
B49                  2
B5                   2
C92                  2
B51 B53 B55          2
C124                 2
E121                 2
B22                  2
B35                  2
C68                  2
D17                  2
C93                  2
F G73                2
C126                 2
C123                 2
D26                  2
C125                 2
E67                  2
C52                  2
B20                  2
B77                  2
F4                   2
E24                  2
E25                  2
C2                   2
B58 B60              2
B18                  2
D20                  2
D35        

In [8]:
# Keep only the first character of each cabin value ("Missing" becomes "M").
df["Cabin"] = df["Cabin"].astype(str).str.get(0)

In [9]:
# Check that each cabin has been reduced to a single letter.
df.head(10)

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
5,0,M
6,0,E
7,0,M
8,1,M
9,1,M


In [10]:
# Distinct cabin letters, in order of first appearance
pd.unique(df["Cabin"])

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [11]:
# Mean of Survived (i.e. the survival rate) for each cabin letter
df.groupby("Cabin")["Survived"].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [12]:
# Cabin letters ordered from the lowest to the highest mean survival rate
ordinal_labels = (
    df.groupby("Cabin")["Survived"]
    .mean()
    .sort_values()
    .index
)
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [13]:
enumerate(ordinal_labels,0)  # Pairs each label with a rank starting at 0; the enumerate object is lazy, so only its repr is displayed here

<enumerate at 0x7ff1b3664f40>

In [14]:
# Rank each cabin letter by its position in ordinal_labels, counting from 0,
# so a letter with a higher mean survival rate receives a larger integer.
ordinal_labels2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))
ordinal_labels2  # 'D' ends up with the largest rank

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [15]:
# Target-guided ordinal encoding: replace each cabin letter with its rank from ordinal_labels2.
df["Cabin_ordinal_labels"] = df['Cabin'].map(ordinal_labels2)

In [16]:
# Check the new rank column next to the original cabin letter.
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


In [17]:
# Look at a larger sample of the encoded rows.
df.head(20)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1
5,0,M,1
6,0,E,7
7,0,M,1
8,1,M,1
9,1,M,1


# Mean Encoding

In [19]:
# Survival rate per cabin letter, stored as a plain {letter: rate} dict
mean_ordinal = (
    df.groupby("Cabin")["Survived"]
    .mean()
    .to_dict()
)

In [21]:
# Mean encoding: replace each cabin letter with the survival rate stored in mean_ordinal.
# NOTE(review): the rates are computed from the same rows they are applied to;
# confirm whether they should be fit on a training split only to avoid target leakage.
df["Mean_Ordinal_encoding"] = df["Cabin"].map(mean_ordinal)
df.head(10)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,Mean_Ordinal_encoding
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
5,0,M,1,0.299854
6,0,E,7,0.75
7,0,M,1,0.299854
8,1,M,1,0.299854
9,1,M,1,0.299854
