In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import torch
import tensorflow as tf
%matplotlib inline

### Ordinal Number Encoding

In [None]:
## Creating datafrime for last 14 days
today_date=dt.datetime.today()
days=[today_date-dt.timedelta(x) for x in range(0,15)]
df=pd.DataFrame(days)
df.columns=["Day"]
df.head(3)

Unnamed: 0,Day
0,2024-04-04 08:40:40.777642
1,2024-04-03 08:40:40.777642
2,2024-04-02 08:40:40.777642


In [None]:
df['Weekday']=df['Day'].dt.strftime("%A")
df.head(3)

Unnamed: 0,Day,Weekday
0,2024-04-04 08:40:40.777642,Thursday
1,2024-04-03 08:40:40.777642,Wednesday
2,2024-04-02 08:40:40.777642,Tuesday


In [None]:
rank_list = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7} ## Assigning ranks to the catgeories
rank_list

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [None]:
df['Weekday_Ordinal']=df['Weekday'].map(rank_list)
df.head()

Unnamed: 0,Day,Weekday,Weekday_Ordinal
0,2024-04-04 08:40:40.777642,Thursday,4
1,2024-04-03 08:40:40.777642,Wednesday,3
2,2024-04-02 08:40:40.777642,Tuesday,2
3,2024-04-01 08:40:40.777642,Monday,1
4,2024-03-31 08:40:40.777642,Sunday,7


### Count/Frequncey Encoding

In [None]:
df=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
relevant_cols=[1,3,5,6,7,8,9,13,14]
df=df[relevant_cols]
df.columns=['Employment','Degree','Marital_Status','Designation','Family_Job','Race','Sex','Country','Salary']
df.head()

Unnamed: 0,Employment,Degree,Marital_Status,Designation,Family_Job,Race,Sex,Country,Salary
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [None]:
for feature in df.columns: ## No. of unique categories per feature
  print(feature + ': ' + str(len(df[feature].unique())))

Employment: 9
Degree: 16
Marital_Status: 7
Designation: 15
Family_Job: 6
Race: 5
Sex: 2
Country: 42
Salary: 2


In [None]:
country_map = df['Country'].value_counts().to_dict() ## Frequency of each category for "Country" feature
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [None]:
df['Country'] = df['Country'].map(country_map) ## Mapping the frequency of each catgeory to that catgeory
df.head()

Unnamed: 0,Employment,Degree,Marital_Status,Designation,Family_Job,Race,Sex,Country,Salary
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95,<=50K


### Target Guided Ordinal Encoding

In [None]:
df=pd.read_csv('./titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [None]:
df['Cabin'].fillna('Missing',inplace=True)

In [None]:
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [None]:
df['Cabin']=df['Cabin'].str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [None]:
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [None]:
df.groupby(['Cabin'])['Survived'].mean() ## % of survived passengers for each catgeory of cabin (Or probability)

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [None]:
df.groupby(['Cabin'])['Survived'].mean().sort_values(ascending=True).index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [None]:
ordinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values(ascending=True).index
i=0
ordinal_label_dict={}
for label in ordinal_labels:
  ordinal_label_dict[label]=i
  i=i+1
ordinal_label_dict

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [None]:
df['Cabin_Ordinal_Labels'] = df['Cabin'].map(ordinal_label_dict)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Ordinal_Labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### Mean Encoding

In [None]:
df.groupby(['Cabin'])['Survived'].mean() ## Directly replace with the mean

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [None]:
mean_label_dict = df.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_label_dict

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [None]:
df['Cabin_Mean_Labels'] = df['Cabin'].map(mean_label_dict)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Ordinal_Labels,Cabin_Mean_Labels
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


### Probabilty Ratio Encoding

In [18]:
df=pd.read_csv('./titanic.csv',usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [19]:
df['Cabin']=df['Cabin'].fillna('Missing')
df.head(3)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing


In [5]:
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [20]:
df['Cabin']=df['Cabin'].str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [9]:
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [10]:
prob_df=df.groupby(['Cabin'])['Survived'].mean() ## Probability of survived for every Cabin category
prob_df

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [11]:
prob_df=pd.DataFrame(prob_df) ## As dataframe
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [12]:
prob_df['Died']=1-prob_df['Survived']
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [14]:
prob_df['Probability_Ratio']=prob_df['Survived']/prob_df['Died'] ## Probabilty Ratio
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [15]:
prob_df['Cabin'] ## Cabin is index not column

KeyError: 'Cabin'

In [16]:
probability_ratio_dict = prob_df['Probability_Ratio'].to_dict()
probability_ratio_dict

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [21]:
df['Cabin_Encoded']=df['Cabin'].map(probability_ratio_dict)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
