# Imports 

In [1]:
import numpy as np
import pandas as pd
import datetime

### Method 3: Ordinal Number encoding

In [2]:
today_date = datetime.datetime.today()

In [3]:
today_date

datetime.datetime(2021, 6, 17, 22, 48, 6, 88789)

In [4]:
today_date - datetime.timedelta(2)

datetime.datetime(2021, 6, 15, 22, 48, 6, 88789)

In [5]:
# List comprehension:
days = [today_date - datetime.timedelta(x) for x in range(15)]

In [6]:
data = pd.DataFrame(days,columns=['Day'])

In [7]:
data.head()

Unnamed: 0,Day
0,2021-06-17 22:48:06.088789
1,2021-06-16 22:48:06.088789
2,2021-06-15 22:48:06.088789
3,2021-06-14 22:48:06.088789
4,2021-06-13 22:48:06.088789


In [8]:
data['Weekday'] = data['Day'].dt.day_name()
data.head()

Unnamed: 0,Day,Weekday
0,2021-06-17 22:48:06.088789,Thursday
1,2021-06-16 22:48:06.088789,Wednesday
2,2021-06-15 22:48:06.088789,Tuesday
3,2021-06-14 22:48:06.088789,Monday
4,2021-06-13 22:48:06.088789,Sunday


In [9]:
dictionary = {'Sunday':7,
             'Monday':1,
             'Tuesday':2,
             'Wednesday':3,
             'Thursday':4,
             'Friday':5,
             'Saturday':6}

In [10]:
dictionary

{'Sunday': 7,
 'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6}

In [11]:
data['Weekday_ordinal'] = data['Weekday'].map(dictionary)

In [12]:
data.head()

Unnamed: 0,Day,Weekday,Weekday_ordinal
0,2021-06-17 22:48:06.088789,Thursday,4
1,2021-06-16 22:48:06.088789,Wednesday,3
2,2021-06-15 22:48:06.088789,Tuesday,2
3,2021-06-14 22:48:06.088789,Monday,1
4,2021-06-13 22:48:06.088789,Sunday,7


### Method 4 : Count or Frequency encoding

In [13]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)

In [14]:
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [15]:
columns = [1,3,5,6,7,8,9,13]

In [16]:
train_set = train_set[columns]

In [17]:
train_set.head()

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [18]:
train_set.columns = ['Employeement','Degree','Status','Designation',
                     'Family_job','Race','Sex','Country'] 

In [19]:
train_set.head()

Unnamed: 0,Employeement,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [20]:
for col in train_set.columns:
    print(col ," : ",len(train_set[col].unique())," labels")

Employeement  :  9  labels
Degree  :  16  labels
Status  :  7  labels
Designation  :  15  labels
Family_job  :  6  labels
Race  :  5  labels
Sex  :  2  labels
Country  :  42  labels


In [21]:
len(train_set['Degree'].unique())

16

In [22]:
train_set['Country'].value_counts().sort_values(ascending=False).index[0]

' United-States'

In [23]:
country_map = train_set['Country'].value_counts().to_dict()

In [24]:
#Replacing it with the number of occurences:
train_set['Country'] = train_set['Country'].map(country_map)

In [25]:
train_set.head()

Unnamed: 0,Employeement,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


#### Advantage:
1) Does not need additional column
#### Disadvantage:
1) It will provide same weight, if frequency is same. 

## Method 5: Target guided ordinal encoding
1. Ordering the labels according to the target
2. Replace the labels by the joint prob of being 1 or 0

In [26]:
Titanic = pd.read_csv('train.csv',usecols=['Cabin','Survived'])

In [27]:
Titanic.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [28]:
Titanic['Cabin'].fillna("Missing",inplace=True)

In [60]:
Titanic.head(10)

Unnamed: 0,Survived,Cabin_map_lables
0,0,1
1,1,4
2,1,1
3,1,4
4,0,1
5,0,1
6,0,7
7,0,1
8,1,1
9,1,1


In [30]:
#Taking the 1st letter in the Cabin column:
Titanic['Cabin'] = Titanic['Cabin'].astype(str).str[0]

In [31]:
Titanic.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [37]:
#Giving the percent of ppl survived in each cabin:
Titanic.groupby(['Cabin'])['Survived'].mean().sort_values(ascending=True)

Cabin
T    0.000000
M    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [38]:
Titanic.groupby(['Cabin'])['Survived'].mean().sort_values(ascending=True).index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [39]:
ordinal_labels =  Titanic.groupby(['Cabin'])['Survived'].mean().sort_values(ascending=True).index

In [53]:
ordinal_labels_2 = {k:i for i,k in enumerate(ordinal_labels,0)}

In [54]:
ordinal_labels_2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [57]:
Titanic['Cabin_map_lables'] =  Titanic['Cabin'].map(ordinal_labels_2)

In [58]:
Titanic.head()

Unnamed: 0,Survived,Cabin,Cabin_map_lables
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


In [59]:
Titanic.drop(['Cabin'],axis=1,inplace=True)
Titanic.head()

Unnamed: 0,Survived,Cabin_map_lables
0,0,1
1,1,4
2,1,1
3,1,4
4,0,1


## Method 6: Mean encoding

In [65]:
Titanic = pd.read_csv('train.csv',usecols=['Cabin','Survived'])
Titanic['Cabin'] = Titanic['Cabin'].astype(str).str[0]

In [68]:
Map_values = Titanic.groupby(['Cabin'])['Survived'].mean().to_dict()

In [69]:
Titanic['Cabin_value'] = Titanic['Cabin'].map(Map_values)

In [70]:
Titanic.head()

Unnamed: 0,Survived,Cabin,Cabin_value
0,0,n,0.299854
1,1,C,0.59322
2,1,n,0.299854
3,1,C,0.59322
4,0,n,0.299854


#### Disadvantage :
1. Can lead to overfitting