## Oridinal Number Encoding

In [1]:
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
today_date = datetime.datetime.today() 

In [3]:
today_date

datetime.datetime(2024, 1, 9, 11, 3, 46, 914488)

In [4]:
today_date - datetime.timedelta(2)

datetime.datetime(2024, 1, 7, 11, 3, 46, 914488)

In [5]:
days = [today_date - datetime.timedelta(x) for x in range(15)]

In [6]:
df = pd.DataFrame(days)
df.columns=['Days']

In [7]:
df

Unnamed: 0,Days
0,2024-01-09 11:03:46.914488
1,2024-01-08 11:03:46.914488
2,2024-01-07 11:03:46.914488
3,2024-01-06 11:03:46.914488
4,2024-01-05 11:03:46.914488
5,2024-01-04 11:03:46.914488
6,2024-01-03 11:03:46.914488
7,2024-01-02 11:03:46.914488
8,2024-01-01 11:03:46.914488
9,2023-12-31 11:03:46.914488


In [8]:
df['weekday']=df['Days'].dt.day_name()
df.head()

Unnamed: 0,Days,weekday
0,2024-01-09 11:03:46.914488,Tuesday
1,2024-01-08 11:03:46.914488,Monday
2,2024-01-07 11:03:46.914488,Sunday
3,2024-01-06 11:03:46.914488,Saturday
4,2024-01-05 11:03:46.914488,Friday


In [9]:
df['weekday_no']=df['Days'].dt.weekday
df.head(8)

Unnamed: 0,Days,weekday,weekday_no
0,2024-01-09 11:03:46.914488,Tuesday,1
1,2024-01-08 11:03:46.914488,Monday,0
2,2024-01-07 11:03:46.914488,Sunday,6
3,2024-01-06 11:03:46.914488,Saturday,5
4,2024-01-05 11:03:46.914488,Friday,4
5,2024-01-04 11:03:46.914488,Thursday,3
6,2024-01-03 11:03:46.914488,Wednesday,2
7,2024-01-02 11:03:46.914488,Tuesday,1


In [10]:
dictionary = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}

In [11]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [12]:
df['weekday_ordinal'] = df['weekday'].map(dictionary)

In [13]:
df.head(10)

Unnamed: 0,Days,weekday,weekday_no,weekday_ordinal
0,2024-01-09 11:03:46.914488,Tuesday,1,2
1,2024-01-08 11:03:46.914488,Monday,0,1
2,2024-01-07 11:03:46.914488,Sunday,6,7
3,2024-01-06 11:03:46.914488,Saturday,5,6
4,2024-01-05 11:03:46.914488,Friday,4,5
5,2024-01-04 11:03:46.914488,Thursday,3,4
6,2024-01-03 11:03:46.914488,Wednesday,2,3
7,2024-01-02 11:03:46.914488,Tuesday,1,2
8,2024-01-01 11:03:46.914488,Monday,0,1
9,2023-12-31 11:03:46.914488,Sunday,6,7


## Count or Frequency Encoding

In [14]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [15]:
len(train_set[1].unique())

9

In [16]:
col = [1,3,5,6,7,8,9,13]

In [17]:
data = train_set[col]

In [18]:
data

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [19]:
data.columns=['Employment','Degree','Status','Designation','family_job','Race','Sex','Country']

In [20]:
data.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [21]:
for feature in data.columns[:]:
    print(f"{feature} : {len(data[feature].unique())} lables")

Employment : 9 lables
Degree : 16 lables
Status : 7 lables
Designation : 15 lables
family_job : 6 lables
Race : 5 lables
Sex : 2 lables
Country : 42 lables


In [22]:
dict1 = data['Country'].value_counts().to_dict()

In [23]:
data['Country'] = data['Country'].map(dict1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Country'] = data['Country'].map(dict1)


In [24]:
data.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


In [25]:
##  Target guided oridinal encoding 

In [26]:
df = pd.read_csv("Datasets/titanic.csv", usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [27]:
df['Cabin'].fillna("Missing",inplace=True)

In [28]:
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [29]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [30]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [32]:
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [33]:
df.groupby(['Cabin'])['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [34]:
oridinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values().index

In [35]:
oridinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [36]:
oridinal_lables2 = {k:i for i,k in enumerate(oridinal_labels,0)}

In [39]:
oridinal_lables2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [40]:
df['Oridinal_labels_cabin'] = df['Cabin'].map(oridinal_lables2)

In [41]:
df.head(10)

Unnamed: 0,Survived,Cabin,Oridinal_labels_cabin
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1
5,0,M,1
6,0,E,7
7,0,M,1
8,1,M,1
9,1,M,1


## Mean Encoding 

In [42]:
mean_ordinal=df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [43]:
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [44]:
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Oridinal_labels_cabin,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
