## Ordinal Number Encoding

In [151]:
import datetime

In [152]:
today = datetime.datetime.today()
today

datetime.datetime(2024, 2, 1, 20, 11, 25, 273901)

In [153]:
today - datetime.timedelta(1)

datetime.datetime(2024, 1, 31, 20, 11, 25, 273901)

In [154]:
### List Comprehension

days = [today - datetime.timedelta(x) for x in range(0,15)]

In [155]:
import pandas as pd
data = pd.DataFrame(days)
data.columns = ['Day']

In [156]:
data.head()

Unnamed: 0,Day
0,2024-02-01 20:11:25.273901
1,2024-01-31 20:11:25.273901
2,2024-01-30 20:11:25.273901
3,2024-01-29 20:11:25.273901
4,2024-01-28 20:11:25.273901


In [157]:
data['weekday'] = data['Day'].dt.day_name()

In [158]:
data.head()

Unnamed: 0,Day,weekday
0,2024-02-01 20:11:25.273901,Thursday
1,2024-01-31 20:11:25.273901,Wednesday
2,2024-01-30 20:11:25.273901,Tuesday
3,2024-01-29 20:11:25.273901,Monday
4,2024-01-28 20:11:25.273901,Sunday


In [159]:
dictionary = {'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5, 'Saturday': 6, 'Sunday':7 } 

In [160]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [161]:
data['weekday_ordinal'] = data['weekday'].map(dictionary)

In [162]:
data.head()

Unnamed: 0,Day,weekday,weekday_ordinal
0,2024-02-01 20:11:25.273901,Thursday,4
1,2024-01-31 20:11:25.273901,Wednesday,3
2,2024-01-30 20:11:25.273901,Tuesday,2
3,2024-01-29 20:11:25.273901,Monday,1
4,2024-01-28 20:11:25.273901,Sunday,7


## Count or Frequency Encoding

In [163]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None) 
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [164]:
columns = [1,3,5,6,7,8,9,13]

In [165]:
data = train_set[columns]

In [166]:
data.columns = ['Employment', 'Degree', 'Status', 'Designation', 'Family_job', 'Race', 'Sex', 'Country']

In [167]:
data.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [168]:
for feature in data.columns[:]:
    print(feature, ":", len(data[feature].unique()), "labels")

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [169]:
country_map = data['Country'].value_counts().to_dict()
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [170]:
data['Country'] = data['Country'].map(country_map)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Country'] = data['Country'].map(country_map)


Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


##### Advantages
1. Easy To Use
2. Not increasing feature space
##### Disadvantages
1. It will provide same weight if the frequencies are same

## Target Guided Ordial Encoding

1. Ordering the labels according to the target
2. Replace the labels by the joint probability of being 1 or 0

In [171]:
import pandas as pd 
df = pd.read_csv('titanic.csv', usecols = ['Cabin', 'Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [172]:
df['Cabin'].fillna('Missing', inplace = True)

In [173]:
df.head(20)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [174]:
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [175]:
df.head(20)

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
5,0,M
6,0,E
7,0,M
8,1,M
9,1,M


In [176]:
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [177]:
df.groupby(['Cabin'])['Survived'].mean().sort_values(ascending = False)

Cabin
D    0.757576
E    0.750000
B    0.744681
F    0.615385
C    0.593220
G    0.500000
A    0.466667
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [178]:
Ordinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values().index
Ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [179]:
ordinal_labels2 = {k:i for i,k in enumerate(Ordinal_labels, 0)}
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

#### enumerate(Ordinal_labels, 0): This function is used to iterate over the elements of the Ordinal_labels list along with their index. The 0 specifies the starting index.

#### k: i for i, k in enumerate(Ordinal_labels, 0): This part of the code creates key-value pairs in the dictionary. For each element k in the Ordinal_labels list and its corresponding index i, a key-value pair is formed where k is the key, and i is the value.

#### Another example 
Ordinal_label = ['low', 'medium', 'high']
ordinal_labels3 = {k: i for i, k in enumerate(Ordinal_label, 0)}

print(ordinal_labels3)


In [180]:
df['Cabin_ordinal_labels'] = df['Cabin'].map(ordinal_labels2)

In [183]:
df.head(20)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1
5,0,M,1
6,0,E,7
7,0,M,1
8,1,M,1
9,1,M,1


In [184]:
mean_ordinal = df.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [185]:
df['mean_ordinal_encode'] = df['Cabin'].map(mean_ordinal)

In [186]:
df.head(20)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
5,0,M,1,0.299854
6,0,E,7,0.75
7,0,M,1,0.299854
8,1,M,1,0.299854
9,1,M,1,0.299854
