In [25]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings("ignore")

### 1. One Hot Encoding

In [26]:
# Load only the 'Sex' column from the Titanic workbook.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file location exists.
df = pd.read_excel(
    r"C:\Users\91983\Desktop\Machine Learning\Wipro TopGear\Python for Data Science L1 Hands On\titanic.xlsm",
    usecols=['Sex'],
)
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [27]:
# One-hot encode 'Sex'. drop_first=True drops the first dummy level, so a single
# indicator column (Sex_male) is enough to describe the two categories.
# dtype=int is requested explicitly so the dummies are 0/1 integers on every pandas
# version (pandas 2.0 changed the default dummy dtype to bool, i.e. True/False).
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [28]:
# Load only the 'Embarked' column from the Titanic workbook.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file location exists.
df = pd.read_excel(
    r"C:\Users\91983\Desktop\Machine Learning\Wipro TopGear\Python for Data Science L1 Hands On\titanic.xlsm",
    usecols=['Embarked'],
)
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [29]:
# Inspect the distinct values first; missing entries appear as NaN.
print(df['Embarked'].unique())

# Drop the rows whose 'Embarked' value is missing.
# dropna(subset=...) keeps `df` a DataFrame, so this cell can be re-run safely.
# Rebinding `df` to the 'Embarked' Series instead would make a second run fail on
# df['Embarked'], because a Series has no such label.
df = df.dropna(subset=['Embarked'])

['S' 'C' 'Q' nan]


In [31]:
# One-hot encode 'Embarked', keeping every level (drop_first=False) and naming the
# new columns with the 'Embarked' prefix.
# dtype=int is requested explicitly so the dummies are 0/1 integers on every pandas
# version (pandas 2.0 changed the default dummy dtype to bool, i.e. True/False).
pd.get_dummies(df, drop_first=False, prefix='Embarked', dtype=int).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


### 2. One Hot Encoding with Multiple Categories in Feature

In [33]:
# Read only the categorical feature columns X0..X6 from the Mercedes dataset
# (file is resolved relative to the notebook's working directory).
feature_columns = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
df = pd.read_csv('mercedes.csv', usecols=feature_columns)
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [34]:
# Report how many distinct categories each column holds
for column_name, category_count in df.nunique().items():
    print(f"{column_name} : {category_count}")

X0 : 47
X1 : 27
X2 : 44
X3 : 7
X4 : 4
X5 : 29
X6 : 12


In [35]:
# For a feature with many categories, one option is to one-hot encode only the
# n most frequently occurring categories. Here: the 10 most frequent labels of X1.
high_10 = df['X1'].value_counts().head(10).index.tolist()

# Add one indicator column per frequent label: 1 where X1 equals that label, 0 otherwise.
# Rows holding any other label get 0 in all ten columns.
# The new columns are named after the raw labels themselves.
for label in high_10:
    df[label] = np.where(df['X1'] == label, 1, 0)

df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


### 3. Ordinal Encoding

In [36]:
import datetime

In [39]:
# Capture the current local date and time (naive datetime, no timezone attached)
today_date = datetime.datetime.now()
today_date

datetime.datetime(2021, 7, 19, 12, 49, 38, 78607)

In [40]:
# Subtracting a timedelta from a datetime gives a new datetime; here, one day earlier
today_date - datetime.timedelta(days=1)

datetime.datetime(2021, 7, 18, 12, 49, 38, 78607)

In [44]:
# Build the last 15 dates: today (offset 0) back to 14 days ago
days = [today_date - datetime.timedelta(days=offset) for offset in range(15)]

# Wrap the list in a single-column DataFrame
data = pd.DataFrame({'Day': days})
data.head()

Unnamed: 0,Day
0,2021-07-19 12:49:38.078607
1,2021-07-18 12:49:38.078607
2,2021-07-17 12:49:38.078607
3,2021-07-16 12:49:38.078607
4,2021-07-15 12:49:38.078607


In [49]:
# Derive the weekday name (Monday, Tuesday, ...) of each date and keep it in its own column
weekday_names = data['Day'].dt.day_name()
data['weekday'] = weekday_names
data.head()

Unnamed: 0,Day,weekday
0,2021-07-19 12:49:38.078607,Monday
1,2021-07-18 12:49:38.078607,Sunday
2,2021-07-17 12:49:38.078607,Saturday
3,2021-07-16 12:49:38.078607,Friday
4,2021-07-15 12:49:38.078607,Thursday


In [50]:
# Ordinal codes for the weekdays: Monday = 1 through Sunday = 7
ordered_weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dictionary = {name: position for position, name in enumerate(ordered_weekdays, start=1)}

# Translate every weekday name into its ordinal code
data['weekday_ordinal'] = data['weekday'].map(dictionary)
data.head()

Unnamed: 0,Day,weekday,weekday_ordinal
0,2021-07-19 12:49:38.078607,Monday,1
1,2021-07-18 12:49:38.078607,Sunday,7
2,2021-07-17 12:49:38.078607,Saturday,6
3,2021-07-16 12:49:38.078607,Friday,5
4,2021-07-15 12:49:38.078607,Thursday,4


### 4. Count Frequency Encoding

In [52]:
# Download the UCI Adult dataset. header=None means no row is used as a header,
# so the columns are labelled with integers.
data = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    index_col=None,
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [64]:
# Collect the labels of all object-dtype columns, skipping column 14
# (the one holding the `<=50K` values in the preview of the raw data).
columns = [label for label in data.columns if label != 14 and data[label].dtype == 'object']

# Keep only those categorical columns for the rest of this section
data = data[columns]

In [65]:
# Replace the integer column labels with descriptive names (one name per kept column, in order)
descriptive_names = [
    'Employment', 'Degree', 'Status', 'Designation',
    'family_job', 'Race', 'Sex', 'Country',
]
data.columns = descriptive_names

data.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [66]:
# Report the number of distinct labels in every column
for column_name, label_count in data.nunique().items():
    print(f"{column_name} : {label_count} labels")

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [69]:
# Use the Country column to demonstrate count/frequency encoding, since it has the most unique labels.
# Build a lookup of label -> number of rows carrying that label.
country_counts = data['Country'].value_counts()
country_map = country_counts.to_dict()

In [70]:
# Replace each country label with how often it occurs.
# This overwrites the 'Country' column itself (no separate column is created), so
# re-running this cell without rebuilding `data` would look up counts, not labels,
# in country_map.
country_frequency = data['Country'].map(country_map)
data['Country'] = country_frequency
data.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


#### Advantages:
##### 1. Easy to implement
##### 2. No increase in Feature Space

#### Disadvantage:
##### 1. Assigns the same value to two different categories if they have the same number of occurrences

### 5. Target Guided Ordinal Encoding


##### 1. Order labels according to the target
##### 2. Replace labels by joint probability of being 1 or 0

In [75]:
# Load the target ('Survived') together with the 'Cabin' feature from the Titanic workbook.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file location exists.
df = pd.read_excel(
    r"C:\Users\91983\Desktop\Machine Learning\Wipro TopGear\Python for Data Science L1 Hands On\titanic.xlsm",
    usecols=['Cabin', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [76]:
# Treat a missing cabin as a category of its own ('Missing') so that information is kept.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# df['Cabin']: an in-place fill on a selected column is deprecated in pandas 2.x and,
# with Copy-on-Write enabled, does not update `df` at all.
df['Cabin'] = df['Cabin'].fillna('Missing')
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [77]:
# Keep only the first character of each cabin value (e.g. 'C85' -> 'C', 'Missing' -> 'M')
cabin_as_text = df['Cabin'].astype(str)
df['Cabin'] = cabin_as_text.str.get(0)
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [79]:
# Mean of 'Survived' per cabin letter
survival_by_cabin = df.groupby('Cabin')['Survived'].mean()

# Sort ascending and keep only the index: cabin letters ordered from lowest to highest mean survival
ordinal_label = survival_by_cabin.sort_values().index
ordinal_label

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [80]:
# Map every cabin letter to its position in ordinal_label (0 = lowest mean survival)
ordinal_label_dict = dict(zip(ordinal_label, range(len(ordinal_label))))
ordinal_label_dict

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [82]:
# Look up each row's cabin letter in ordinal_label_dict and store the rank in a new column
cabin_rank = df['Cabin'].map(ordinal_label_dict)
df['Cabin_ordinal_label'] = cabin_rank
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### 6. Mean Encoding

In [83]:
# Mean encoding: each cabin letter is replaced by the mean of 'Survived' for that letter.
# NOTE(review): the means are computed over every row of df; when feeding a model,
# derive them from the training split only to avoid leaking the target.
cabin_survival_rate = df.groupby('Cabin')['Survived'].mean()
mean_encode_values_dict = cabin_survival_rate.to_dict()

df['mean_encode_values'] = df['Cabin'].map(mean_encode_values_dict)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label,mean_encode_values
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
