Day 3

# Handling Categorical Features

In [25]:
import pandas as pd
df=pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### One Hot Encoding

Here we use .dummies() to implement, it replaces with 0's and 1's in the respective categories and we drop one of the column

In [26]:
df_one_hot=pd.read_csv('titanic.csv',usecols=['Sex'])
df_one_hot.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [27]:
#we use pandas to get the dummies and supply the data set to do so
pd.get_dummies(df_one_hot).head()

#here the Sex_female and Sex_male is created as only there are 0 ans 1 we can use only one column in this and discard the other


Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [28]:
#here the Sex_female and Sex_male is created as only there are 0 ans 1 we can use only one column in
#this and discard the other there fore we use drop_first attribute 

pd.get_dummies(df_one_hot,drop_first=1).head()


Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [30]:
# ['Embarked'] feature has more categories

df_one_hot=pd.read_csv('titanic.csv',usecols=['Embarked'])
df_one_hot.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [32]:
df_one_hot['Embarked'].dropna().unique()

array(['S', 'C', 'Q'], dtype=object)

In [35]:
pd.get_dummies(df_one_hot,drop_first=True).head()
# here we have dropped Embarked_C column and this creates [no of categories = no of columns] 

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


### One Hot Encoding with more categorical features

In [38]:
df_merc_onehot=pd.read_csv('merc.csv',usecols=['X0','X1','X2','X3','X4','X5','X6'])
df_merc_onehot.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [44]:
for i in df_merc_onehot.columns:
    print(len(df_merc_onehot[i].unique()))

# we got the no of columns in each columns 

47
27
44
7
4
29
12


KDD cup orange challenge where we take first 10 most repeating categories and apply one hot encoding
Ensembled technique

In [46]:
df_merc_onehot.X1.value_counts(ascending=False).head(10)
# getting the top 10 most frequent categories for one hot encoding

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [49]:
# storing the values of top 10 categories in one place as list
lst_10=df_merc_onehot.X1.value_counts(ascending=False).head(10).index
lst_10=list(lst_10)
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [50]:
import numpy as np
for categories in lst_10:
    df_merc_onehot[categories]=np.where(df_merc_onehot['X1']==categories,1,0)

In [52]:
lst_10.append('X1')
df_merc_onehot[lst_10].head()

# the variable present in the X1 is replaced by 1 and the rest is 0

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


Day4

### Ordinal Number Encoding

Where thr categories is ordered in sach a way that it can assign rank

In [1]:
import datetime

In [9]:
today_date=datetime.datetime.today()
today_date

datetime.datetime(2023, 5, 18, 16, 40, 31, 754689)

In [14]:
## Lisyt Comprehension

# we are creating a list of 15 days from today
days=[today_date-datetime.timedelta(x) for x in range(0,15)] # timedelta() is used to subtract today eith the no specified date

In [24]:
data=pd.DataFrame(days)
data.columns=["Day"]

In [25]:
data.head()

Unnamed: 0,Day
0,2023-05-18 16:40:31.754689
1,2023-05-17 16:40:31.754689
2,2023-05-16 16:40:31.754689
3,2023-05-15 16:40:31.754689
4,2023-05-14 16:40:31.754689


In [34]:
data['Weekday']=data['Day'].dt.day_name()
data.head()

Unnamed: 0,Day,Weekday
0,2023-05-18 16:40:31.754689,Thursday
1,2023-05-17 16:40:31.754689,Wednesday
2,2023-05-16 16:40:31.754689,Tuesday
3,2023-05-15 16:40:31.754689,Monday
4,2023-05-14 16:40:31.754689,Sunday


In [38]:
# creating the dictionary
dictionary={'Monday':1,"Tuesday":2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}

In [39]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [41]:
data['weekday_ordinal']=data['Weekday'].map(dictionary)
data.head()

Unnamed: 0,Day,Weekday,weekday_ordinal
0,2023-05-18 16:40:31.754689,Thursday,4
1,2023-05-17 16:40:31.754689,Wednesday,3
2,2023-05-16 16:40:31.754689,Tuesday,2
3,2023-05-15 16:40:31.754689,Monday,1
4,2023-05-14 16:40:31.754689,Sunday,7


### Count / Frequency Encoding



In [44]:
train_set=pd.read_csv('adult.csv',header=None,index_col=None)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [48]:
columns=[1,3,5,6,7,8,9,13]   #selecting the ordinal categorical columns
train_set=train_set[columns]
train_set.head()

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [50]:
# Naming the features
train_set.columns=['Employment','Degree','Status','Designation','Family_Job','Race','Sex','Country']
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_Job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [54]:
for category in train_set:
    print(category,':',len(train_set[category].value_counts().unique()))
# Getting the number of labels in the each category

Employment : 9
Degree : 16
Status : 7
Designation : 15
Family_Job : 6
Race : 5
Sex : 2
Country : 38


In [58]:
Country_map=train_set['Country'].value_counts().to_dict() # converting the value and the categories to dictionaries format

In [59]:
train_set['Country_map']=train_set['Country'].map(Country_map)

In [60]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_Job,Race,Sex,Country,Country_map
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,95


### Target Guided Ordinal Encoding

1.Ordering Lables according to the target

2.Replace the labels by the joint probability of being 1's and 0's

In [2]:
df_target_encode=pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
df_target_encode.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [3]:
df_target_encode['Cabin'].fillna('Missing')  # filling the NaN values with "missing" 

0      Missing
1          C85
2      Missing
3         C123
4      Missing
        ...   
886    Missing
887        B42
888    Missing
889       C148
890    Missing
Name: Cabin, Length: 891, dtype: object

In [26]:
df_target_encode['Cabin'].astype(str).str[0] # here we got the first letter of the cabin which tells the class of the ticket

0      n
1      C
2      n
3      C
4      n
      ..
886    n
887    B
888    n
889    C
890    n
Name: Cabin, Length: 891, dtype: object

In [5]:
df_target_encode['Cabin']=df_target_encode['Cabin'].astype(str).str[0]
df_target_encode.head()

Unnamed: 0,Survived,Cabin
0,0,n
1,1,C
2,1,n
3,1,C
4,0,n


In [6]:
df_target_encode.Cabin.unique() # gives all the categories

array(['n', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [7]:
df_target_encode.groupby(['Cabin'])['Survived'].mean() # grouping wrt cabin and mean of survived

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
T    0.000000
n    0.299854
Name: Survived, dtype: float64

In [8]:
df_target_encode.groupby(['Cabin'])['Survived'].mean().sort_values()

Cabin
T    0.000000
n    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [11]:
ordinal_labels=df_target_encode.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'n', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [15]:
ordinal_labels2={k:i for i,k in enumerate(ordinal_labels,0)} #enumerate is itrational function
ordinal_labels2
# this is used to map all the labels to ranks based on frequency

{'T': 0, 'n': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [16]:
df_target_encode['Cabin_ordinal']=df_target_encode['Cabin'].map(ordinal_labels2)
df_target_encode

Unnamed: 0,Survived,Cabin,Cabin_ordinal
0,0,n,1
1,1,C,4
2,1,n,1
3,1,C,4
4,0,n,1
...,...,...,...
886,0,n,1
887,1,B,6
888,0,n,1
889,1,C,4


### Mean Encoding

It is similar to the target guided encoding only difference is we replace the values with mean() value than rank

In [17]:
df_mean_encode=pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
df_mean_encode.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [18]:
df_mean_encode['Cabin']=df_mean_encode['Cabin'].fillna('Missing')
df_mean_encode.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [23]:
df_mean_encode['Cabin'].astype(str).str[0]

0      M
1      C
2      M
3      C
4      M
      ..
886    M
887    B
888    M
889    C
890    M
Name: Cabin, Length: 891, dtype: object

In [25]:
df_mean_encode['Cabin']=df_mean_encode['Cabin'].astype(str).str[0]
df_mean_encode.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [28]:
df_mean_encode.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [32]:
mean_values=df_mean_encode.groupby(['Cabin'])['Survived'].mean()

In [34]:
mean_dict=mean_values.to_dict()
mean_dict

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [35]:
df_mean_encode['Mean_ordinal']=df_mean_encode['Cabin'].map(mean_dict)
df_mean_encode

Unnamed: 0,Survived,Cabin,Mean_ordinal
0,0,M,0.299854
1,1,C,0.593220
2,1,M,0.299854
3,1,C,0.593220
4,0,M,0.299854
...,...,...,...
886,0,M,0.299854
887,1,B,0.744681
888,0,M,0.299854
889,1,C,0.593220


### Probability Ratio Encoding

1. find probability of survived based on cabin
2. probability of not survived (1-Survived)
3. prob(survived)/prob(not survived)
4. convert to dictionary
5. map the dictionary to df of cabin

In [49]:
df_probab_encode=pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
df_probab_encode['Cabin']=df_probab_encode['Cabin'].fillna('Missing')
df_probab_encode['Cabin']=df_probab_encode['Cabin'].astype(str).str[0]
df_probab_encode.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [50]:
prob_values=df_mean_encode.groupby(['Cabin'])['Survived'].mean()

In [51]:
prob_df=pd.DataFrame(prob_values)

In [52]:
prob_df

# here we can observe the the survived column is updated to the % of survived based on the cabin values

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [53]:
prob_df["Died"]=1-prob_df['Survived']
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


Formula for probability Ratio is (% of survived)/(% of died)

In [54]:
prob_df['prob_ratio']=prob_df['Survived']/prob_df['Died']
prob_df

Unnamed: 0_level_0,Survived,Died,prob_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [55]:
encode_prob=prob_df['prob_ratio'].to_dict()
encode_prob

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [58]:
df_probab_encode['Cabin_encoded']=df_probab_encode['Cabin'].map(encode_prob)

In [59]:
df_probab_encode

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
...,...,...,...
886,0,M,0.428274
887,1,B,2.916667
888,0,M,0.428274
889,1,C,1.458333
