# One Hot Encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
data = pd.read_csv('../Datasets/titanic.csv')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
new = pd.get_dummies(data=data)

In [5]:
new.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third,who_child,who_man,who_woman,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
0,0,3,22.0,1,0,7.25,True,False,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1,1,1,38.0,1,0,71.2833,False,False,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1
2,1,3,26.0,0,0,7.925,False,True,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
3,1,1,35.0,1,0,53.1,False,False,1,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
4,0,3,35.0,0,0,8.05,True,True,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0


In [6]:
data.shape

(891, 15)

In [7]:
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder

In [8]:
encode_col = ['sex', 'embarked', 'class', 'who', 'deck', 'embark_town', 'alive', 'adult_male', 'alone']

In [9]:
# Perform one-hot encoding on the specified column
onehot_encoded = onehot_encoder.fit_transform(data[encode_col])

In [10]:
onehot_encoded.shape

(891, 30)

In [11]:
onehot_encoded

array([[0., 1., 0., ..., 1., 1., 0.],
       [1., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 1., 0., 1.],
       [0., 1., 0., ..., 1., 0., 1.]])

In [12]:
# Getting the column names for newly created columns

# Get the categories for each feature
categories = onehot_encoder.categories_

# Create a list of column names for the one-hot encoded features
column_names = []
for i, category in enumerate(categories):
    column_names.extend([f'{onehot_encoder.get_feature_names_out()[i]}_{value}' for value in category])

In [13]:
# Create a DataFrame from the one-hot encoded data
onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=column_names)
onehot_encoded_df

Unnamed: 0,sex_female_female,sex_female_male,sex_male_C,sex_male_Q,sex_male_S,sex_male_nan,embarked_C_First,embarked_C_Second,embarked_C_Third,embarked_Q_child,embarked_Q_man,embarked_Q_woman,embarked_S_A,embarked_S_B,embarked_S_C,embarked_S_D,embarked_S_E,embarked_S_F,embarked_S_G,embarked_S_nan,embarked_nan_Cherbourg,embarked_nan_Queenstown,embarked_nan_Southampton,embarked_nan_nan,class_First_no,class_First_yes,class_Second_0.0,class_Second_1.0,class_Third_0.0,class_Third_1.0
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
6,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
7,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
9,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [14]:
onehot_encoded_df.shape

(891, 30)

In [15]:
# Creating a copy of original df
data_copy = data.copy()

# Dropping those columns which will be encoded
data_copy.drop(encode_col, axis=1, inplace=True)

In [16]:
data_copy = pd.concat([data_copy, onehot_encoded_df], axis=1)

In [17]:
data_copy.shape

(891, 36)

In [18]:
data_copy.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female_female,sex_female_male,sex_male_C,sex_male_Q,sex_male_S,sex_male_nan,embarked_C_First,embarked_C_Second,embarked_C_Third,embarked_Q_child,embarked_Q_man,embarked_Q_woman,embarked_S_A,embarked_S_B,embarked_S_C,embarked_S_D,embarked_S_E,embarked_S_F,embarked_S_G,embarked_S_nan,embarked_nan_Cherbourg,embarked_nan_Queenstown,embarked_nan_Southampton,embarked_nan_nan,class_First_no,class_First_yes,class_Second_0.0,class_Second_1.0,class_Third_0.0,class_Third_1.0
0,0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
