In [1]:
# Loading the dataset 

import seaborn as sns
import pandas as pd

In [2]:
titanic = sns.load_dataset('titanic')

In [3]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [6]:
#Let's peek at some categorical features in our data.
titanic['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [7]:
titanic['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [8]:
titanic['class'].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

In [9]:
titanic['who'].value_counts()

man      537
woman    271
child     83
Name: who, dtype: int64

In [10]:
titanic['adult_male'].value_counts()

True     537
False    354
Name: adult_male, dtype: int64

In [11]:
titanic['embark_town'].value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [12]:
titanic['alone'].value_counts()

True     537
False    354
Name: alone, dtype: int64

In [13]:
titanic['deck'].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64

In [14]:
# Mapping Method
map_dict = {
    'First':0,
    'Second': 1,
    'Third': 2
}

In [15]:
titanic['class'] = titanic['class'].map(map_dict)

In [16]:
titanic.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,2,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,0,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,2,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,0,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,2,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,2,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,0,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,2,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,2,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,1,child,False,,Cherbourg,yes,False


In [17]:
titanic['class'].value_counts()
#As you can see, the class feature is encoded. Everywhere the class was First, it was replaced with 0. Samething happened to other classes.


2    491
0    216
1    184
Name: class, dtype: int64

In [18]:
#Ordinary Encoding
#This will also convert categorical data into numbers. Let's implement it
from sklearn.preprocessing import OrdinalEncoder

cats_feats = titanic[['alive', 'alone']]

encoder = OrdinalEncoder()

cats_encoded = encoder.fit_transform(cats_feats)

In [19]:
#The output of the encoder is a NumPy array. We can convert it back to the pandas dataframe.
titanic[['alive', 'alone']] = pd.DataFrame(cats_encoded, columns=cats_feats.columns, index=cats_feats.index)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,2,man,True,,Southampton,0.0,0.0
1,1,1,female,38.0,1,0,71.2833,C,0,woman,False,C,Cherbourg,1.0,0.0
2,1,3,female,26.0,0,0,7.925,S,2,woman,False,,Southampton,1.0,1.0
3,1,1,female,35.0,1,0,53.1,S,0,woman,False,C,Southampton,1.0,0.0
4,0,3,male,35.0,0,0,8.05,S,2,man,True,,Southampton,0.0,1.0


In [20]:
encoder.categories_

[array(['no', 'yes'], dtype=object), array([False,  True])]

In [22]:
#Label Encoding
#drop all missing values.
titanic = sns.load_dataset('titanic')

titanic_cleaned = titanic.dropna()

In [23]:
titanic_cleaned.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [24]:
from sklearn.preprocessing import LabelEncoder

deck_feat = titanic_cleaned[['deck']]

label_encoder = LabelEncoder()

deck_encoded = label_encoder.fit_transform(deck_feat)

  y = column_or_1d(y, warn=True)


In [25]:
#Same as ordinary encoder, the output of Label Encoder is a NumPy array.
titanic_cleaned['deck'] = pd.DataFrame(deck_encoded, columns=deck_feat.columns, index=deck_feat.index)

titanic_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_cleaned['deck'] = pd.DataFrame(deck_encoded, columns=deck_feat.columns, index=deck_feat.index)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,2,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,2,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,4,Southampton,no,True
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,6,Southampton,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,2,Southampton,yes,True


In [26]:
label_encoder.classes_

array(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype=object)

In [27]:
titanic_cleaned['deck'].value_counts()

2    51
1    43
3    31
4    30
0    12
5    11
6     4
Name: deck, dtype: int64

In [28]:
#Pandas Dummies
dummies = pd.get_dummies(titanic['who'], drop_first=True)

In [29]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [30]:
titanic = pd.concat([titanic.drop('who',axis=1),dummies],axis=1)

In [31]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone,man,woman
0,0,3,male,22.0,1,0,7.25,S,Third,True,,Southampton,no,False,1,0
1,1,1,female,38.0,1,0,71.2833,C,First,False,C,Cherbourg,yes,False,0,1
2,1,3,female,26.0,0,0,7.925,S,Third,False,,Southampton,yes,True,0,1
3,1,1,female,35.0,1,0,53.1,S,First,False,C,Southampton,yes,False,0,1
4,0,3,male,35.0,0,0,8.05,S,Third,True,,Southampton,no,True,1,0


In [32]:
# Or you can do it at once with this code

#titanic[['man', 'woman']] = pd.get_dummies(titanic['who'], drop_first=True)

In [33]:
#One Hot Encoding
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()

town_encoded = one_hot.fit_transform(titanic_cleaned[['embark_town']])

In [34]:
one_hot.categories_

[array(['Cherbourg', 'Queenstown', 'Southampton'], dtype=object)]

In [35]:
town_encoded

<182x3 sparse matrix of type '<class 'numpy.float64'>'
	with 182 stored elements in Compressed Sparse Row format>

In [36]:
#The output of One hot encoder is a sparse matrix. We will need to convert it into NumPy array.
town_encoded = town_encoded.toarray()

In [37]:
columns = list(one_hot.categories_)

town_df = pd.DataFrame(town_encoded, columns =columns)

town_df.head()

Unnamed: 0,Cherbourg,Queenstown,Southampton
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [38]:
len(town_df)

182

In [39]:
len(titanic_cleaned)

182

In [40]:
drop_embark = titanic_cleaned.drop('embark_town',axis=1)

drop_embark[['Cherbourg', 'Queenstown', 'Southampton']] = town_df

In [41]:
drop_embark.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alive,alone,Cherbourg,Queenstown,Southampton
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,2,yes,False,0.0,0.0,1.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,2,yes,False,0.0,0.0,1.0
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,4,no,True,0.0,0.0,1.0
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,6,yes,False,0.0,0.0,1.0
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,2,yes,True,0.0,0.0,1.0
