In [2]:
# Load the Titanic passenger data (CSV downloaded from Kaggle).
import warnings

import pandas as pd

# Silence every warning for the rest of the session; note that this also hides
# library warnings raised by the cells that follow.
warnings.filterwarnings('ignore')

# The CSV is read from the notebook's working directory.
df = pd.read_csv("Titanic-Dataset.csv")

In [3]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

(891, 12)

In [4]:
# Preview the first five rows of the dataset (head() returns 5 rows by default)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Data dictionary for the dataset, one field per line as tab-separated
# "name / description / value key". The string is a bare expression, so the
# notebook echoes it back as this cell's output.
"""
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	no. of siblings / spouses aboard the Titanic	
parch	no. of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
"""

'\nsurvival\tSurvival\t0 = No, 1 = Yes\npclass\tTicket class\t1 = 1st, 2 = 2nd, 3 = 3rd\nsex\tSex\t\nAge\tAge in years\t\nsibsp\tno. of siblings / spouses aboard the Titanic\t\nparch\tno. of parents / children aboard the Titanic\t\nticket\tTicket number\t\nfare\tPassenger fare\t\ncabin\tCabin number\t\nembarked\tPort of Embarkation\tC = Cherbourg, Q = Queenstown, S = Southampton\n'

In [6]:
# List each column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# Count the missing values in each column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

np.int64(0)

In [10]:
# Fill missing values in the text (object) columns with each column's most
# frequent value, using scikit-learn's SimpleImputer.
from sklearn.impute import SimpleImputer

# NOTE: Cabin is missing in 687 of the 891 rows (see the null counts above), so
# most of its values after this step are the imputed mode.
text_cols = ['Cabin', 'Embarked']
sim_obj = SimpleImputer(strategy='most_frequent')
df[text_cols] = sim_obj.fit_transform(df[text_cols])

In [11]:
# Fill the missing values of the numeric Age column with the column mean,
# using scikit-learn's SimpleImputer.
age_cols = ['Age']
sim_int = SimpleImputer(strategy='mean')
df[age_cols] = sim_int.fit_transform(df[age_cols])
# Store Age as whole years. The int64 cast drops the fractional part, so the
# imputed mean (about 29.7) is stored as 29 and ages below one year become 0.
df['Age'] = df['Age'].astype('int64')

In [12]:
# Re-count the missing values per column after imputation
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [13]:
# Persist the cleaned frame to disk. The row index is written as an extra
# unnamed first column, which appears as 'Unnamed: 0' when the file is read back.
df.to_csv('new_df.csv', index=True)

In [14]:
# Reload the cleaned data that was saved to disk.
# The saved file carries the old row index as an 'Unnamed: 0' column. Drop it
# in a chained call (no inplace mutation) and tolerate a file written without
# that column (errors='ignore') instead of failing with a KeyError.
df = pd.read_csv('new_df.csv').drop(columns=['Unnamed: 0'], errors='ignore')
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,B96 B98,S


In [15]:
# Encode the Name column as integer labels using LabelEncoder
from sklearn.preprocessing import LabelEncoder

le_name = LabelEncoder()
# LabelEncoder works on a 1-D array of labels, so pass the Series df['Name']
# rather than the one-column DataFrame df[['Name']], which scikit-learn only
# accepts after flattening it and emitting a DataConversionWarning.
df['Name'] = le_name.fit_transform(df['Name'])

In [16]:
# Encode the Cabin column as integer labels using LabelEncoder
le_cabin = LabelEncoder()
# LabelEncoder works on a 1-D array of labels, so pass the Series df['Cabin']
# rather than the one-column DataFrame df[['Cabin']], which scikit-learn only
# accepts after flattening it and emitting a DataConversionWarning.
df['Cabin'] = le_cabin.fit_transform(df['Cabin'])

In [17]:
# Encode the Ticket column as integer labels using LabelEncoder
le_tckt = LabelEncoder()
# LabelEncoder works on a 1-D array of labels, so pass the Series df['Ticket']
# rather than the one-column DataFrame df[['Ticket']], which scikit-learn only
# accepts after flattening it and emitting a DataConversionWarning.
df['Ticket'] = le_tckt.fit_transform(df['Ticket'])

In [18]:
# Encode the Sex column as integers via an explicit mapping: male -> 0, female -> 1
o_sex = {'male': 0, 'female': 1}
# Map and cast in a single expression so the column is only overwritten when
# every value was mapped. A value missing from o_sex (or re-running this cell on
# already-encoded data) produces NaN, and the int64 cast then raises before
# df['Sex'] is replaced, instead of leaving the column filled with NaN.
df['Sex'] = df['Sex'].map(o_sex).astype('int64')

In [19]:
# Encode the Embarked column as integers via an explicit mapping: C -> 0, Q -> 1, S -> 2
o_embark = {'C': 0, 'Q': 1, 'S': 2}
# Map and cast in a single expression so the column is only overwritten when
# every value was mapped. A value missing from o_embark (or re-running this cell
# on already-encoded data) produces NaN, and the int64 cast then raises before
# df['Embarked'] is replaced, instead of leaving the column filled with NaN.
df['Embarked'] = df['Embarked'].map(o_embark).astype('int64')

In [20]:
# Re-inspect the column dtypes and non-null counts after encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    int64  
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    int64  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    int64  
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    int64  
 11  Embarked     891 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 83.7 KB


In [21]:
# After encoding, all categorical values have been converted into numerical values.
# Print a banner marking the end of the cleaning stage, then preview the first ten rows.
print('<--------------------------------------------------Data Cleaning Done--------------------------------------------------->')
df.head(10)

<--------------------------------------------------Data Cleaning Done--------------------------------------------------->


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,0,22,1,0,523,7.25,47,2
1,2,1,1,190,1,38,1,0,596,71.2833,81,0
2,3,1,3,353,1,26,0,0,669,7.925,47,2
3,4,1,1,272,1,35,1,0,49,53.1,55,2
4,5,0,3,15,0,35,0,0,472,8.05,47,2
5,6,0,3,554,0,29,0,0,275,8.4583,47,1
6,7,0,1,515,0,54,0,0,85,51.8625,129,2
7,8,0,3,624,0,2,3,1,395,21.075,47,2
8,9,1,3,412,1,27,0,2,344,11.1333,47,2
9,10,1,2,576,1,14,1,0,132,30.0708,47,0
