In [1]:
import numpy as np
import pandas as pd

In [2]:
# Source CSV: the Titanic passenger list hosted in the datasciencedojo/datasets GitHub repo
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
# Read the remote CSV into a DataFrame (needs network access)
df = pd.read_csv(TITANIC_CSV_URL)

In [3]:
# Preview the first rows to see which columns exist and what the raw values look like
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data Cleaning on the Titanic Dataset

In [4]:
# Check for missing data (NaN = "Not a Number", i.e. a blank cell)
missing_per_column = df.isnull().sum()  # number of missing values in each column
print(missing_per_column)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [5]:
# Impute missing ages with the average (mean) of the known ages
mean_age = df['Age'].mean()  # mean() skips NaN entries by default
df['Age'] = df['Age'].fillna(mean_age)

In [6]:
# Impute missing embarkation ports with the most common port
most_common_port = df['Embarked'].mode()[0]  # mode() returns the most frequent value(s); take the first
df['Embarked'] = df['Embarked'].fillna(most_common_port)

In [7]:
# Re-check for missing data after imputation; Cabin is deliberately left as-is
remaining_missing = df.isnull().sum()  # number of missing values still present in each column
print(remaining_missing)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


Encoding - Turning Text into Numbers

In [8]:
# Convert the 'Sex' column (male / female) into numbers using label encoding:
# male -> 0, female -> 1.
# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# silently wipe the whole column. Only encode while the column still holds
# the raw text labels, which makes the cell safe to re-run.
sex_codes = {'male': 0, 'female': 1}
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map(sex_codes)

In [9]:
# Preview again: 'Sex' should now show 0/1 instead of male/female
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [10]:
# Convert 'Embarked' (C, Q, S) into numbers: C -> 0, Q -> 1, S -> 2.
# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 0/1/2) would
# silently wipe the whole column. Only encode while the column still holds
# the raw port letters, which makes the cell safe to re-run.
# NOTE(review): integer codes imply an order (C < Q < S) that the ports do not
# really have; consider one-hot encoding if the downstream model is sensitive
# to that.
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
if not pd.api.types.is_numeric_dtype(df['Embarked']):
    df['Embarked'] = df['Embarked'].map(embarked_codes)

In [11]:
# Preview again: 'Embarked' should now show the numeric codes 0/1/2 instead of C/Q/S
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,2


Feature Scaling - Matching ranges
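Real-world features often live on very different scales: Fare can be in the hundreds, while Age is typically in the 20s, 30s, or 40s. We need to bring them onto the same scale so that features with larger numeric ranges do not dominate those with smaller ones.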

In [15]:
from sklearn.preprocessing import StandardScaler
# Create a scaler object.
# StandardScaler standardizes each feature to zero mean and unit variance
# (z = (x - mean) / std). The result is NOT bounded to the range -1 to 1;
# values far from the mean can be well outside it.
scaler = StandardScaler()

In [16]:
#Select features to scale
columns_to_scale=['Age', 'Fare']

In [19]:
#Apply scaler
df[columns_to_scale]= scaler.fit_transform(df[columns_to_scale])

In [20]:
# Preview again: 'Age' and 'Fare' should now hold standardized values centered around 0
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,-0.592481,1,0,A/5 21171,-0.502445,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0.638789,1,0,PC 17599,0.786845,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",1,-0.284663,0,0,STON/O2. 3101282,-0.488854,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0.407926,1,0,113803,0.42073,C123,2
4,5,0,3,"Allen, Mr. William Henry",0,0.407926,0,0,373450,-0.486337,,2
