In [2]:
# Step 1: import pandas
import pandas as pd

In [6]:
# Read the raw Titanic CSV into a DataFrame
titanic_csv = "Datasets/Titanic-Dataset.csv"
dataset = pd.read_csv(titanic_csv)

In [8]:
# Preview the first five rows (head() returns 5 rows by default)
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Count missing values per column
missing_per_column = dataset.isnull().sum()
missing_per_column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
# Handle missing 'Age' values: fill them with the mean of the known ages
mean_age = dataset['Age'].mean()
dataset['Age'] = dataset['Age'].fillna(mean_age)

In [28]:
# Drop 'Cabin': 687 of its 891 values are missing, too many to impute sensibly.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError because
# the column is already gone.
dataset = dataset.drop(columns=['Cabin'], errors='ignore')

In [30]:
# Handle missing 'Embarked' values: fill them with the most frequent value
most_common_embarked = dataset['Embarked'].mode()[0]
dataset['Embarked'] = dataset['Embarked'].fillna(most_common_embarked)

In [32]:
# Re-count missing values per column after imputation and the column drop
nulls_after_cleaning = dataset.isnull().sum()
nulls_after_cleaning

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [34]:
# Overview of the frame: index range, per-column non-null counts, dtypes and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [36]:
# Fix data types: 'Pclass' and 'Survived' hold a small set of codes,
# so store them as categoricals rather than plain integers
for column in ('Pclass', 'Survived'):
    dataset[column] = dataset[column].astype('category')

In [38]:
# Normalise column names to snake_case: trim stray whitespace, lowercase, and
# join words with underscores. Underscores (not hyphens) keep every name a
# valid Python identifier, so columns work with attribute access and
# DataFrame.query without back-tick quoting.
dataset.columns = dataset.columns.str.strip().str.lower().str.replace(' ', '_')

In [40]:
# Remove exact duplicate rows. Rebinding (instead of inplace=True) keeps the
# step chainable, and ignore_index=True renumbers the surviving rows 0..n-1 so
# that removed rows do not leave gaps in the index.
dataset = dataset.drop_duplicates(ignore_index=True)

In [42]:
# Final validation: re-print the schema to confirm column names, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   passengerid  891 non-null    int64   
 1   survived     891 non-null    category
 2   pclass       891 non-null    category
 3   name         891 non-null    object  
 4   sex          891 non-null    object  
 5   age          891 non-null    float64 
 6   sibsp        891 non-null    int64   
 7   parch        891 non-null    int64   
 8   ticket       891 non-null    object  
 9   fare         891 non-null    float64 
 10  embarked     891 non-null    object  
dtypes: category(2), float64(2), int64(3), object(4)
memory usage: 64.8+ KB


In [44]:
# Confirm that no column has missing values left
remaining_nulls = dataset.isnull().sum()
remaining_nulls

passengerid    0
survived       0
pclass         0
name           0
sex            0
age            0
sibsp          0
parch          0
ticket         0
fare           0
embarked       0
dtype: int64

In [46]:
# Preview the first five rows of the cleaned frame
dataset.head(n=5)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [48]:
# Mini challenges (task list for the cells that follow).
# The text lives in a bare string literal; as the cell's only expression,
# the notebook echoes its repr as the cell output.
# Per the bands listed, age 60 counts as Adult and Senior starts above 60.
'''
Create a new column age_group:

Child (<18)

Adult (18–60)

Senior (>60)

Check survival rate by sex.

Save the cleaned dataset:
'''

'\nCreate a new column age_group:\n\nChild (<18)\n\nAdult (18–60)\n\nSenior (>60)\n\nCheck survival rate by sex.\n\nSave the cleaned dataset:\n'

In [58]:
# creating function for this 
def age_group(age):
    """Bucket an age in years into a coarse life-stage label.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str
        'Child' for ages below 18, 'Adult' for ages 18 through 60
        inclusive, and 'Senior' for ages above 60.
    """
    if age < 18:
        return 'Child'
    # 60 itself belongs to the Adult band (18-60); only ages strictly
    # above 60 are Senior, hence <= rather than <.
    if age <= 60:
        return 'Adult'
    # Everything that fails both comparisons ends up here; note that this
    # includes NaN, because every comparison with NaN is False.
    return 'Senior'

# Label every passenger by running age_group over the 'age' column
age_labels = dataset['age'].apply(age_group)
dataset['age_group'] = age_labels

In [66]:
# Spot-check the new column: rows labelled 20 through 25 (inclusive), ages next to their labels
dataset.loc[20:25, ['age', 'age_group']]

Unnamed: 0,age,age_group
20,35.0,Adult
21,34.0,Adult
22,15.0,Child
23,28.0,Adult
24,8.0,Child
25,38.0,Adult


In [74]:
# Survival rate by sex: within each 'sex' group, the share of each 'survived' value
survived_by_sex = dataset.groupby('sex')['survived']
survived_by_sex.value_counts(normalize=True)

sex     survived
female  1           0.742038
        0           0.257962
male    0           0.811092
        1           0.188908
Name: proportion, dtype: float64

In [78]:
# Save the cleaned dataset; index=False leaves the row index out of the file
cleaned_csv = 'Datasets/Cleaned_dataset_titanic.csv'
dataset.to_csv(cleaned_csv, index=False)