In [1]:
!pip install pandas 
!pip install numpy
!pip install seaborn



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
pip install --upgrade certifi

Note: you may need to restart the kernel to use updated packages.


In [4]:
import ssl
import urllib.request

# NOTE(review): this swaps the factory for the default HTTPS context with
# `ssl._create_unverified_context`, i.e. server certificates are no longer
# verified. The override is process-wide, so it applies to every later HTTPS
# request in this kernel that relies on the default context, not just one download.
# Presumably a workaround for a local certificate-store problem — confirm, and
# prefer fixing the trust store over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context


## Titanic Dataset

In [5]:
# Load seaborn's example dataset named 'titanic' into `df`; the bare name on the
# last line makes the notebook render the frame as the cell output.
df = sns.load_dataset(name='titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


### Missing Values in the Titanic Dataset

In [6]:
# Number of missing entries in each column.
missing_values = df.isnull().sum()
# sep="\n" puts the label on its own line. Embedding "\n" in the label instead
# makes print() insert its default space separator after the newline, which
# indents the first row of the table by one character.
print("Missing Values:", missing_values, sep="\n")

Missing Values:
 survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


### Dropping Columns

In [7]:
# Columns removed before the rest of the analysis
# ('deck' is missing for 688 rows in the count printed above).
columns_to_drop = ['deck', 'who', 'adult_male']
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a second
# run would otherwise raise KeyError because the labels no longer exist.
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

### Removing Rows with Missing Fare or Age

In [8]:
# Drop only the rows where fare or age is missing; nulls in other columns are
# left in place, which is why the report below can still show non-zero counts.
df.dropna(subset=['fare', 'age'], inplace=True)
# Recount missing entries per column after the drop.
missing_values = df.isnull().sum()
# sep="\n" avoids the stray leading space that print() adds to the first table
# row when the newline is embedded in the label string.
print("Missing Values:", missing_values, sep="\n")

Missing Values:
 survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
embark_town    2
alive          0
alone          0
dtype: int64


### Duplicated Values

In [9]:
# Select every row that repeats an earlier row across all columns; the first
# occurrence of each repeated row is not flagged.
duplicate_rows = df.loc[df.duplicated(keep='first')]
duplicate_rows

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alive,alone
133,1,2,female,29.0,1,0,26.0,S,Second,Southampton,yes,False
173,0,3,male,21.0,0,0,7.925,S,Third,Southampton,no,True
213,0,2,male,30.0,0,0,13.0,S,Second,Southampton,no,True
313,0,3,male,28.0,0,0,7.8958,S,Third,Southampton,no,True
320,0,3,male,22.0,0,0,7.25,S,Third,Southampton,no,True
343,0,2,male,25.0,0,0,13.0,S,Second,Southampton,no,True
355,0,3,male,28.0,0,0,9.5,S,Third,Southampton,no,True
387,1,2,female,36.0,0,0,13.0,S,Second,Southampton,yes,True
418,0,2,male,30.0,0,0,13.0,S,Second,Southampton,no,True
476,0,2,male,34.0,1,0,21.0,S,Second,Southampton,no,False


### Dropping Duplicate Values

In [10]:
# Remove repeated rows in place, keeping the first occurrence of each.
# NOTE(review): no per-passenger identifier is visible among the remaining
# columns, so identical rows may be distinct passengers rather than true
# duplicates — confirm that dropping them is intended.
df.drop_duplicates(keep='first', inplace=True)
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,2,male,28.0,0,0,10.5000,S,Second,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,Cherbourg,yes,True


### Renaming Columns

In [11]:
# Replace the abbreviated sibsp / parch headers with descriptive labels (in place).
df.rename(
    columns={
        'sibsp': 'Siblings/Spouses Onboard',
        'parch': 'Parents/Children Onboard',
    },
    inplace=True,
)
df

Unnamed: 0,survived,pclass,sex,age,Siblings/Spouses Onboard,Parents/Children Onboard,fare,embarked,class,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,2,male,28.0,0,0,10.5000,S,Second,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,Cherbourg,yes,True


### Formatting Fare

In [12]:
# Overwrite fare with its value rounded to two decimal places. This changes the
# stored data (not just its display), so later statistics use the rounded fares.
df['fare'] = df['fare'].round(decimals=2)
df

Unnamed: 0,survived,pclass,sex,age,Siblings/Spouses Onboard,Parents/Children Onboard,fare,embarked,class,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,no,False
1,1,1,female,38.0,1,0,71.28,C,First,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.92,S,Third,Southampton,yes,True
3,1,1,female,35.0,1,0,53.10,S,First,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,2,male,28.0,0,0,10.50,S,Second,Southampton,no,True
885,0,3,female,39.0,0,5,29.12,Q,Third,Queenstown,no,False
887,1,1,female,19.0,0,0,30.00,S,First,Southampton,yes,True
889,1,1,male,26.0,0,0,30.00,C,First,Cherbourg,yes,True


### Descriptive Stats

In [13]:
# Summary statistics from describe(); the full-precision table is kept in
# `stats`, and a two-decimal copy is shown.
stats = df.describe()
stats_rounded = stats.round(decimals=2)
stats_rounded

Unnamed: 0,survived,pclass,age,Siblings/Spouses Onboard,Parents/Children Onboard,fare
count,676.0,676.0,676.0,676.0,676.0,676.0
mean,0.42,2.22,29.81,0.54,0.45,35.85
std,0.49,0.85,14.73,0.95,0.87,54.07
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,20.0,0.0,0.0,8.05
50%,0.0,2.0,28.0,0.0,0.0,16.1
75%,1.0,3.0,39.0,1.0,1.0,35.5
max,1.0,3.0,80.0,5.0,6.0,512.33


### Average Fare and Age per Class

In [16]:
# Mean fare and mean age for each value of 'class'.
# observed=False is passed explicitly; it only has an effect when the grouping
# column is categorical — presumably 'class' is, verify with df.dtypes.
average_stats = (
    df.groupby(by='class', observed=False)[['fare', 'age']]
    .mean()
)
average_stats

Unnamed: 0_level_0,fare,age
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,88.396793,38.18163
Second,22.168228,29.859684
Third,13.377665,25.18015


### Adding a Fare Percentage Column

In [17]:
# Each row's fare expressed as a percentage of the sum of all fares currently in df.
df['fare_percentage'] = df['fare'].div(df['fare'].sum()).mul(100)
df

Unnamed: 0,survived,pclass,sex,age,Siblings/Spouses Onboard,Parents/Children Onboard,fare,embarked,class,embark_town,alive,alone,fare_percentage
0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,no,False,0.029915
1,1,1,female,38.0,1,0,71.28,C,First,Cherbourg,yes,False,0.294111
2,1,3,female,26.0,0,0,7.92,S,Third,Southampton,yes,True,0.032679
3,1,1,female,35.0,1,0,53.10,S,First,Southampton,yes,False,0.219098
4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,no,True,0.033215
...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,2,male,28.0,0,0,10.50,S,Second,Southampton,no,True,0.043324
885,0,3,female,39.0,0,5,29.12,Q,Third,Queenstown,no,False,0.120153
887,1,1,female,19.0,0,0,30.00,S,First,Southampton,yes,True,0.123784
889,1,1,male,26.0,0,0,30.00,C,First,Cherbourg,yes,True,0.123784


### Survivors by Gender

In [18]:
# Sum the 'survived' column within each 'sex' group; with the 0/1 values shown
# in the tables above this is the number of survivors per group.
survivors_gender = df.groupby(by='sex')['survived'].sum()
survivors_gender

sex
female    191
male       92
Name: survived, dtype: int64