`Import Required Libraries`

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

`Check Library Versions`

In [None]:
# Print the version of each imported library so the run can be reproduced.
library_versions = {
    "Pandas version: ": pd.__version__,
    "Numpy version: ": np.__version__,
    "Seaborn version: ": sns.__version__,
}
for label, version in library_versions.items():
    print(label, version)

Pandas version:  2.2.2
Numpy version:  2.0.2
Seaborn version:  0.13.2


`Load titanic dataset`

In [None]:
# Fetch the 'titanic' example dataset through seaborn into the working DataFrame.
df = sns.load_dataset(name='titanic')

`First 10 rows`

In [None]:
# Preview the first ten rows as plain text.
first_ten_rows = df.head(n=10)
print(first_ten_rows)

   survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0         0       3    male  22.0      1      0   7.2500        S   Third   
1         1       1  female  38.0      1      0  71.2833        C   First   
2         1       3  female  26.0      0      0   7.9250        S   Third   
3         1       1  female  35.0      1      0  53.1000        S   First   
4         0       3    male  35.0      0      0   8.0500        S   Third   
5         0       3    male   NaN      0      0   8.4583        Q   Third   
6         0       1    male  54.0      0      0  51.8625        S   First   
7         0       3    male   2.0      3      1  21.0750        S   Third   
8         1       3  female  27.0      0      2  11.1333        S   Third   
9         1       2  female  14.0      1      0  30.0708        C  Second   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes

`Shape of dataset`

In [None]:
# df.shape is a (row_count, column_count) tuple.
dataset_shape = df.shape
print("Rows, Columns:", dataset_shape)

Rows, Columns: (891, 15)


`Column names`

In [None]:
# List every column label in the dataset.
column_labels = df.columns
print(column_labels)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


`Data info`

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


`Summary Statistics`

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = df.describe()
print(summary_stats)

         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


`Gender Distribution`

In [None]:
# Count passengers by the value of the 'sex' column.
sex_counts = df['sex'].value_counts()
print(sex_counts)

sex
male      577
female    314
Name: count, dtype: int64


`Missing Values`

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


`Fill Missing Age with Mean`

In [None]:
# fillna() returns a new Series, so the result has to be assigned back;
# the original call discarded it and left the NaN ages in df untouched.
mean_age = df['age'].mean()
df['age'] = df['age'].fillna(mean_age)

# Display the imputed column as the cell output.
df['age']

Unnamed: 0,age
0,22.000000
1,38.000000
2,26.000000
3,35.000000
4,35.000000
...,...
886,27.000000
887,19.000000
888,29.699118
889,26.000000


`Replace Missing Deck Values`

In [None]:
# Fill missing deck values with the most frequent deck. If the column has no
# non-null values at all, fall back to the placeholder label 'Unknown'.
deck_mode = df['deck'].mode()  # computed once instead of twice
if not deck_mode.empty:
    df['deck'] = df['deck'].fillna(deck_mode[0])
else:
    deck = df['deck']
    # A categorical column rejects fill values that are not among its
    # categories, so register 'Unknown' before filling with it.
    if isinstance(deck.dtype, pd.CategoricalDtype) and 'Unknown' not in deck.cat.categories:
        deck = deck.cat.add_categories('Unknown')
    df['deck'] = deck.fillna('Unknown')

`Drop Rows with Missing Embarked`

In [None]:
# Remove, in place, every row whose 'embarked' value is missing.
required_columns = ['embarked']
df.dropna(subset=required_columns, inplace=True)

`Remove Duplicates`

In [None]:
# Report how many rows repeat an earlier row exactly, then drop them in place.
# NOTE(review): the dataset has no unique passenger identifier column, so
# identical rows may be distinct passengers — confirm dropping them is intended.
duplicate_count = df.duplicated().sum()
print('Duplicates:', duplicate_count)
df.drop_duplicates(inplace=True)

Duplicates: 108


`Overall Survival Rate`

In [None]:
# 'survived' holds 0/1 values, so its mean is the fraction of passengers who survived.
overall_survival_rate = df['survived'].mean()
print('Survival Rate:', overall_survival_rate)

Survival Rate: 0.41101152368758004


`Survival by Gender`

In [None]:
# Mean of 'survived' within each value of 'sex'.
survival_by_sex = df.groupby('sex')['survived'].mean()
print(survival_by_sex)

sex
female    0.738832
male      0.216327
Name: survived, dtype: float64


`Survival by Passenger Class`

In [None]:
# 'class' is a categorical column; pass `observed` explicitly so pandas does
# not emit a FutureWarning about its changing default. observed=False keeps
# the current behaviour of listing every category.
survival_by_class = df.groupby('class', observed=False)['survived'].mean()
print(survival_by_class)

class
First     0.630332
Second    0.509091
Third     0.256790
Name: survived, dtype: float64


  print(df.groupby('class')['survived'].mean())


`Create Child Column`

In [None]:
# Flag passengers younger than 18 with 1 and everyone else with 0, using a
# vectorized comparison instead of a per-row Python lambda.
# A missing age compares as False, so it is labelled 0 (same result as before).
df['child'] = df['age'].lt(18).astype('int64')

`Survival of Children vs Adults`

In [None]:
# Mean of 'survived' for each value of the 0/1 'child' flag.
survival_by_child_flag = df.groupby('child')['survived'].mean()
print(survival_by_child_flag)

child
0    0.388972
1    0.545455
Name: survived, dtype: float64


`Save Cleaned Dataset`

In [None]:
# Write the cleaned frame to CSV without the row index, then confirm on stdout.
cleaned_csv_path = 'titanic_clean.csv'
df.to_csv(cleaned_csv_path, index=False)
print('Cleaned dataset saved as titanic_clean.csv')

Cleaned dataset saved as titanic_clean.csv


`Reload Cleaned Dataset`

In [None]:
# Read the saved CSV back into a separate DataFrame and show its first five rows.
new_df = pd.read_csv('titanic_clean.csv')
new_df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,child
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,C,Southampton,no,False,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True,0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,C,Southampton,no,True,0


# `average age of survivors vs. non-survivors`

In [None]:
# Mean age per value of 'survived'; the result is indexed by the labels 0 and 1.
age_by_outcome = new_df.groupby('survived')['age']
average = age_by_outcome.mean()
print("Age of Survivors: ", average.loc[1])
print("Age of Non survivors: ", average.loc[0])

Age of Survivors:  28.212482269503546
Age of Non survivors:  30.874045801526716


# `Which embarkation port had the highest survival rate?`

In [None]:
# Survival rate per embarkation town, plus the town whose rate is highest.
SR_by_port = new_df.groupby('embark_town')['survived'].mean()
best_port = SR_by_port.idxmax()
print(SR_by_port)
print("embarkation port with the highest survival:")
print(best_port)

embark_town
Cherbourg      0.580645
Queenstown     0.338983
Southampton    0.372134
Name: survived, dtype: float64
embarkation port with the highest survival:
Cherbourg


# `Which gender had the highest survival rate within 3rd class passengers?`

In [None]:
# Keep only the rows where pclass equals 3.
is_third_class = new_df['pclass'] == 3
third_class_df = new_df[is_third_class]

# Survival rate per sex within that subset, and the sex with the highest rate.
SR_by_gender = third_class_df.groupby('sex')['survived'].mean()
top_gender = SR_by_gender.idxmax()
print("Survival Rate by Gender in 3rd Class:\n",SR_by_gender)
print("\nHighest survival rate in 3rd class: ",top_gender)


Survival Rate by Gender in 3rd Class:
 sex
female    0.472441
male      0.158273
Name: survived, dtype: float64

Highest survival rate in 3rd class:  female


# `Which passenger class had the highest percentage of children?`

In [None]:
# Boolean flag: True where the recorded age is below 18.
new_df["is_child"] = new_df["age"].lt(18)

# The mean of a boolean column is the fraction of True values; scale it to a percentage.
child_share = new_df.groupby("class")["is_child"].mean()
highest_precentage_of_child = child_share * 100
top_class = highest_precentage_of_child.idxmax()
print(highest_precentage_of_child)
print(f"\nClass with the highest percentage of children: ")
print(top_class)

class
First      5.687204
Second    13.939394
Third     18.518519
Name: is_child, dtype: float64

Class with the highest percentage of children: 
Third


# `Survival by family size`

In [None]:
# Family size is the sibsp count plus the parch count plus one.
new_df['family_size'] = new_df['sibsp'].add(new_df['parch']).add(1)
survival_by_family_size = new_df.groupby('family_size')['survived'].mean()
print(survival_by_family_size)

family_size
1     0.334086
2     0.551948
3     0.574257
4     0.714286
5     0.230769
6     0.136364
7     0.333333
8     0.000000
11    0.000000
Name: survived, dtype: float64
