# 🚀 Data Analysis Portfolio Project: Titanic Survival Analysis

## 📌 Overview
This project analyzes the famous Titanic dataset to uncover patterns in passenger survival. We'll explore demographic factors, socio-economic status, and travel arrangements that influenced survival rates.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings

filterwarnings('ignore')
sns.set_style('whitegrid')
%matplotlib inline

# Load dataset
df = sns.load_dataset('titanic')

In [None]:
# Display first 5 rows
df.head()

In [None]:
# Dataset information
df.info()

In [None]:
# Basic statistics
df.describe(include='all').T

In [None]:
# Check missing values
missing = df.isna().sum().sort_values(ascending=False)
missing_percent = (missing/len(df))*100
pd.concat([missing, missing_percent], axis=1, keys=['Missing Values', '% Missing'])

In [None]:
# Handle missing values
# Drop deck column (too many missing)
df.drop('deck', axis=1, inplace=True)

# Fill age with median
df['age'] = df['age'].fillna(df['age'].median())

# Drop remaining rows with missing values (embarked: 2 rows)
df.dropna(inplace=True)

# Confirm missing values handled
df.isna().sum().sum()

In [None]:
# Check for duplicates
print(f"Duplicate rows: {df.duplicated().sum()}")

In [None]:
# Survival distribution
plt.figure(figsize=(8,5))
sns.countplot(x='survived', data=df, palette='viridis')
plt.title('Survival Distribution (0 = Died, 1 = Survived)')
plt.show()

print(f"Survival Rate: {df['survived'].mean():.2%}")

In [None]:
# Gender distribution
plt.figure(figsize=(8,5))
sns.countplot(x='sex', data=df, palette='coolwarm')
plt.title('Passenger Gender Distribution')
plt.show()

In [None]:
# Age distribution
plt.figure(figsize=(10,6))
sns.histplot(df['age'], bins=30, kde=True, color='purple')
plt.title('Age Distribution of Passengers')
plt.xlabel('Age')
plt.show()

In [None]:
# Survival by gender
plt.figure(figsize=(8,5))
sns.countplot(x='sex', hue='survived', data=df, palette='pastel')
plt.title('Survival Count by Gender')
plt.show()

In [None]:
# Survival by passenger class
plt.figure(figsize=(8,5))
sns.barplot(x='pclass', y='survived', data=df, palette='Set2')
plt.title('Survival Rate by Passenger Class')
plt.ylabel('Survival Rate')
plt.xlabel('Passenger Class')
plt.show()

In [None]:
# Age distribution by survival
plt.figure(figsize=(10,6))
sns.boxplot(x='survived', y='age', data=df, palette='Set3')
plt.title('Age Distribution by Survival Status')
plt.xticks([0,1], ['Died', 'Survived'])
plt.show()

In [None]:
# Convert categorical variables
df_encoded = pd.get_dummies(df[['survived', 'pclass', 'sex', 'age', 'fare', 'alone']])

In [None]:
# Correlation matrix
plt.figure(figsize=(12,8))
sns.heatmap(df_encoded.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Survival rate by class and gender
pd.pivot_table(df, index='sex', columns='pclass', values='survived', aggfunc='mean').style.background_gradient(cmap='YlGnBu')

In [None]:
# Fare distribution by survival and class
plt.figure(figsize=(12,7))
sns.violinplot(x='pclass', y='fare', hue='survived', data=df, split=True, palette='magma')
plt.title('Fare Distribution by Class and Survival')
plt.yscale('log')
plt.legend(title='Survived', loc='upper right')
plt.show()

## Key Insights
- Overall Survival Rate: Only 38.39% of passengers survived
- Gender Impact: 74.20% of female passengers survived vs 18.89% of males
- Class Mattered: 1st class had 62.96% survival vs 24.24% in 3rd class
- Age Factor: Children under 10 had higher survival rates
- Fare Correlation: Higher fare payers had better survival chances (r=0.26)

## 📝 Conclusion
The analysis reveals significant disparities in survival rates based on:

- Gender ("Women and children first" policy evident)
- Passenger class (Wealthier passengers had priority)
- Age (Young children received priority)
- Fare amount (Higher fares correlated with survival)

This analysis demonstrates fundamental data science skills including data cleaning, exploratory analysis, visualization, and statistical interpretation.

In [None]:
# Save cleaned dataset
#df.to_csv('cleaned_titanic.csv', index=False)