# EDA — Titanic Dataset

Exploratory Data Analysis: load data, inspect missing values, and visualize survival rates by `Sex`, `Pclass`, and `Age`. Also show a correlation heatmap for numeric features.

In [None]:
# Imports and settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to every plot drawn after this cell.
sns.set_theme(style="whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the raw CSV into `df`; the relative path is resolved against the
# kernel's current working directory.
csv_path = '../data/raw/Titanic-Dataset.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

# Show the first five rows as this cell's rich output
df.head(n=5)

In [None]:
# Dataset info and missing values
print('Shape:', df.shape)
print('\nInfo:')
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() would additionally render a stray "None".
df.info()

# Count missing values per column (largest first) and show only the columns
# that actually have missing entries.
missing = df.isnull().sum().sort_values(ascending=False)
missing[missing > 0]

In [None]:
# Survival rate by Sex
# Each bar is the barplot's aggregate (the mean by default) of `Survived`
# within one `Sex` group.
fig, ax = plt.subplots(figsize=(6, 4))
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None`; switch once the minimum seaborn version is confirmed.
sns.barplot(data=df, x='Sex', y='Survived', ci=None, ax=ax)
ax.set_ylabel('Survival Rate')
ax.set_title('Survival Rate by Sex')
ax.set_ylim(0, 1)  # fixed 0-1 range so the bars read directly as rates
plt.show()

In [None]:
# Survival rate by Pclass
# Sort the distinct non-null class values so the bars appear in ascending order.
pclass_order = sorted(df['Pclass'].dropna().unique())

fig, ax = plt.subplots(figsize=(6, 4))
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None`; switch once the minimum seaborn version is confirmed.
sns.barplot(data=df, x='Pclass', y='Survived', order=pclass_order, ci=None, ax=ax)
ax.set_ylabel('Survival Rate')
ax.set_title('Survival Rate by Passenger Class (Pclass)')
ax.set_ylim(0, 1)  # fixed 0-1 range so the bars read directly as rates
plt.show()

In [None]:
# Survival rate across Age bins
# Bucket ages into 10-year, right-closed intervals: (0, 10], (10, 20], ..., (80, 90].
# Rows with a missing Age get NaN for AgeBin and are dropped by the groupby below.
age_bins = list(range(0, 91, 10))
df['AgeBin'] = pd.cut(df['Age'], bins=age_bins)
# AgeBin is categorical. Pass `observed` explicitly so the result does not depend
# on the installed pandas version's default (the implicit default is deprecated
# since pandas 2.1). observed=False keeps empty bins in the result, which matches
# what this cell produced before.
age_rate = df.groupby('AgeBin', observed=False)['Survived'].mean().reset_index()
plt.figure(figsize=(10,4))
sns.barplot(x='AgeBin', y='Survived', data=age_rate, color='steelblue')
plt.xticks(rotation=45)  # interval labels are long; rotate them to avoid overlap
plt.ylabel('Survival Rate')
plt.title('Survival Rate by Age Bin')
plt.ylim(0,1)
plt.show()

In [None]:
# Correlation heatmap for numeric columns
# Keep only the numeric-dtype columns, then compute their pairwise correlation
# matrix with pandas' default method.
num_cols = df.select_dtypes(include=[np.number])
corr = num_cols.corr()

fig, ax = plt.subplots(figsize=(8, 6))
# Annotate every cell with its coefficient, formatted to two decimals.
sns.heatmap(data=corr, annot=True, fmt='.2f', cmap='coolwarm', square=True, ax=ax)
ax.set_title('Correlation Heatmap (numeric features)')
plt.show()

**Notes:**
- `Survived` is treated as 0/1.
- `Age` may contain missing values; `pd.cut` leaves `AgeBin` as NaN for those rows (it does not create a separate category), and the groupby drops them, so they do not appear in the age-bin plot.
- You can further handle missing values (imputation) or refine plots (e.g., KDE) as next steps.