# Titanic - EDA (Exploratory Data Analysis)

**Competition:** Titanic - Machine Learning from Disaster  
**Objective:** Predict survival (binary classification)  
**Metric:** Accuracy  
**Data:** 891 training rows / 418 test rows

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide configuration: silence library warnings and set plot defaults.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Load both competition splits (relative paths resolve against the kernel's working directory).
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Sanity check: dataset sizes and the overall survival rate in the training split.
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')
overall_survival = train['Survived'].mean()
print(f'\nSurvival rate: {overall_survival:.3f}')

In [None]:
# Overview: column dtypes, the shape of each split, and a preview of the first rows
print('=== TRAIN INFO ===', train.dtypes, sep='\n')
print('\n=== SHAPE ===')
train_shape, test_shape = train.shape, test.shape
print(f'Train: {train_shape}, Test: {test_shape}')
# The last expression is rendered as the cell's rich output.
train.head(10)

In [None]:
# Statistics
# include='all' summarizes every column regardless of dtype; statistics that do
# not apply to a given column's dtype appear as NaN in the resulting table.
train.describe(include='all')

In [None]:
# Missing values analysis
def missing_analysis(df, name=''):
    """Print and return a per-column summary of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str, optional
        Label shown in the printed header.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with the columns
        'Missing' (null count), 'Percent' (null count as a percentage of
        len(df)) and 'Type' (column dtype), sorted by 'Percent' descending.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing': null_counts,
        'Percent': 100 * null_counts / len(df),
        'Type': df.dtypes,
    })
    # Keep only the columns that actually contain nulls, worst offenders first.
    has_nulls = summary['Missing'] > 0
    summary = summary[has_nulls].sort_values('Percent', ascending=False)

    print(f'\n=== Missing Values ({name}) ===')
    print(summary)
    return summary

# Run the missing-value summary on each split (train first, then test) and
# store the returned tables in missing_train / missing_test.
missing_train, missing_test = (
    missing_analysis(split_df, split_label)
    for split_df, split_label in ((train, 'Train'), (test, 'Test'))
)

In [None]:
# Target distribution
# value_counts() orders classes by frequency, not by label. The tick labels,
# pie labels and colors below are attached by position, so pin the order to
# [0, 1] explicitly; otherwise they could end up on the wrong class.
class_order = [0, 1]
class_colors = ['#e74c3c', '#2ecc71']  # first color -> class 0, second -> class 1
survival_counts = train['Survived'].value_counts().reindex(class_order, fill_value=0)
survival_share = train['Survived'].value_counts(normalize=True).reindex(class_order, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

survival_counts.plot.bar(ax=axes[0], color=class_colors)
axes[0].set_title('Survival Count')
axes[0].set_xticklabels(['Died (0)', 'Survived (1)'], rotation=0)

survival_share.plot.pie(
    ax=axes[1], autopct='%1.1f%%', colors=class_colors,
    labels=['Died', 'Survived']
)
axes[1].set_title('Survival Rate')
axes[1].set_ylabel('')  # hide the series name pandas would otherwise use as the y-label

plt.tight_layout()
plt.show()
print(f'Survived: {train.Survived.sum()} ({train.Survived.mean()*100:.1f}%)')
print(f'Died: {(1-train.Survived).sum():.0f} ({(1-train.Survived.mean())*100:.1f}%)')

In [None]:
# Survival by Sex - THE most important feature
# Mean of the 0/1 Survived column per group is that group's survival rate.
survival_by_sex = train.groupby('Sex')['Survived'].mean()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
count_ax, rate_ax = axes

# Left panel: passenger counts per sex, split by outcome.
sns.countplot(x='Sex', hue='Survived', data=train, palette=['#e74c3c', '#2ecc71'], ax=count_ax)
count_ax.set_title('Survival by Sex')

# Right panel: survival rate per sex (groupby sorts the keys, so female comes before male).
survival_by_sex.plot.bar(ax=rate_ax, color=['#3498db', '#e91e63'])
rate_ax.set_title('Survival Rate by Sex')
rate_ax.set_ylabel('Survival Rate')
rate_ax.set_xticklabels(['Female', 'Male'], rotation=0)

plt.tight_layout()
plt.show()
female_rate = survival_by_sex["female"]
male_rate = survival_by_sex["male"]
print(f'Female survival: {female_rate:.3f}')
print(f'Male survival: {male_rate:.3f}')

In [None]:
# Survival by Pclass
# Mean of the 0/1 Survived column per class is that class's survival rate.
survival_by_class = train.groupby('Pclass')['Survived'].mean()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
count_ax, rate_ax = axes

# Left panel: passenger counts per class, split by outcome.
sns.countplot(x='Pclass', hue='Survived', data=train, palette=['#e74c3c', '#2ecc71'], ax=count_ax)
count_ax.set_title('Survival by Pclass')

# Right panel: survival rate per class (groupby sorts the keys ascending).
survival_by_class.plot.bar(ax=rate_ax, color=['#f39c12', '#3498db', '#95a5a6'])
rate_ax.set_title('Survival Rate by Pclass')
rate_ax.set_ylabel('Survival Rate')
rate_ax.set_xticklabels(['1st', '2nd', '3rd'], rotation=0)

plt.tight_layout()
plt.show()
print(survival_by_class)

In [None]:
# Survival by Sex + Pclass (key interaction)
# Rows are Sex, columns are Pclass, and each cell is the mean of Survived.
pivot = pd.pivot_table(
    train,
    values='Survived',
    index='Sex',
    columns='Pclass',
    aggfunc='mean',
)
print('Survival Rate by Sex x Pclass:')
print(pivot.round(3))

# Heatmap of the same table, annotated as percentages.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(pivot, ax=ax, cmap='RdYlGn', annot=True, fmt='.2%')
ax.set_title('Survival Rate: Sex x Pclass')
plt.show()

In [None]:
# Age distribution
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Use one set of bin edges for every age histogram. With bins=40 passed to each
# call separately, each subset derives its edges from its own min/max, so the
# overlaid survived/died bars would not line up and could not be compared.
age_bins = np.histogram_bin_edges(train['Age'].dropna(), bins=40)

# Panel 1: overall age histogram.
train['Age'].hist(bins=age_bins, ax=axes[0], color='#3498db', alpha=0.7)
axes[0].set_title('Age Distribution')

# Panel 2: survivors and non-survivors overlaid on the shared bins.
train.loc[train['Survived'] == 1, 'Age'].hist(
    bins=age_bins, ax=axes[1], color='#2ecc71', alpha=0.7, label='Survived')
train.loc[train['Survived'] == 0, 'Age'].hist(
    bins=age_bins, ax=axes[1], color='#e74c3c', alpha=0.5, label='Died')
axes[1].legend()
axes[1].set_title('Age by Survival')

# Panel 3: age spread per outcome.
sns.boxplot(data=train, x='Survived', y='Age', ax=axes[2], palette=['#e74c3c', '#2ecc71'])
axes[2].set_title('Age Boxplot by Survival')

plt.tight_layout()
plt.show()

In [None]:
# Fare distribution
fare = train['Fare']
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
raw_ax, log_ax = axes

# Left panel: fares on their original scale.
fare.hist(bins=50, ax=raw_ax, color='#3498db', alpha=0.7)
raw_ax.set_title('Fare Distribution')

# Right panel: log1p(x) = log(1 + x), so a fare of 0 maps to 0 rather than -inf.
np.log1p(fare).hist(bins=50, ax=log_ax, color='#9b59b6', alpha=0.7)
log_ax.set_title('Log(Fare+1) Distribution')

plt.tight_layout()
plt.show()
print(f'Fare stats: mean={fare.mean():.2f}, median={fare.median():.2f}, max={fare.max():.2f}')

In [None]:
# Embarked
# Survival rate per embarkation port; groupby drops missing ports and sorts the codes.
survival_by_emb = train.groupby('Embarked')['Survived'].mean()
port_order = list(survival_by_emb.index)
port_names = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Pass an explicit order so both panels list the ports in the same sequence;
# without it countplot uses order of appearance while the rate plot is sorted.
sns.countplot(data=train, x='Embarked', hue='Survived', order=port_order,
              ax=axes[0], palette=['#e74c3c', '#2ecc71'])
axes[0].set_title('Survival by Embarked')

survival_by_emb.plot.bar(ax=axes[1], color=['#e74c3c', '#f39c12', '#3498db'])
axes[1].set_title('Survival Rate by Embarked')
axes[1].set_ylabel('Survival Rate')  # matches the other survival-rate panels
# Build tick labels from the plotted index so a label cannot be attached to the
# wrong bar; unknown codes fall back to the bare code.
axes[1].set_xticklabels(
    [f'{code}={port_names.get(code, code)}' for code in port_order],
    rotation=0,
)

plt.tight_layout()
plt.show()

In [None]:
# Family Size analysis
# FamilySize = SibSp + Parch plus one for the passenger themselves.
# Note: this adds a column to `train` in place, so later cells see it too.
train['FamilySize'] = 1 + train['SibSp'] + train['Parch']
survival_by_fam = train.groupby('FamilySize')['Survived'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
count_ax, rate_ax = axes

# Left panel: passenger counts per family size, split by outcome.
sns.countplot(x='FamilySize', hue='Survived', data=train, palette=['#e74c3c', '#2ecc71'], ax=count_ax)
count_ax.set_title('Survival by Family Size')

# Right panel: survival rate per family size.
survival_by_fam.plot.bar(ax=rate_ax, color='#3498db')
rate_ax.set_title('Survival Rate by Family Size')
rate_ax.set_ylabel('Survival Rate')

plt.tight_layout()
plt.show()
print('Key insight: Family size 2-4 has highest survival rate')

In [None]:
# Name / Title analysis
# The pattern captures the first run of letters that follows a space and is
# immediately followed by a period. This adds a 'Title' column to `train` in place.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

title_counts = train['Title'].value_counts()
title_survival = train.groupby('Title')['Survived'].agg(['mean', 'count'])

print('Title counts:')
print(title_counts)
print('\nSurvival by Title:')
# Show only the ten most frequent titles.
print(title_survival.sort_values('count', ascending=False).head(10))

In [None]:
# Cabin analysis
# HasCabin is 1 when a cabin value is recorded and 0 otherwise; CabinLetter is the
# first character of the cabin string (NaN where the cabin is missing).
# Both columns are added to `train` in place.
train['HasCabin'] = train['Cabin'].notna().astype(int)
train['CabinLetter'] = train['Cabin'].str.get(0)

with_cabin_rate = train.loc[train['HasCabin'] == 1, 'Survived'].mean()
without_cabin_rate = train.loc[train['HasCabin'] == 0, 'Survived'].mean()
print(f'Has Cabin survival rate: {with_cabin_rate:.3f}')
print(f'No Cabin survival rate: {without_cabin_rate:.3f}')

# Survival rate per cabin letter, highest first.
survival_by_cabin = (
    train.groupby('CabinLetter')['Survived']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 4))
survival_by_cabin.plot.bar(ax=ax, color='#3498db')
ax.set_title('Survival Rate by Cabin Letter')
plt.show()

In [None]:
# Correlation matrix
# Uses every numeric column currently in `train`, so engineered columns such as
# FamilySize and HasCabin appear only if the cells that create them have been run.
numeric_cols = train.select_dtypes(include=[np.number]).columns
# PassengerId is a row identifier, not a feature; leaving it in only adds a
# meaningless row and column to the heatmap.
num_cols = numeric_cols[numeric_cols != 'PassengerId']

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(train[num_cols].corr(), annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()

## Key Findings

1. **Sex** is the strongest predictor: 74% female vs 19% male survival
2. **Pclass** matters: 1st class 63%, 2nd 47%, 3rd 24%
3. **Sex x Pclass interaction**: Nearly all 1st/2nd class women survived
4. **Age**: Children (<10) have higher survival, especially boys
5. **Family Size**: Families of 2-4 survive most often; passengers travelling alone or in large families survive less often
6. **Title**: Mrs/Miss high, Mr low, Master (boys) medium
7. **Cabin**: Having a recorded cabin is associated with a higher class, and hence with higher survival
8. **Fare**: A higher fare is associated with a better class, and hence with higher survival
9. **Embarked**: Cherbourg highest (more 1st class passengers)

### Missing Values Strategy
- **Age** (177 missing): Impute by Title median
- **Cabin** (687 missing): Use HasCabin flag + CabinLetter
- **Embarked** (2 missing): Fill with mode ('S')