Titanic Survival Prediction - Exploratory Data Analysis
========================================================
This script performs comprehensive EDA with visualizations.

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [54]:
# Shared look for every figure in this notebook: apply the matplotlib
# stylesheet first, then set seaborn's default colour palette.
PLOT_STYLE = 'seaborn-v0_8-whitegrid'
COLOR_PALETTE = 'husl'

plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

Load the Data

In [55]:
# Load the training data. The sample files are UTF-16 encoded, so that
# encoding is tried first; a standard CSV is then read with pandas' default
# encoding.
TRAIN_CSV_PATH = '../data/raw/train.csv'

try:
    train_df = pd.read_csv(TRAIN_CSV_PATH, encoding='utf-16')
except (UnicodeError, pd.errors.ParserError):
    # Only decoding/parsing failures mean "this is not a UTF-16 file".
    # Anything else (missing file, permission error, Ctrl-C) propagates
    # immediately instead of being hidden behind a second read attempt.
    train_df = pd.read_csv(TRAIN_CSV_PATH)

Dataset Overview

In [56]:
# Quick orientation: dimensions, column dtypes, a sample of rows and
# summary statistics for numeric and object (string) columns.
n_rows, n_cols = train_df.shape
print(f"\nDataset Shape: {n_rows} rows, {n_cols} columns")
print("\nColumn Names & Types:")
print(train_df.dtypes)

# Each remaining section is a banner followed by a table. The tables are built
# lazily so that banner/table output stays strictly interleaved.
overview_sections = (
    ("\n--- First 5 Rows ---", train_df.head),
    ("\n--- Statistical Summary (Numerical) ---", train_df.describe),
    ("\n--- Statistical Summary (Categorical) ---",
     lambda: train_df.describe(include=['object'])),
)
for banner, build_table in overview_sections:
    print(banner)
    print(build_table())


Dataset Shape: 891 rows, 12 columns

Column Names & Types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

--- First 5 Rows ---
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                         

Missing Value Analysis

In [57]:
# Null count per column and its share of all rows (percent, 2 decimals).
missing = train_df.isnull().sum()
missing_pct = (missing / len(train_df) * 100).round(2)

# Combine both measures into one table, worst-affected columns first.
missing_df = pd.DataFrame({'Missing Count': missing, 'Missing %': missing_pct})
missing_df = missing_df.sort_values('Missing Count', ascending=False)

# Report only the columns that actually contain nulls.
has_missing = missing_df['Missing Count'] > 0
print("\nMissing Values by Column:")
print(missing_df[has_missing])

# Visualization: Missing Values
# Bars follow the original column order; red marks columns with nulls,
# green marks complete columns.
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['#e74c3c' if null_count > 0 else '#2ecc71' for null_count in missing.values]
missing.plot.bar(ax=ax, color=colors)
ax.set_title('Missing Values by Feature', fontsize=14, fontweight='bold')
ax.set(xlabel='Feature', ylabel='Count of Missing Values')
ax.axhline(y=0, color='black', linewidth=0.5)
plt.xticks(rotation=45, ha='right')

# Write the figure to disk and release it (it is not shown inline).
fig.tight_layout()
fig.savefig('../reports/figures/missing_by_feature.png')
plt.close(fig)


Missing Values by Column:
          Missing Count  Missing %
Cabin               687      77.10
Age                 177      19.87
Embarked              2       0.22


Target Variable Analysis

In [58]:
# Class balance of the target column.
# value_counts() orders its result by frequency, but the labels and colours
# below are positional. Re-index to a fixed [0, 1] order so "No"/"Yes" can
# never be swapped if survivors are the majority, and so a class that is
# absent from the data shows up as 0 instead of raising a KeyError.
class_order = [0, 1]  # 0 = did not survive, 1 = survived
survived = train_df['Survived']
survival_counts = survived.value_counts().reindex(class_order, fill_value=0)
survival_pct = survived.value_counts(normalize=True).reindex(class_order, fill_value=0.0) * 100

print(f"\nSurvival Distribution:")
print(f"  Did Not Survive (0): {survival_counts[0]} ({survival_pct[0]:.1f}%)")
print(f"  Survived (1):        {survival_counts[1]} ({survival_pct[1]:.1f}%)")
print(f"\nOverall Survival Rate: {survival_pct[1]:.1f}%")

# Visualization: Survival Distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count plot (bar order matches class_order: red = 0, green = 1)
colors = ['#e74c3c', '#2ecc71']
survival_counts.plot(kind='bar', ax=axes[0], color=colors)
axes[0].set_title('Survival Count', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Survived')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(['No (0)', 'Yes (1)'], rotation=0)
# Annotate each bar with its count, placed slightly above the bar top.
for i, v in enumerate(survival_counts.values):
    axes[0].text(i, v + 10, str(v), ha='center', fontweight='bold')

# Pie chart (wedge order matches class_order as well)
axes[1].pie(survival_counts, labels=['Did Not Survive', 'Survived'],
            autopct='%1.1f%%', colors=colors, explode=(0.02, 0.02),
            shadow=True, startangle=90)
axes[1].set_title('Survival Proportion', fontsize=12, fontweight='bold')

# Write the figure to disk and release it (it is not shown inline).
plt.tight_layout()
plt.savefig('../reports/figures/survival_distribution.png')
plt.close()


Survival Distribution:
  Did Not Survive (0): 549 (61.6%)
  Survived (1):        342 (38.4%)

Overall Survival Rate: 38.4%
