# Titanic EDA
Exploratory Data Analysis of the Titanic dataset

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style
sns.set_style('whitegrid')
%matplotlib inline

# Load cleaned data
df = pd.read_csv('../../data/processed/cleaned_titanic.csv')
df.head()

## 1. Survival by Gender

In [None]:
# Grouped bar chart: passenger counts per value of `Sex`, split by `Survived`
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Sex', hue='Survived', order=['male', 'female'], ax=ax)
ax.set_title('Survival Count by Gender')
ax.set_xlabel('Sex')
# Replace the raw hue values in the legend with readable outcome names
ax.legend(labels=['Did Not Survive', 'Survived'], title='Survived')
plt.show()

**Insight:** The plot shows a clear disparity by sex: females have a noticeably higher survival rate than males. This suggests `Sex` is a strong predictor and should be retained as a key feature in modeling.

## 2. Survival by Passenger Class

In [None]:
# Grouped bar chart: passenger counts per value of `Pclass`, split by `Survived`
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Pclass', hue='Survived', ax=ax)
ax.set_title('Survival Count by Passenger Class')
# Label both axes and give the legend a title so the figure can be read
# on its own, without the raw column name or an untitled legend.
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Number of Passengers')
ax.legend(title='Survived', labels=['Did Not Survive', 'Survived'])
plt.show()

**Insight:** Passengers in higher classes (lower `Pclass` number) survived more often — first-class passengers show the largest share of survivors. This reflects socio-economic effects and should be modeled carefully (e.g., with `Pclass` or derived features).

## 3. Age Distribution by Survival

In [None]:
# Stacked histogram of `Age` (30 bins) with KDE curves, coloured by `Survived`
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    data=df,
    x='Age',
    hue='Survived',
    hue_order=[0, 1],  # pin the level order so the legend entries are 0 then 1
    bins=30,
    kde=True,
    multiple='stack',
    ax=ax,
)
ax.set_title('Age Distribution by Survival')

# Rename the entries of the legend seaborn already built rather than calling
# plt.legend([...]): histplot draws the hue levels in reverse order and also
# adds KDE lines, so a positional plt.legend() pairs the labels with the wrong
# artists and the colours no longer match the outcomes.
# NOTE(review): assumes Survived is coded 0 = did not survive, 1 = survived,
# as the original label order implies - confirm against the cleaned data.
legend = ax.get_legend()
if legend is not None:
    for entry, outcome in zip(legend.get_texts(), ['Did Not Survive', 'Survived']):
        entry.set_text(outcome)
plt.show()

**Insight:** Survivors and non-survivors both span a wide range of ages, though certain age groups (children and younger adults) show slightly higher survival. Careful handling of age (imputation and possibly age groups) is therefore recommended.

## 4. Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.

# 'number' selects every numeric dtype (e.g. int32 or float32 as well as
# int64/float64), so no numeric feature is silently left out of the matrix.
numeric_df = df.select_dtypes(include='number')
non_numeric = df.select_dtypes(exclude='number').columns

if numeric_df.shape[1] == 0:
    print("No numeric columns available for correlation analysis.")
else:
    # Create the figure only when there is something to draw, so the
    # no-numeric-columns case does not leave an empty figure behind.
    fig, ax = plt.subplots(figsize=(12, 8))
    corr = numeric_df.corr()
    # center=0 puts the midpoint of the diverging colormap at zero correlation
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)
    ax.set_title('Correlation Heatmap (Numeric Features Only)')
    fig.tight_layout()
    plt.show()

    # Tell the reader which columns the heatmap does not cover
    if len(non_numeric) > 0:
        print("Note: The following non-numeric columns were excluded from the correlation analysis:")
        print(list(non_numeric))

**Insight:** The correlation heatmap highlights relationships among numeric features. For example, `Fare` correlates with `Pclass` (negative correlation) and `FamilySize` may relate to survival; these relationships help guide feature engineering.

## 5. Fare Distribution by Survival

In [None]:
# Box plot of `Fare` for each value of `Survived`; rows with Fare of 300 or more are left out
fig, ax = plt.subplots(figsize=(12, 6))
fares_under_300 = df.loc[df['Fare'] < 300]
sns.boxplot(data=fares_under_300, x='Survived', y='Fare', ax=ax)
ax.set_title('Fare Distribution by Survival (Fare < $300)')
# Swap the tick labels at positions 0 and 1 for readable outcome names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Did Not Survive', 'Survived'])
plt.show()