# In [None]:
# Data Exploration Notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the train and test CSV files (paths are relative to the working directory)
train_df = pd.read_csv(filepath_or_buffer='../data/train.csv')
test_df = pd.read_csv(filepath_or_buffer='../data/test.csv')

# Preview the leading rows of each frame under its own heading
for heading, frame in (("Training Data:", train_df), ("\nTest Data:", test_df)):
    print(heading)
    display(frame.head())

# Basic info: column names, dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after each report.
print("\nTraining Data Info:")
train_df.info()
print("\nTest Data Info:")
test_df.info()

# Count the null entries in every column, for each frame in turn
for heading, frame in (
    ("\nMissing Values in Training Data:", train_df),
    ("\nMissing Values in Test Data:", test_df),
):
    print(heading)
    print(frame.isna().sum())

# Descriptive statistics for the training frame (describe() defaults)
print("\nStatistical Summary:")
summary_stats = train_df.describe()
print(summary_stats)

# Visualizations: a 2x3 grid of panels, each splitting one feature by 'Survived'
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
(ax_class, ax_sex, ax_age), (ax_fare, ax_port, ax_family) = axes

# Row 0, col 0: counts per passenger class
sns.countplot(data=train_df, x='Pclass', hue='Survived', ax=ax_class)
ax_class.set_title('Survival by Passenger Class')

# Row 0, col 1: counts per gender
sns.countplot(data=train_df, x='Sex', hue='Survived', ax=ax_sex)
ax_sex.set_title('Survival by Gender')

# Row 0, col 2: age histogram with a KDE overlay
sns.histplot(data=train_df, x='Age', hue='Survived', kde=True, ax=ax_age)
ax_age.set_title('Age Distribution by Survival')

# Row 1, col 0: fare histogram with a KDE overlay
sns.histplot(data=train_df, x='Fare', hue='Survived', kde=True, ax=ax_fare)
ax_fare.set_title('Fare Distribution by Survival')

# Row 1, col 1: counts per embarkment port
sns.countplot(data=train_df, x='Embarked', hue='Survived', ax=ax_port)
ax_port.set_title('Survival by Embarkment Port')

# Row 1, col 2: counts per family size.
# NOTE: this adds a 'FamilySize' column to train_df in place, so every later
# use of train_df sees it too. The "+ 1" presumably counts the passenger
# themselves -- confirm against the dataset description.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
sns.countplot(data=train_df, x='FamilySize', hue='Survived', ax=ax_family)
ax_family.set_title('Survival by Family Size')

# Space the panels so titles and tick labels do not collide, then render
plt.tight_layout()
plt.show()

# Correlation heatmap over the numeric columns of train_df (this includes any
# numeric column added to the frame earlier in the notebook)
plt.figure(figsize=(10, 8))
correlation = train_df.corr(numeric_only=True)
# sns.heatmap draws on the current axes and returns that Axes object, so the
# title is set on the returned handle
heatmap_ax = sns.heatmap(data=correlation, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Feature Correlation Heatmap')
plt.show()
