# 🚢 Titanic Survival Prediction - EDA Analysis

## 📊 Mục tiêu
- Khám phá và hiểu rõ dataset Titanic
- Phân tích missing values và outliers
- Tìm hiểu patterns trong survival data
- Chuẩn bị dữ liệu cho feature engineering

## 📋 Nội dung
1. **Data Loading & Overview**
2. **Missing Values Analysis**
3. **Survival Patterns Exploration**
4. **Feature Distribution Analysis**
5. **Correlation Analysis**
6. **Outlier Detection**
7. **Key Insights & Next Steps**


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Plot appearance. Order matters: the style sheet is applied first so the
# palette set afterwards is the one that ends up in the colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Lift pandas' display limits (column count, line width, cell width).
for _option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_option, None)

print("📚 Libraries imported successfully!")
print("🎨 Visualization style set!")


## 1. 📥 Data Loading & Overview


In [None]:
# Read the raw train/test CSVs (paths are relative to the notebook's folder).
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

print("🚢 Titanic Dataset Loaded Successfully!")
print(f"📊 Training set shape: {train_df.shape}")
print(f"📊 Test set shape: {test_df.shape}")

# Show the first rows of each split, training set first.
for _heading, _frame in (
    ("\n🔍 Training Data Preview:", train_df),
    ("\n🔍 Test Data Preview:", test_df),
):
    print(_heading)
    display(_frame.head())


In [None]:
# Structural overview (dtypes, non-null counts) for both splits.
for _heading, _frame in (
    ("📋 Training Dataset Info:", train_df),
    ("\n📋 Test Dataset Info:", test_df),
):
    print(_heading)
    print("=" * 50)
    _frame.info()

# Descriptive statistics of the numeric columns for both splits.
for _heading, _frame in (
    ("\n📊 Training Dataset Statistics:", train_df),
    ("\n📊 Test Dataset Statistics:", test_df),
):
    print(_heading)
    print("=" * 50)
    display(_frame.describe())


## 2. 🔍 Missing Values Analysis


In [None]:
# Missing values analysis
def analyze_missing_values(df, dataset_name):
    """Summarise and plot the missing values of ``df``.

    Prints a header, then builds a table with one row per column that has
    at least one null, sorted by null count (descending). When such columns
    exist the table is displayed and drawn as two bar charts (count and
    percentage); otherwise a "no missing values" message is printed.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the printed header and plot titles.

    Returns:
        DataFrame with columns ``'Missing Count'`` and
        ``'Missing Percentage'`` (empty when nothing is missing).
    """
    print(f"🔍 Missing Values Analysis - {dataset_name}")
    print("=" * 60)

    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': (null_counts / len(df)) * 100,
    })

    # Keep only the columns that actually have nulls, worst offenders first.
    has_nulls = summary['Missing Count'] > 0
    summary = summary[has_nulls].sort_values('Missing Count', ascending=False)

    if summary.empty:
        print("✅ No missing values found!")
        return summary

    display(summary)

    # Side-by-side bar charts: absolute counts on the left, share on the right.
    panels = (
        ('Missing Count', f'Missing Values Count - {dataset_name}', {}),
        ('Missing Percentage', f'Missing Values Percentage - {dataset_name}',
         {'color': 'orange'}),
    )
    plt.figure(figsize=(12, 6))
    for position, (column, title, bar_style) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        summary[column].plot(kind='bar', **bar_style)
        plt.title(title)
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    return summary

# Run the missing-value report on each split (training first, then test).
train_missing, test_missing = (
    analyze_missing_values(_frame, _label)
    for _frame, _label in ((train_df, "Training Set"), (test_df, "Test Set"))
)


## 4. 📊 Feature Distribution Analysis


In [None]:
# Feature distribution analysis
# Draws a 3x4 grid: raw distributions (panels 1-4), feature-vs-survival box
# plots (5-7), overlaid histograms by outcome (8-9) and survival-rate bar
# charts (10-12).

# Panels 7, 10 and 11 need the derived columns 'FamilySize' and 'AgeGroup'.
# They are created here when absent so this cell does not depend on another
# cell having been executed first; the definitions match the ones used in the
# survival-patterns section.
if 'FamilySize' not in train_df.columns:
    train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
if 'AgeGroup' not in train_df.columns:
    train_df['AgeGroup'] = pd.cut(train_df['Age'], bins=[0, 12, 18, 35, 60, 100],
                                  labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

plt.figure(figsize=(20, 15))

# 1. Age distribution (rows with missing Age are dropped for the histogram)
plt.subplot(3, 4, 1)
plt.hist(train_df['Age'].dropna(), bins=30, alpha=0.7, color='skyblue', edgecolor='black')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')

# 2. Fare distribution
plt.subplot(3, 4, 2)
plt.hist(train_df['Fare'], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
plt.title('Fare Distribution')
plt.xlabel('Fare')
plt.ylabel('Frequency')

# 3. SibSp distribution
plt.subplot(3, 4, 3)
sns.countplot(data=train_df, x='SibSp')
plt.title('Siblings/Spouses Distribution')
plt.xticks(rotation=45)

# 4. Parch distribution
plt.subplot(3, 4, 4)
sns.countplot(data=train_df, x='Parch')
plt.title('Parents/Children Distribution')
plt.xticks(rotation=45)

# 5. Age vs Survival
plt.subplot(3, 4, 5)
sns.boxplot(data=train_df, x='Survived', y='Age')
plt.title('Age vs Survival')
plt.xticks([0, 1], ['Died', 'Survived'])

# 6. Fare vs Survival
plt.subplot(3, 4, 6)
sns.boxplot(data=train_df, x='Survived', y='Fare')
plt.title('Fare vs Survival')
plt.xticks([0, 1], ['Died', 'Survived'])

# 7. Family Size vs Survival
plt.subplot(3, 4, 7)
sns.boxplot(data=train_df, x='Survived', y='FamilySize')
plt.title('Family Size vs Survival')
plt.xticks([0, 1], ['Died', 'Survived'])

# 8. Age distribution by survival (one translucent histogram per outcome)
plt.subplot(3, 4, 8)
for survived in [0, 1]:
    plt.hist(train_df[train_df['Survived'] == survived]['Age'].dropna(),
             alpha=0.6, label=f'Survived: {survived}', bins=20)
plt.title('Age Distribution by Survival')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend()

# 9. Fare distribution by survival
plt.subplot(3, 4, 9)
for survived in [0, 1]:
    plt.hist(train_df[train_df['Survived'] == survived]['Fare'],
             alpha=0.6, label=f'Survived: {survived}', bins=20)
plt.title('Fare Distribution by Survival')
plt.xlabel('Fare')
plt.ylabel('Frequency')
plt.legend()

# 10. Survival rate by age groups
# observed=False is spelled out so every age category keeps a bar regardless
# of the pandas version's default for categorical groupers.
plt.subplot(3, 4, 10)
age_survival = train_df.groupby('AgeGroup', observed=False)['Survived'].mean()
age_survival.plot(kind='bar', color='coral')
plt.title('Survival Rate by Age Group')
plt.ylabel('Survival Rate')
plt.xticks(rotation=45)

# 11. Survival rate by family size
plt.subplot(3, 4, 11)
family_survival = train_df.groupby('FamilySize')['Survived'].mean()
family_survival.plot(kind='bar', color='lightblue')
plt.title('Survival Rate by Family Size')
plt.ylabel('Survival Rate')
plt.xticks(rotation=45)

# 12. Survival rate by class and gender (one bar per Sex within each Pclass)
plt.subplot(3, 4, 12)
class_sex_survival = train_df.groupby(['Pclass', 'Sex'])['Survived'].mean().unstack()
class_sex_survival.plot(kind='bar', ax=plt.gca())
plt.title('Survival Rate by Class and Gender')
plt.ylabel('Survival Rate')
plt.xticks(rotation=0)
plt.legend(title='Gender')

plt.tight_layout()
plt.show()


## 5. 🔗 Correlation Analysis


In [None]:
# Correlation analysis
# Works on a copy so the numeric re-encoding below never touches train_df.
correlation_data = train_df.copy()

# 'FamilySize' is part of the matrix; derive it on the copy when it has not
# been added to train_df yet, so this cell runs regardless of which other
# cells were executed before it.
if 'FamilySize' not in correlation_data.columns:
    correlation_data['FamilySize'] = correlation_data['SibSp'] + correlation_data['Parch'] + 1

# Convert categorical variables to numeric.
# NOTE(review): 'Embarked' is a nominal port code; the 0/1/2 encoding imposes
# an arbitrary order, so its correlation value should be read with caution.
correlation_data['Sex'] = correlation_data['Sex'].map({'male': 0, 'female': 1})
correlation_data['Embarked'] = correlation_data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Correlation matrix over the selected columns
correlation_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
                       'Fare', 'Embarked', 'FamilySize']
correlation_matrix = correlation_data[correlation_columns].corr()

# Visualization
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8})
plt.title('Correlation Matrix - Titanic Dataset')
plt.tight_layout()
plt.show()

# Correlation of every feature with the target, strongest (by magnitude) first
print("🔗 Correlation with Survival:")
print("=" * 40)
survival_corr = correlation_matrix['Survived'].drop('Survived').sort_values(key=abs, ascending=False)
for feature, corr in survival_corr.items():
    print(f"{feature:12}: {corr:6.3f}")

# Split the same ranking by sign
print("\n📊 Feature Importance Analysis:")
print("=" * 40)
print("Positive correlation (higher value = higher survival chance):")
positive_features = survival_corr[survival_corr > 0]
for feature, corr in positive_features.items():
    print(f"  {feature:12}: {corr:6.3f}")

print("\nNegative correlation (higher value = lower survival chance):")
negative_features = survival_corr[survival_corr < 0]
for feature, corr in negative_features.items():
    print(f"  {feature:12}: {corr:6.3f}")


## 6. 🎯 Outlier Detection


In [None]:
# Outlier detection
def detect_outliers(df, column, *, iqr_multiplier=1.5):
    """Detect outliers in ``df[column]`` using the IQR method.

    A value is an outlier when it lies below ``Q1 - iqr_multiplier * IQR``
    or above ``Q3 + iqr_multiplier * IQR``, where ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the numeric column to inspect.
        iqr_multiplier: Distance of the fences from the quartiles, in IQR
            units. Defaults to 1.5; larger values flag only more extreme
            points.

    Returns:
        Tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is
        the subset of rows of ``df`` whose ``column`` value falls outside
        the two bounds.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence_distance = iqr_multiplier * (q3 - q1)
    lower_bound = q1 - fence_distance
    upper_bound = q3 + fence_distance

    # NaN compares False on both sides, so missing values are never flagged.
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Analyze outliers for numerical features
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
outlier_summary = {}

# 'FamilySize' is a derived column; create it when absent so this cell does
# not depend on another cell having been executed first.
if 'FamilySize' not in train_df.columns:
    train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

plt.figure(figsize=(15, 10))

for i, feature in enumerate(numerical_features, 1):
    plt.subplot(2, 3, i)

    # Box plot
    sns.boxplot(data=train_df, y=feature)
    plt.title(f'{feature} - Outlier Detection')

    # Detect outliers with the IQR rule and record the result per feature.
    # The percentage is relative to all rows of train_df, including rows
    # where the feature itself is missing.
    outliers, lower, upper = detect_outliers(train_df, feature)
    outlier_count = len(outliers)
    outlier_percentage = outlier_count / len(train_df) * 100
    outlier_summary[feature] = {
        'count': outlier_count,
        'percentage': outlier_percentage,
        'lower_bound': lower,
        'upper_bound': upper
    }

    print(f"🎯 {feature} Outliers:")
    print(f"   Count: {outlier_count} ({outlier_percentage:.1f}%)")
    print(f"   Range: [{lower:.2f}, {upper:.2f}]")
    if outlier_count > 0:
        print(f"   Min outlier: {outliers[feature].min():.2f}")
        print(f"   Max outlier: {outliers[feature].max():.2f}")
    print()

plt.tight_layout()
plt.show()

# Summary table: one row per feature, most outliers first
outlier_df = pd.DataFrame(outlier_summary).T
outlier_df = outlier_df.sort_values('count', ascending=False)
print("📊 Outlier Summary:")
print("=" * 50)
display(outlier_df)


## 7. 🔍 Key Insights & Next Steps


In [None]:
# Key insights summary
# Relies on `survival_corr` (correlation cell) and `outlier_summary`
# (outlier cell) already being defined.
print("🔍 KEY INSIGHTS FROM EDA:")
print("=" * 50)

# Computed here rather than taken from another cell, so the summary does not
# fail with a NameError when the survival-patterns cell has not been run.
survival_rate = train_df['Survived'].mean()

print("\n📊 SURVIVAL PATTERNS:")
print("-" * 30)
print(f"• Overall survival rate: {survival_rate:.1%}")
print(f"• Female survival rate: {train_df[train_df['Sex']=='female']['Survived'].mean():.1%}")
print(f"• Male survival rate: {train_df[train_df['Sex']=='male']['Survived'].mean():.1%}")
print(f"• 1st class survival rate: {train_df[train_df['Pclass']==1]['Survived'].mean():.1%}")
print(f"• 2nd class survival rate: {train_df[train_df['Pclass']==2]['Survived'].mean():.1%}")
print(f"• 3rd class survival rate: {train_df[train_df['Pclass']==3]['Survived'].mean():.1%}")

print("\n🔍 MISSING VALUES:")
print("-" * 30)
# Count and share of nulls for the three columns listed here.
for column in ('Age', 'Cabin', 'Embarked'):
    missing_count = train_df[column].isnull().sum()
    print(f"• {column}: {missing_count} missing ({missing_count/len(train_df)*100:.1f}%)")

print("\n🎯 FEATURE CORRELATIONS WITH SURVIVAL:")
print("-" * 30)
# survival_corr is sorted by absolute correlation, so head(5) is the top five.
for feature, corr in survival_corr.head(5).items():
    direction = "📈" if corr > 0 else "📉"
    print(f"• {feature:12}: {corr:6.3f} {direction}")

print("\n🚨 OUTLIERS DETECTED:")
print("-" * 30)
for feature, info in outlier_summary.items():
    if info['count'] > 0:
        print(f"• {feature:12}: {info['count']:3d} outliers ({info['percentage']:4.1f}%)")

print("\n📋 NEXT STEPS:")
print("-" * 30)
print("1. 🔧 Feature Engineering:")
print("   • Extract title from Name")
print("   • Create family size features")
print("   • Handle missing values in Age, Cabin, Embarked")
print("   • Create age groups and fare bins")
print("   • Extract cabin deck information")

print("\n2. 🤖 Model Training:")
print("   • Train baseline models (Logistic Regression, Random Forest)")
print("   • Try advanced models (XGBoost, Neural Networks)")
print("   • Implement cross-validation")

print("\n3. 🎯 Model Optimization:")
print("   • Hyperparameter tuning")
print("   • Feature selection")
print("   • Ensemble methods")

print("\n4. 📊 Model Evaluation:")
print("   • Performance metrics")
print("   • Feature importance analysis")
print("   • Model comparison")

print("\n✅ EDA Analysis Complete!")
print("🚀 Ready to proceed to Feature Engineering!")


## 3. 🎯 Survival Patterns Exploration


In [None]:
# Overall survival rate
# 'Survived' is summed and averaged as a 0/1 flag (1 = survived).
survival_rate = train_df['Survived'].mean()
print(f"🎯 Overall Survival Rate: {survival_rate:.2%}")
print(f"💀 Total Passengers: {len(train_df)}")
print(f"✅ Survived: {train_df['Survived'].sum()}")
print(f"❌ Died: {len(train_df) - train_df['Survived'].sum()}")

# Survival distribution: 2x3 grid of panels
plt.figure(figsize=(15, 10))

# 1. Overall survival
plt.subplot(2, 3, 1)
# value_counts() orders by frequency, so the counts are re-indexed to the
# fixed order [0, 1]; the 'Died'/'Survived' labels and colours then stay
# correct whichever class is larger.
survival_counts = train_df['Survived'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(survival_counts.values, labels=['Died', 'Survived'], autopct='%1.1f%%',
        colors=['#ff6b6b', '#4ecdc4'])
plt.title('Overall Survival Distribution')

# 2. Survival by Gender
plt.subplot(2, 3, 2)
survival_by_sex = train_df.groupby('Sex')['Survived'].agg(['count', 'sum', 'mean'])
survival_by_sex.columns = ['Total', 'Survived', 'Survival_Rate']
print("\n👥 Survival by Gender:")
display(survival_by_sex)

sns.countplot(data=train_df, x='Sex', hue='Survived')
plt.title('Survival by Gender')
plt.legend(['Died', 'Survived'])

# 3. Survival by Class
plt.subplot(2, 3, 3)
survival_by_class = train_df.groupby('Pclass')['Survived'].agg(['count', 'sum', 'mean'])
survival_by_class.columns = ['Total', 'Survived', 'Survival_Rate']
print("\n🎫 Survival by Class:")
display(survival_by_class)

sns.countplot(data=train_df, x='Pclass', hue='Survived')
plt.title('Survival by Passenger Class')
plt.legend(['Died', 'Survived'])

# 4. Survival by Embarkation
plt.subplot(2, 3, 4)
survival_by_embark = train_df.groupby('Embarked')['Survived'].agg(['count', 'sum', 'mean'])
survival_by_embark.columns = ['Total', 'Survived', 'Survival_Rate']
print("\n🚢 Survival by Embarkation:")
display(survival_by_embark)

sns.countplot(data=train_df, x='Embarked', hue='Survived')
plt.title('Survival by Embarkation Port')
plt.legend(['Died', 'Survived'])

# 5. Survival by Age Groups
plt.subplot(2, 3, 5)
# Adds the derived 'AgeGroup' column to train_df; bins are right-closed,
# i.e. (0, 12], (12, 18], (18, 35], (35, 60], (60, 100].
train_df['AgeGroup'] = pd.cut(train_df['Age'], bins=[0, 12, 18, 35, 60, 100],
                              labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])
# observed=False is spelled out so every age category keeps a row regardless
# of the pandas version's default for categorical groupers.
survival_by_age = train_df.groupby('AgeGroup', observed=False)['Survived'].agg(['count', 'sum', 'mean'])
survival_by_age.columns = ['Total', 'Survived', 'Survival_Rate']
print("\n🎂 Survival by Age Group:")
display(survival_by_age)

sns.countplot(data=train_df, x='AgeGroup', hue='Survived')
plt.title('Survival by Age Group')
plt.legend(['Died', 'Survived'])
plt.xticks(rotation=45)

# 6. Survival by Family Size
plt.subplot(2, 3, 6)
# Adds the derived 'FamilySize' column to train_df: SibSp + Parch + 1.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
survival_by_family = train_df.groupby('FamilySize')['Survived'].agg(['count', 'sum', 'mean'])
survival_by_family.columns = ['Total', 'Survived', 'Survival_Rate']
print("\n👨‍👩‍👧‍👦 Survival by Family Size:")
display(survival_by_family)

sns.countplot(data=train_df, x='FamilySize', hue='Survived')
plt.title('Survival by Family Size')
plt.legend(['Died', 'Survived'])

plt.tight_layout()
plt.show()
