# 🔧 Titanic Survival Prediction - Feature Engineering

## 📊 Mục tiêu
- Tạo các features mới từ dữ liệu thô
- Xử lý missing values
- Chuẩn bị dữ liệu cho model training
- Tối ưu hóa performance của models

## 📋 Nội dung
1. **Data Loading & Setup**
2. **Title Extraction from Name**
3. **Family Features Creation**
4. **Age Processing & Grouping**
5. **Fare Processing & Binning**
6. **Cabin Features Extraction**
7. **Missing Values Handling**
8. **Feature Encoding**
9. **Feature Selection & Validation**


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

# Set style
# 'seaborn-v0_8' only exists on matplotlib >= 3.6; older releases ship the
# same style sheet under the name 'seaborn'. Pick whichever is available so
# the notebook does not fail on an older matplotlib.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

# Display settings: None removes the cap on the number of columns shown and
# lets pandas auto-detect the display width, so wide frames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("📚 Libraries imported successfully!")
print("🎨 Visualization style set!")


## 1. 📥 Data Loading & Setup


In [None]:
# Read the raw train and test splits from disk
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

# Report what was loaded (one line per message)
print(
    "🚢 Titanic Dataset Loaded Successfully!",
    f"📊 Training set shape: {train_df.shape}",
    f"📊 Test set shape: {test_df.shape}",
    sep="\n",
)

# Stack test below train with a fresh 0..n-1 index so that every engineered
# feature is computed the same way for both splits
all_data = pd.concat([train_df, test_df], ignore_index=True, sort=False)
print(f"📊 Combined dataset shape: {all_data.shape}")

# Show the first rows of the combined frame
print("\n🔍 Combined Data Preview:")
display(all_data.head())


## 2. 📝 Title Extraction from Name


In [None]:
# Extract title from Name
def extract_title(name):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the next period,
    with surrounding whitespace removed, e.g.
    ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Args:
        name: Name string containing at least one comma.

    Returns:
        The stripped title string.

    Raises:
        IndexError: If ``name`` contains no comma.
    """
    # Only the segment right after the first comma matters.
    after_comma = name.split(',', 2)[1]
    # Keep everything before the first period (the whole segment if none).
    honorific, _, _ = after_comma.partition('.')
    return honorific.strip()

# Derive a Title column from every passenger name
all_data['Title'] = all_data['Name'].apply(extract_title)

# Count each distinct raw title and print the table under a header
title_counts = all_data['Title'].value_counts()
print("📝 Unique Titles Found:", "=" * 40, title_counts, sep="\n")
# Group rare titles
def group_titles(title):
    """Collapse a raw title into one of six coarse categories.

    Args:
        title: Raw title string, e.g. as produced by ``extract_title``.

    Returns:
        ``'Mr'``, ``'Miss'``, ``'Mrs'``, ``'Master'`` or ``'Officer'`` when
        the title is listed for that category, otherwise ``'Rare'``.
    """
    # (category, raw titles mapped to it); checked in order.
    buckets = (
        ('Mr', ('Mr',)),
        ('Miss', ('Miss', 'Mlle')),
        ('Mrs', ('Mrs', 'Mme')),
        ('Master', ('Master',)),
        ('Officer', ('Dr', 'Rev', 'Col', 'Major', 'Capt')),
    )
    for category, members in buckets:
        if title in members:
            return category
    # Anything not listed above falls into the catch-all category.
    return 'Rare'

# Map every raw title onto its coarse category
all_data['TitleGroup'] = all_data['Title'].apply(group_titles)

# Count each category and print the table under a header
grouped_title_counts = all_data['TitleGroup'].value_counts()
print("\n📝 Grouped Titles:", "=" * 40, grouped_title_counts, sep="\n")

# Visualization: side-by-side bar charts of title counts before and after
# grouping. The plt.* calls act on pyplot's current axes, so each
# title/xticks call applies to the subplot selected just above it.
plt.figure(figsize=(15, 5))

# Left panel: counts of every raw title
plt.subplot(1, 2, 1)
title_counts.plot(kind='bar')
plt.title('Original Titles Distribution')
plt.xticks(rotation=45)

# Right panel: counts per grouped title category
plt.subplot(1, 2, 2)
grouped_title_counts.plot(kind='bar', color='coral')
plt.title('Grouped Titles Distribution')
plt.xticks(rotation=45)

# Adjust spacing between the panels, then render the figure
plt.tight_layout()
plt.show()


## 3. 👨‍👩‍👧‍👦 Family Features Creation


In [None]:
# Create family-related features
def create_family_features(df):
    """Add family-size features to ``df`` in place and return it.

    Columns written:
        FamilySize: ``SibSp + Parch + 1`` (the ``+ 1`` presumably counts the
            passenger themself).
        IsAlone: 1 where ``FamilySize`` equals 1, otherwise 0.
        FamilySizeGroup: categorical bucket of ``FamilySize`` using
            right-inclusive bins: ``'Alone'`` (1), ``'Small'`` (2-4),
            ``'Medium'`` (5-7), ``'Large'`` (8 and above).

    Args:
        df: DataFrame with numeric ``SibSp`` and ``Parch`` columns.
            It is modified in place.

    Returns:
        The same DataFrame object with the three columns added.
    """
    # Family size
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Is alone
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Family size groups. The last bin is open-ended so that an unusually
    # large family is still labelled 'Large' instead of silently becoming NaN
    # (a fixed upper edge would drop every size above it).
    df['FamilySizeGroup'] = pd.cut(
        df['FamilySize'],
        bins=[0, 1, 4, 7, float('inf')],
        labels=['Alone', 'Small', 'Medium', 'Large'],
    )

    return df

# Apply family features (adds FamilySize, IsAlone and FamilySizeGroup columns)
all_data = create_family_features(all_data)

# Display family features: summary statistics and value counts for the new
# columns. The headers are plain strings; they have no placeholders, so no
# f-string prefix is needed.
print("👨‍👩‍👧‍👦 Family Features Created:")
print("=" * 40)
print("Family Size Statistics:")
print(all_data['FamilySize'].describe())
print("\nIs Alone Distribution:")
print(all_data['IsAlone'].value_counts())
print("\nFamily Size Group Distribution:")
print(all_data['FamilySizeGroup'].value_counts())

# Visualization: three panels describing the family features. The plt.* calls
# act on pyplot's current axes, so each label/title call applies to the
# subplot selected just above it.
plt.figure(figsize=(15, 5))

# Panel 1: histogram of raw family sizes
plt.subplot(1, 3, 1)
all_data['FamilySize'].hist(bins=20, alpha=0.7, color='skyblue')
plt.title('Family Size Distribution')
plt.xlabel('Family Size')
plt.ylabel('Frequency')

# Panel 2: passengers with family vs. alone.
# value_counts() orders rows by frequency, not by value, so the bar order
# would depend on which class is larger. Pin the order to [0, 1] so the first
# bar is always IsAlone == 0 and the hard-coded tick labels below stay correct.
plt.subplot(1, 3, 2)
all_data['IsAlone'].value_counts().reindex([0, 1], fill_value=0).plot(kind='bar', color='lightgreen')
plt.title('Is Alone Distribution')
plt.xlabel('Is Alone')
plt.ylabel('Count')
plt.xticks([0, 1], ['With Family', 'Alone'], rotation=0)

# Panel 3: counts per family-size bucket (tick labels come from the index,
# so frequency ordering is harmless here)
plt.subplot(1, 3, 3)
all_data['FamilySizeGroup'].value_counts().plot(kind='bar', color='coral')
plt.title('Family Size Group Distribution')
plt.xlabel('Family Size Group')
plt.ylabel('Count')
plt.xticks(rotation=45)

# Adjust spacing between the panels, then render the figure
plt.tight_layout()
plt.show()
