In [23]:
import pandas as pd
import numpy as np
import warnings

# Silence FutureWarning messages (e.g. library deprecation notices) so the
# notebook output stays readable. This installs a process-wide filter.
warnings.simplefilter("ignore", FutureWarning)

# Read the raw passenger table; the CSV is expected in the working directory.
df = pd.read_csv('Titanic-Dataset.csv')

def _extract_titles(data):
    """Return normalized passenger titles as a Series aligned with ``data``.

    The title is the word preceding the first period in ``Name`` (e.g.
    ``"Braund, Mr. Owen"`` -> ``"Mr"``).  Variants are folded into
    Mr/Miss/Mrs/Master and everything else (including names without a
    recognizable title) becomes ``"Rare"``.  When there is no ``Name``
    column every row gets ``"Unknown"``.
    """
    if 'Name' not in data:
        return pd.Series('Unknown', index=data.index, name='Title')

    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
        'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare',
    }
    raw_titles = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    return raw_titles.map(title_mapping).fillna('Rare').rename('Title')


def _family_size_category(size):
    """Bucket a family size: 0 = alone, 1 = small (2-4), 2 = large (5+)."""
    if size == 1:
        return 0
    elif size <= 4:
        return 1
    else:
        return 2


def _age_category(age):
    """Bucket an age: 0 = <=12, 1 = <=18, 2 = <=35, 3 = <=60, 4 = older."""
    if age <= 12:
        return 0
    elif age <= 18:
        return 1
    elif age <= 35:
        return 2
    elif age <= 60:
        return 3
    else:
        return 4


def advanced_titanic_feature_engineering(df):
    """
    Advanced feature engineering pipeline for Titanic dataset.
    Works even if some columns (Cabin, Ticket, Embarked) are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger table.  Columns used when present: Age, Cabin,
        Embarked, Fare, Pclass, SibSp, Parch, Name, Ticket.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is never modified) with missing
        Age/Embarked/Fare values imputed and these columns appended, in
        order: Age_Missing, Cabin_Missing*, Embarked_Missing*, FamilySize,
        IsAlone, FamilySizeCategory, Title, NameLength, Deck, NumCabins,
        TicketPrefix, TicketNumber, AgeCategory, AgeBin, FarePerPerson,
        FareCategory, FareLog.  (* only when the source column exists.)

    Notes
    -----
    Missing ages are filled with the median age of passengers sharing the
    same title and class, falling back to the overall median.  If every
    age is missing there is nothing to impute from and Age stays NaN.
    """
    # Create a copy to avoid modifying original data
    data = df.copy()

    # 1. MISSING VALUE INDICATORS
    # Recorded before imputation so the "was missing" signal is preserved.
    data['Age_Missing'] = data['Age'].isnull().astype(int) if 'Age' in data else 0
    if 'Cabin' in data:
        data['Cabin_Missing'] = data['Cabin'].isnull().astype(int)
    if 'Embarked' in data:
        data['Embarked_Missing'] = data['Embarked'].isnull().astype(int)

    # Titles are needed by the age imputation below, so derive them first.
    # The 'Title' column itself is still inserted in section 4 so that the
    # output column order is unchanged.
    titles = _extract_titles(data)

    # 2. HANDLE MISSING VALUES
    # Results are assigned back to the column instead of using
    # ``inplace=True`` on a column selection, which is chained assignment
    # and does not reliably update the frame.
    if 'Age' in data:
        group_keys = [titles]
        if 'Pclass' in data:
            group_keys.append(data['Pclass'])
        group_median = data['Age'].groupby(group_keys).transform('median')
        data['Age'] = data['Age'].fillna(group_median)
        # Groups with no known age at all fall back to the overall median.
        data['Age'] = data['Age'].fillna(data['Age'].median())

    if 'Embarked' in data:
        embarked_mode = data['Embarked'].mode()
        # mode() is empty when every value is missing; nothing to fill with.
        if not embarked_mode.empty:
            data['Embarked'] = data['Embarked'].fillna(embarked_mode.iloc[0])
    if 'Fare' in data and 'Pclass' in data:
        class_median_fare = data.groupby('Pclass')['Fare'].transform('median')
        data['Fare'] = data['Fare'].fillna(class_median_fare)

    # 3. FAMILY-RELATED FEATURES
    if {'SibSp', 'Parch'}.issubset(data.columns):
        data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    else:
        data['FamilySize'] = 1
        data['IsAlone'] = 1
    data['FamilySizeCategory'] = data['FamilySize'].apply(_family_size_category)

    # 4. NAME-BASED FEATURES
    if 'Name' in data:
        data['Title'] = titles
        data['NameLength'] = data['Name'].apply(len)
    else:
        data['Title'] = 'Unknown'
        data['NameLength'] = 0

    # 5. CABIN-BASED FEATURES
    if 'Cabin' in data:
        # Deck is the leading letter of the cabin code.
        data['Deck'] = data['Cabin'].str[0].fillna('Unknown')
        # A passenger may hold several space-separated cabins.
        data['NumCabins'] = data['Cabin'].apply(
            lambda x: 0 if pd.isnull(x) else len(x.split())
        )
    else:
        data['Deck'] = 'Unknown'
        data['NumCabins'] = 0

    # 6. TICKET-BASED FEATURES
    if 'Ticket' in data:
        # First run of letters / first run of digits in the ticket string.
        data['TicketPrefix'] = (
            data['Ticket'].str.extract(r'([A-Za-z]+)', expand=False).fillna('None')
        )
        data['TicketNumber'] = pd.to_numeric(
            data['Ticket'].str.extract(r'(\d+)', expand=False),
            errors='coerce'
        ).fillna(0)
    else:
        data['TicketPrefix'] = 'None'
        data['TicketNumber'] = 0

    # 7. AGE-BASED FEATURES
    if 'Age' in data:
        data['AgeCategory'] = data['Age'].apply(_age_category)
        # include_lowest=True keeps an age of exactly 0 in the first bin so
        # AgeBin agrees with AgeCategory at the lower edge.
        data['AgeBin'] = pd.cut(
            data['Age'],
            bins=[0, 12, 18, 35, 60, 100],
            labels=[0, 1, 2, 3, 4],
            include_lowest=True,
        )
    else:
        data['AgeCategory'] = 2
        data['AgeBin'] = 2

    # 8. FARE-BASED FEATURES
    if 'Fare' in data:
        data['FarePerPerson'] = data['Fare'] / data['FamilySize']
        data['FareCategory'] = pd.qcut(data['Fare'], q=4, labels=[0, 1, 2, 3])
        # log1p tames the heavy right tail and is defined for a fare of 0.
        data['FareLog'] = np.log1p(data['Fare'])
    else:
        data['FarePerPerson'] = 0
        data['FareCategory'] = 0
        data['FareLog'] = 0

    return data


In [24]:
# Run the pipeline on the raw passenger frame
titanic_engineered = advanced_titanic_feature_engineering(df)

# Report the column counts before and after, and how many were added
n_original = df.shape[1]
n_engineered = titanic_engineered.shape[1]
print("Original features:", n_original)
print("After feature engineering:", n_engineered)
print("New features created:", n_engineered - n_original)

Original features: 12
After feature engineering: 29
New features created: 17
