# Titanic - Feature Engineering

Features basées sur l'EDA et sur les solutions gagnantes de Kaggle.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the two raw splits from disk.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Stack both splits into one frame so every engineered feature is computed
# the same way on train and test rows. `is_train` records each row's origin
# (1 = train, 0 = test) so the frame can be split apart again afterwards;
# test rows receive an explicit NaN placeholder for the target column.
for split_frame, origin_flag in ((train, 1), (test, 0)):
    split_frame['is_train'] = origin_flag
test['Survived'] = np.nan
df = pd.concat([train, test], ignore_index=True)

print(f'Combined shape: {df.shape}')
print(f'Train: {train.shape[0]}, Test: {test.shape[0]}')

## 1. Title Extraction

In [None]:
# Pull the title out of Name: the run of letters that is preceded by a space
# and immediately followed by a period.
title_pattern = r' ([A-Za-z]+)\.'
df['Title'] = df['Name'].str.extract(title_pattern, expand=False)

# Collapse the raw titles into five buckets: the four common titles map to
# themselves, three variants fold into Miss/Mrs, and the rest become 'Rare'.
title_map = {common: common for common in ('Mr', 'Miss', 'Mrs', 'Master')}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
title_map.update(dict.fromkeys(
    ('Dr', 'Rev', 'Col', 'Major', 'Countess', 'Lady', 'Sir', 'Don', 'Dona',
     'Jonkheer', 'Capt'),
    'Rare',
))

# Titles absent from the map (and names where the pattern found nothing) come
# back from .map() as NaN, so they are folded into 'Rare' as well.
df['Title'] = df['Title'].map(title_map).fillna('Rare')

print('Title distribution:')
print(df['Title'].value_counts())

## 2. Missing Values Imputation

In [None]:
# Age: a missing value takes the median age of the rows sharing its Title.
# The medians are computed once, before any filling, over the combined frame.
age_by_title = df.groupby('Title')['Age'].median()
print('Median age by title:')
print(age_by_title)

# Look up every row's title median, then use it only where Age is missing.
df['Age'] = df['Age'].fillna(df['Title'].map(age_by_title))

# Embarked: missing ports are set to the constant 'S'.
df.loc[df['Embarked'].isna(), 'Embarked'] = 'S'

# Fare: a missing value takes the median fare of the row's Pclass.
df['Fare'] = df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform('median'))

# Report whichever columns still contain missing values.
print(f'\nRemaining missing: {df.isnull().sum()[df.isnull().sum() > 0]}')

## 3. Family Features

In [None]:
# Family size = SibSp + Parch + 1 (the +1 presumably counts the passenger).
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

# Flag rows whose family size is exactly 1.
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

# Family size categories (based on EDA findings)
def family_category(size):
    """Bucket a family size into 'Alone', 'Small' or 'Large'.

    Args:
        size: Numeric family size.

    Returns:
        'Alone' when size equals 1; otherwise 'Small' when size is at most 4
        (this includes any value below 1) and 'Large' for everything else.
    """
    if size == 1:
        return 'Alone'
    return 'Small' if size <= 4 else 'Large'

# Label every row with its family-size bucket, element by element.
df['FamilyCategory'] = df['FamilySize'].map(family_category)

print('Family categories:')
print(df['FamilyCategory'].value_counts())

## 4. Ticket Features

In [None]:
# Ticket frequency: how many rows of the combined frame carry the same ticket
# string (a shared ticket is taken as a sign of travelling together).
ticket_counts = df['Ticket'].value_counts()
df['TicketFreq'] = df['Ticket'].map(ticket_counts)


def _ticket_prefix(ticket):
    """Return the first whitespace-separated token of a multi-token ticket, else 'NONE'."""
    tokens = ticket.split()
    return tokens[0] if len(tokens) > 1 else 'NONE'


# Ticket prefix, normalised by dropping '.' and '/' and upper-casing.
raw_prefix = df['Ticket'].map(_ticket_prefix)
df['TicketPrefix'] = (
    raw_prefix
    .str.replace('.', '', regex=False)
    .str.replace('/', '', regex=False)
    .str.upper()
)

# 1 when the whole ticket string is numeric, 0 otherwise.
ticket_is_numeric = df['Ticket'].str.isnumeric()
df['TicketIsNum'] = ticket_is_numeric.astype(int)

print(f'Ticket frequencies: min={df.TicketFreq.min()}, max={df.TicketFreq.max()}')
print(f'\nTop ticket prefixes:')
print(df['TicketPrefix'].value_counts().head(10))

## 5. Cabin Features

In [None]:
cabin_missing = df['Cabin'].isna()

# 1 when a cabin value is recorded, 0 when it is missing.
df['HasCabin'] = (~cabin_missing).astype(int)

# Deck = first character of the cabin string; 'Unknown' when Cabin is missing.
df['CabinDeck'] = df['Cabin'].str.get(0).fillna('Unknown')

# Number of whitespace-separated cabin entries; 0 when Cabin is missing.
cabin_tokens = df['Cabin'].str.split()
df['CabinCount'] = cabin_tokens.str.len().fillna(0).astype('int64')

print('Cabin deck distribution:')
print(df['CabinDeck'].value_counts())

## 6. Age Features

In [None]:
# Age categories. Bin edges are right-inclusive: (0, 5], (5, 12], ..., (60, 100].
# include_lowest=True also closes the first bin on the left, so an Age of
# exactly 0 is labelled 'Baby' instead of silently becoming NaN (which the
# later string-based encoding would turn into a spurious 'nan' class).
# Ages above 100 still fall outside every bin and yield NaN.
df['AgeBin'] = pd.cut(df['Age'], bins=[0, 5, 12, 18, 35, 60, 100],
                      labels=['Baby', 'Child', 'Teen', 'Adult', 'Middle', 'Senior'],
                      include_lowest=True)

# Is child: uses the same cut-off as the upper edge of the 'Child' bin (<= 12)
df['IsChild'] = (df['Age'] <= 12).astype(int)

# Age * Class interaction
df['Age_Pclass'] = df['Age'] * df['Pclass']

print('Age bins:')
print(df['AgeBin'].value_counts().sort_index())

## 7. Fare Features

In [None]:
fare = df['Fare']

# log(1 + Fare): compresses the scale while keeping zero fares finite.
df['LogFare'] = np.log1p(fare)

# Fare divided by the number of rows sharing the same ticket.
df['FarePerPerson'] = fare.div(df['TicketFreq'])

# Quantile-based fare bins, one label per quintile from cheapest to dearest.
fare_labels = ['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh']
df['FareBin'] = pd.qcut(fare, q=len(fare_labels), labels=fare_labels)

print('Fare bins:')
print(df['FareBin'].value_counts().sort_index())

## 8. Surname / Group Features (Gold Medal trick)

In [None]:
# Surname = everything in Name before the first comma.
df['Surname'] = df['Name'].str.split(',', n=1).str.get(0)

# How many rows of the combined frame share each surname.
surname_counts = df['Surname'].value_counts()
df['SurnameFreq'] = df['Surname'].map(surname_counts)

# Binary sex flag: 1 for 'female', 0 otherwise.
# NOTE(review): no group-survival ("women and children first") feature is
# computed in this cell; only this flag is created.
df['SexNum'] = df['Sex'].eq('female').astype(int)

print(f'Unique surnames: {df.Surname.nunique()}')
print(f'Max surname freq: {df.SurnameFreq.max()}')

## 9. Encode Categorical Variables

In [None]:
# Integer-encode the categorical features into '<column>_enc' columns.
# Each column is cast to str first, so a missing value (if any) is encoded as
# its own 'nan' class. The fitted encoders are kept, keyed by column name.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes of
# the ordered bins (AgeBin, FareBin) follow the alphabet rather than the bin
# order - confirm this is acceptable for the downstream model.
cat_cols_to_encode = ['Title', 'FamilyCategory', 'CabinDeck', 'AgeBin', 'FareBin', 'Embarked']

label_encoders = {col: LabelEncoder() for col in cat_cols_to_encode}
for col, encoder in label_encoders.items():
    as_text = df[col].astype(str)
    df[f'{col}_enc'] = encoder.fit_transform(as_text)

# Sex encoding: 1 for 'male', 0 otherwise.
df['Sex_enc'] = df['Sex'].eq('male').astype(int)

print('Encoded columns created.')

## 10. Final Feature Selection

In [None]:
# Final model inputs. The order of this list is the column order of X/X_test.
# NOTE(review): 'Sex_enc' (male = 1) and 'SexNum' (female = 1) are exact
# complements of each other - confirm both are meant to be kept.
features = [
    # raw / imputed columns
    'Pclass', 'Sex_enc', 'Age', 'SibSp', 'Parch', 'Fare',
    # title and family
    'Title_enc', 'FamilySize', 'IsAlone', 'FamilyCategory_enc',
    # ticket
    'TicketFreq', 'TicketIsNum',
    # cabin
    'HasCabin', 'CabinDeck_enc', 'CabinCount',
    # age
    'IsChild', 'Age_Pclass', 'AgeBin_enc',
    # fare
    'LogFare', 'FarePerPerson', 'FareBin_enc',
    # port
    'Embarked_enc',
    # surname / sex flag
    'SurnameFreq', 'SexNum',
]

# Undo the concat: recover the two splits through the is_train marker.
from_train = df['is_train'].eq(1)
from_test = df['is_train'].eq(0)
train_fe = df.loc[from_train].copy()
test_fe = df.loc[from_test].copy()

# Plain arrays for modelling; the target is cast from float back to int.
X = train_fe[features].to_numpy()
y = train_fe['Survived'].to_numpy().astype(int)
X_test = test_fe[features].to_numpy()
test_ids = test_fe['PassengerId'].to_numpy()

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'\nFeatures ({len(features)}):')
for i, f in enumerate(features):
    print(f'  {i+1}. {f}')

In [None]:
# Persist the engineered splits: the id column (plus the target for train)
# followed by the selected features, without the DataFrame index.
train_columns = ['PassengerId', 'Survived', *features]
test_columns = ['PassengerId', *features]
train_fe[train_columns].to_csv('../data/train_fe.csv', index=False)
test_fe[test_columns].to_csv('../data/test_fe.csv', index=False)

print('Saved: train_fe.csv, test_fe.csv')
print(f'Train: {train_fe.shape[0]} rows, {len(features)} features')
print(f'Test: {test_fe.shape[0]} rows, {len(features)} features')