In [15]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [27]:
# Load the training split; the path is relative to the notebook's working directory.
train_df = pd.read_csv("../dataset/train.csv")

In [28]:
# Per-column count of missing values in the raw training frame.
train_df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

### Multi-stage imputation strategy for the `HomePlanet` feature
(The rules below are based on patterns observed during EDA.)

In [29]:
def impute_homeplanet(df, verbose=True):
    """
    Multi-stage imputation strategy for HomePlanet feature.

    Each stage only touches rows that are still missing after the previous one:

    1. Group    - copy the most common known HomePlanet among passengers that
                  share the same PassengerId prefix (the part before '_').
    2. Deck     - cabin deck G -> Earth; A/B/C -> Europa; E -> Mars;
                  deck F with Destination 'PSO J318.5-22' -> Earth.
    3. Dest.    - Destination '55 Cancri e' -> Europa.
    4. Model    - RandomForestClassifier trained on every row whose HomePlanet
                  is known at that point (original values plus stage 1-3
                  fills); a VIP passenger predicted as Earth is switched to
                  the more probable of Europa/Mars.
    5. Fallback - overall most common HomePlanet.

    Parameters:
    -----------
    df : pandas DataFrame
        The dataset with HomePlanet missing values. Must contain the columns
        PassengerId, HomePlanet, Cabin, Destination, CryoSleep, VIP and Age.
        The input frame is not modified.
    verbose : bool, default True
        When True, print the imputation summary table.

    Returns:
    --------
    df : pandas DataFrame
        Copy of the input with HomePlanet imputed; columns are unchanged.
        If no HomePlanet value is known at all and no rule applies, the
        affected rows are left missing because there is nothing to learn from.
    imputation_stats : dict
        Number of values filled per stage (plain Python ints). The key
        'stage3_vip_rule' is kept for backward compatibility although the
        stage applies a destination rule only.
    """

    # Work on a copy so the caller's frame is never mutated
    df = df.copy()

    total_missing = int(df['HomePlanet'].isna().sum())

    # Track imputation statistics
    stats = {
        'total_missing': total_missing,
        'stage1_group': 0,
        'stage2_deck_rules': 0,
        'stage3_vip_rule': 0,
        'stage4_model': 0,
        'stage5_fallback': 0
    }

    # Derived features are kept as local Series (sharing df's index) instead
    # of temporary columns, so same-named input columns cannot be clobbered.
    cabin_parts = df['Cabin'].str.split('/')   # Cabin looks like "deck/num/side"
    deck = cabin_parts.str[0]
    side = cabin_parts.str[2]
    group_id = df['PassengerId'].str.split('_').str[0]

    # ============================================================
    # STAGE 1: Group-Based Imputation (Highest Confidence)
    # ============================================================
    known = df['HomePlanet'].notna()
    if known.any() and not known.all():
        # Most common known planet per group; mode() is sorted, so ties
        # resolve to the alphabetically first planet.
        group_mode = (
            df.loc[known, 'HomePlanet']
            .groupby(group_id[known])
            .agg(lambda planets: planets.mode().iloc[0])
        )
        candidate = group_id.map(group_mode)
        stage1 = ~known & candidate.notna()
        # Positional assignment avoids any index re-alignment
        df.loc[stage1, 'HomePlanet'] = candidate[stage1].to_numpy()
        stats['stage1_group'] = int(stage1.sum())

    # ============================================================
    # STAGE 2: Deck-Based Rules (Strong Patterns)
    # ============================================================
    # The conditions are mutually exclusive (each row has one deck), so a
    # single snapshot of the missing mask serves every rule.
    missing = df['HomePlanet'].isna()
    deck_rules = (
        (deck == 'G', 'Earth'),
        (deck.isin(['A', 'B', 'C']), 'Europa'),
        # Deck F is ambiguous on its own; only the PSO destination is
        # treated as decisive (per the author's EDA notes).
        ((deck == 'F') & (df['Destination'] == 'PSO J318.5-22'), 'Earth'),
        (deck == 'E', 'Mars'),
    )
    for condition, planet in deck_rules:
        rule_mask = missing & condition
        df.loc[rule_mask, 'HomePlanet'] = planet
        stats['stage2_deck_rules'] += int(rule_mask.sum())

    # ============================================================
    # STAGE 3: Destination Rule
    # ============================================================
    missing = df['HomePlanet'].isna()
    cancri = missing & (df['Destination'] == '55 Cancri e')
    df.loc[cancri, 'HomePlanet'] = 'Europa'
    stats['stage3_vip_rule'] = int(cancri.sum())

    # ============================================================
    # STAGE 4: Model-Based Imputation (Remaining Cases)
    # ============================================================
    known = df['HomePlanet'].notna()
    to_predict = ~known

    # A model needs both something to predict and something to learn from
    if to_predict.any() and known.any():
        categorical_cols = ['Cabin_Deck', 'Cabin_Side', 'Destination']

        # Column order matters: it is the feature order seen by the forest
        features = pd.DataFrame({
            'Cabin_Deck': deck.fillna('Unknown'),
            'Cabin_Side': side.fillna('Unknown'),
            'Destination': df['Destination'].fillna('Unknown'),
            'CryoSleep': df['CryoSleep'].astype('boolean').fillna(False).astype(int),
            'VIP': df['VIP'].astype('boolean').fillna(False).astype(int),
            'Age': df['Age'],
        })

        # Encode on all rows so train and predict rows share one code table
        for col in categorical_cols:
            features[col] = LabelEncoder().fit_transform(features[col])

        # Median is taken from the training rows only
        age_median = features.loc[known, 'Age'].median()
        features['Age'] = features['Age'].fillna(age_median)

        X_train = features[known]
        y_train = df.loc[known, 'HomePlanet']
        X_test = features[to_predict]

        rf = RandomForestClassifier(n_estimators=100, random_state=42,
                                    max_depth=10, min_samples_split=20)
        rf.fit(X_train, y_train)
        predictions = rf.predict(X_test)

        # VIP rule-out: a VIP passenger is not allowed to be predicted as Earth.
        # Missing VIP counts as non-VIP (it was filled with False above).
        is_vip = features.loc[to_predict, 'VIP'].to_numpy() == 1
        conflicts = is_vip & (predictions == 'Earth')
        classes = list(rf.classes_)
        # Only re-assign when the model knows both alternatives
        if conflicts.any() and 'Europa' in classes and 'Mars' in classes:
            probas = rf.predict_proba(X_test)
            europa_p = probas[:, classes.index('Europa')]
            mars_p = probas[:, classes.index('Mars')]
            # Ties go to Mars (Europa must be strictly more probable)
            predictions[conflicts] = np.where(
                europa_p[conflicts] > mars_p[conflicts], 'Europa', 'Mars'
            )

        df.loc[to_predict, 'HomePlanet'] = predictions
        stats['stage4_model'] = int(len(predictions))

    # ============================================================
    # STAGE 5: Fallback (should be 0 or very few)
    # ============================================================
    still_missing = df['HomePlanet'].isna()
    if still_missing.any():
        modes = df['HomePlanet'].mode()
        # mode() is empty when every value is missing; nothing to fall back on
        if not modes.empty:
            df.loc[still_missing, 'HomePlanet'] = modes.iloc[0]
            stats['stage5_fallback'] = int(still_missing.sum())

    # ============================================================
    # Summary
    # ============================================================
    if verbose:
        def pct(count):
            # Guard against 0/0 when the input had no missing values
            return count / total_missing * 100 if total_missing else 0.0

        print("\n" + "="*60)
        print("IMPUTATION SUMMARY")
        print("="*60)
        print(f"Total missing values: {stats['total_missing']}")
        print(f"Stage 1 (Group-based): {stats['stage1_group']} ({pct(stats['stage1_group']):.1f}%)")
        print(f"Stage 2 (Deck rules): {stats['stage2_deck_rules']} ({pct(stats['stage2_deck_rules']):.1f}%)")
        print(f"Stage 3 (VIP/Dest rules): {stats['stage3_vip_rule']} ({pct(stats['stage3_vip_rule']):.1f}%)")
        print(f"Stage 4 (Model-based): {stats['stage4_model']} ({pct(stats['stage4_model']):.1f}%)")
        print(f"Stage 5 (Fallback): {stats['stage5_fallback']} ({pct(stats['stage5_fallback']):.1f}%)")
        print(f"\nFinal missing values: {df['HomePlanet'].isna().sum()}")

    return df, stats

In [30]:
# Run the imputation on the training frame; returns the imputed frame and the per-stage fill counts.
df_train_imputed, imputation_stats = impute_homeplanet(train_df)


IMPUTATION SUMMARY
Total missing values: 201
Stage 1 (Group-based): 90 (44.8%)
Stage 2 (Deck rules): 61 (30.3%)
Stage 3 (VIP/Dest rules): 7 (3.5%)
Stage 4 (Model-based): 43 (21.4%)
Stage 5 (Fallback): 0 (0.0%)

Final missing values: 0


In [19]:
# Preview the first three rows of the raw training frame.
train_df.iloc[:3]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
