## Main Objective: 
The task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly.

In [6]:
import pandas as pd
import numpy as np

In [7]:
train = pd.read_csv("../dataset/train.csv")
test = pd.read_csv("../dataset/test.csv")

In [11]:
# Storing IDs for submission
test_ids = test['PassengerId'].copy()

In [None]:
# Combine for consistent preprocessing
train['is_train'] = 1
test['is_train'] = 0
test['Transported'] = np.nan

In [14]:
df = pd.concat([train, test], axis=0, ignore_index=True)

In [16]:
missing_before = df.isnull().sum()
missing_before

PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name             294
Transported     4277
is_train           0
dtype: int64

In [21]:
# Imputing categorical features with Mode
categorical_features = ['HomePlanet', 'Destination', 'VIP']
for col in categorical_features:
    mode_value = df[col].mode()[0]
    df.fillna({col: mode_value}, inplace=True)

In [24]:
# Imputing 'CryoSleep' rule-based
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spent = df[spending_cols].sum(axis=1)
# Filling missing spendings with zero!
df[spending_cols] = df[spending_cols].fillna(0)

In [25]:
# Apply rule
mask_zero_spending = (total_spent == 0) & (df['CryoSleep'].isna())
df.loc[mask_zero_spending, 'CryoSleep'] = True
print(f"  - Filled {mask_zero_spending.sum()} CryoSleep values as True (zero spending)")

  - Filled 136 CryoSleep values as True (zero spending)


In [28]:
# Remaining: fill with mode (False)
df.fillna({'CryoSleep': False}, inplace=True)
print(f"  - Filled remaining CryoSleep with mode: False")

  - Filled remaining CryoSleep with mode: False


In [None]:
# Filling 'age' with median
median_age = df['Age'].median()
df.fillna({'Age': median_age}, inplace=True)
print(f"  - Age: filled with median = {median_age:.1f}")

  - Age: filled with median = 27.0


In [31]:
missing_after = df.isnull().sum()
missing_after

PassengerId        0
HomePlanet         0
CryoSleep          0
Cabin            299
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name             294
Transported     4277
is_train           0
dtype: int64

#### Feature Engineering

In [32]:
df['Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0])
df['GroupSize'] = df.groupby('Group')['Group'].transform('count')
df['IsAlone'] = (df['GroupSize'] == 1).astype(int)

In [34]:
# Split Cabin into components
df['Deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if pd.notna(x) else 'Unknown')
df['CabinNum'] = df['Cabin'].apply(lambda x: int(x.split('/')[1]) if pd.notna(x) else -1)
df['Side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if pd.notna(x) else 'Unknown')

In [36]:
# Impute Deck with mode
deck_mode = df['Deck'].mode()[0]
df['Deck'] = df['Deck'].replace('Unknown', deck_mode)

# Impute Side with mode
side_mode = df['Side'].mode()[0]
df['Side'] = df['Side'].replace('Unknown', side_mode)

# Impute CabinNum with median
cabin_num_median = df[df['CabinNum'] != -1]['CabinNum'].median()
df.loc[df['CabinNum'] == -1, 'CabinNum'] = cabin_num_median

In [38]:
df['LastName'] = df['Name'].apply(lambda x: x.split()[-1] if pd.notna(x) else 'Unknown')
df['FamilySize'] = df.groupby('LastName')['LastName'].transform('count')

In [40]:
# Total spending
df['TotalSpent'] = df[spending_cols].sum(axis=1)

In [42]:
# Spending indicators
df['HasSpending'] = (df['TotalSpent'] > 0).astype(int)
df['NumAmenitiesUsed'] = (df[spending_cols] > 0).sum(axis=1)

In [44]:
# Log transform spending (to handle skewness)
for col in spending_cols + ['TotalSpent']:
    df[f'{col}_log'] = np.log1p(df[col])

In [None]:
# Age bins
df['AgeBin'] = pd.cut(df['Age'], 
                      bins=[0, 12, 18, 25, 35, 50, 100], 
                      labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'MiddleAge', 'Senior'])

In [47]:
# CryoSleep × Age (children behave differently when awake)
df['CryoSleep_Age'] = df['CryoSleep'].astype(int) * df['Age']

In [48]:
# CryoSleep × Spending (should be zero if in CryoSleep)
df['CryoSleep_Spending'] = df['CryoSleep'].astype(int) * df['TotalSpent']

In [49]:
# HomePlanet × Destination (travel patterns)
df['Planet_Destination'] = df['HomePlanet'] + '_to_' + df['Destination']

In [51]:
# Deck × Side (spatial location)
df['Deck_Side'] = df['Deck'] + '_' + df['Side']

In [53]:
# VIP × Spending (VIPs might spend differently)
df['VIP_Spending'] = df['VIP'].astype(int) * df['TotalSpent']

In [56]:
# Spending per amenity diversity
df['SpendingPerAmenity'] = df['TotalSpent'] / (df['NumAmenitiesUsed'] + 1)  # +1 to avoid division by zero

In [55]:
# Group spending (might indicate group behavior)
df['GroupTotalSpent'] = df.groupby('Group')['TotalSpent'].transform('sum')
df['GroupAvgSpent'] = df.groupby('Group')['TotalSpent'].transform('mean')

#### Modeling

In [58]:
# Split back into train and test
train_df = df[df['is_train'] == 1].copy()
test_df = df[df['is_train'] == 0].copy()

In [59]:
# Define features to drop
drop_cols = ['PassengerId', 'Name', 'Cabin', 'Transported', 'is_train', 'Group', 'LastName']

In [60]:
# Separate target
y = train_df['Transported'].astype(int)

In [61]:
# Define categorical features for LightGBM
categorical_features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 
    'Deck', 'Side', 'AgeBin', 'IsAlone',
    'Planet_Destination', 'Deck_Side'
]

In [62]:
# Prepare X_train and X_test
X_train = train_df.drop(columns=drop_cols)
X_test = test_df.drop(columns=drop_cols)

In [64]:
# Convert categorical columns to 'category' dtype for LightGBM
for col in categorical_features:
    if col in X_train.columns:
        X_train[col] = X_train[col].astype('category')
        X_test[col] = X_test[col].astype('category')

In [65]:
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_error',  # Direct accuracy optimization
    'boosting_type': 'gbdt',
    'num_leaves': 31,  # Conservative (2^depth - 1, where depth ≈ 5)
    'learning_rate': 0.05,  # Moderate learning rate
    'feature_fraction': 0.8,  # Use 80% of features per tree
    'bagging_fraction': 0.8,  # Use 80% of data per iteration
    'bagging_freq': 5,
    'max_depth': -1,  # No limit (controlled by num_leaves)
    'min_child_samples': 20,  # Minimum samples per leaf
    'reg_alpha': 0.1,  # L1 regularization
    'reg_lambda': 0.1,  # L2 regularization
    'random_state': 42,
    'verbose': -1,
    'n_jobs': -1
}

In [67]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
import warnings

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []
fold_num = 1

In [68]:
for train_idx, val_idx in skf.split(X_train, y):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create LightGBM datasets
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_val = lgb.Dataset(X_val, y_val, categorical_feature=categorical_features, reference=lgb_train)
    
    # Train model
    model = lgb.train(
        lgb_params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0)  # Suppress iteration logs
        ]
    )
    
    # Predict on validation
    y_pred = (model.predict(X_val) > 0.5).astype(int)
    fold_score = accuracy_score(y_val, y_pred)
    cv_scores.append(fold_score)
    
    print(f"  Fold {fold_num}: Accuracy = {fold_score:.4f} (Best iteration: {model.best_iteration})")
    fold_num += 1

print(f"\n{'='*80}")
print(f"CROSS-VALIDATION RESULTS")
print(f"{'='*80}")
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
print(f"Individual Fold Scores: {[f'{s:.4f}' for s in cv_scores]}")

  Fold 1: Accuracy = 0.8177 (Best iteration: 62)
  Fold 2: Accuracy = 0.8166 (Best iteration: 92)
  Fold 3: Accuracy = 0.8189 (Best iteration: 52)
  Fold 4: Accuracy = 0.8234 (Best iteration: 158)
  Fold 5: Accuracy = 0.8032 (Best iteration: 37)

CROSS-VALIDATION RESULTS
Mean CV Accuracy: 0.8159 (+/- 0.0068)
Individual Fold Scores: ['0.8177', '0.8166', '0.8189', '0.8234', '0.8032']


In [69]:
lgb_train_full = lgb.Dataset(X_train, y, categorical_feature=categorical_features)

final_model = lgb.train(
    lgb_params,
    lgb_train_full,
    num_boost_round=1000,
    valid_sets=[lgb_train_full],
    valid_names=['train'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        lgb.log_evaluation(period=100)
    ]
)

print(f"Final model trained with {final_model.best_iteration} iterations")



[100]	train's binary_error: 0.137352
[200]	train's binary_error: 0.102841
[300]	train's binary_error: 0.0773036
[400]	train's binary_error: 0.0592431
[500]	train's binary_error: 0.0422179
[600]	train's binary_error: 0.0300242
[700]	train's binary_error: 0.0222018
[800]	train's binary_error: 0.0146095
[900]	train's binary_error: 0.0104682
[1000]	train's binary_error: 0.00701714
Final model trained with 0 iterations


In [70]:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': final_model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print(feature_importance.head(20).to_string(index=False))

           feature   importance
        TotalSpent 15758.313428
          CabinNum 11348.079281
               Age  5960.618458
         FoodCourt  5344.717273
               Spa  4746.503796
            VRDeck  4603.621712
         CryoSleep  4496.432177
   GroupTotalSpent  4264.989551
      ShoppingMall  4180.772253
        FamilySize  3813.045220
     GroupAvgSpent  3769.719917
         Deck_Side  3665.264496
SpendingPerAmenity  3541.512270
       RoomService  3207.516605
     CryoSleep_Age  3150.739063
              Deck  3090.763811
        HomePlanet  1766.895274
              Side  1415.990321
   RoomService_log  1359.671616
  ShoppingMall_log  1352.337475


In [71]:
y_train_pred = (final_model.predict(X_train) > 0.5).astype(int)
train_accuracy = accuracy_score(y, y_train_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print("\nConfusion Matrix (Train):")
print(confusion_matrix(y, y_train_pred))
print("\nClassification Report (Train):")
print(classification_report(y, y_train_pred, target_names=['Not Transported', 'Transported']))

Training Accuracy: 0.9930

Confusion Matrix (Train):
[[4278   37]
 [  24 4354]]

Classification Report (Train):
                 precision    recall  f1-score   support

Not Transported       0.99      0.99      0.99      4315
    Transported       0.99      0.99      0.99      4378

       accuracy                           0.99      8693
      macro avg       0.99      0.99      0.99      8693
   weighted avg       0.99      0.99      0.99      8693



In [72]:
y_test_pred = (final_model.predict(X_test) > 0.5).astype(int)
y_test_pred_bool = y_test_pred.astype(bool)

print(f"Test predictions generated: {len(y_test_pred)}")
print(f"Predicted Transported: {y_test_pred.sum()} ({y_test_pred.mean()*100:.1f}%)")

Test predictions generated: 4277
Predicted Transported: 2101 (49.1%)


In [73]:
submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Transported': y_test_pred_bool
})

submission.to_csv('submission.csv', index=False)
print("Submission file saved: submission.csv")
print(f"\nFirst 10 predictions:")
print(submission.head(10))

Submission file saved: submission.csv

First 10 predictions:
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True
5     0027_01        False
6     0029_01         True
7     0032_01         True
8     0032_02         True
9     0033_01         True


In [57]:
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,TotalSpent_log,AgeBin,CryoSleep_Age,CryoSleep_Spending,Planet_Destination,Deck_Side,VIP_Spending,GroupTotalSpent,GroupAvgSpent,SpendingPerAmenity
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,MiddleAge,0.0,0.0,Europa_to_TRAPPIST-1e,B_P,0.0,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,6.602588,YoungAdult,0.0,0.0,Earth_to_TRAPPIST-1e,F_S,0.0,736.0,736.0,122.666667
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,9.248021,Senior,0.0,0.0,Europa_to_TRAPPIST-1e,A_S,10383.0,15559.0,7779.5,2076.6


In [13]:
train.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,is_train
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1


In [10]:
test.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,is_train,Transported
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,0,
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,0,
