In [2]:
import numpy as np
import pandas as pd

In [3]:
train_df = pd.read_csv("../dataset/train.csv")
test_df = pd.read_csv("../dataset/test.csv")

In [4]:
from sklearn.ensemble import RandomForestRegressor

def preprocess(df):
    df["Group"] = df["PassengerId"].str.split("_").str[0]
    df[["Cabin_Deck", "Cabin_Num", "Cabin_Side"]] = df["Cabin"].str.split("/", expand=True)
    df["Cabin_Num"] = pd.to_numeric(df["Cabin_Num"], errors="coerce")
    df['LastName'] = df['Name'].str.split(" ").str[1]
    return df

def fill_from_group(df, col):
    grp_mode = df.groupby("Group")[col].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    df[col] = df.apply(lambda r: grp_mode[r["Group"]] if pd.isna(r[col]) else r[col], axis=1)
    return df

def impute_rule_based(df):
    # ---- HomePlanet rules ----
    missing = df["HomePlanet"].isna()
    cond = missing & df["VIP"].eq(True)
    df.loc[cond, "HomePlanet"] = "Europa"   # Europa VIP dominates Mars VIP

    cond = missing & df["Cabin_Deck"].isin(["A", "B", "C"])
    df.loc[cond, "HomePlanet"] = "Europa"

    cond = missing & df["Cabin_Deck"].isin(["F", "G"])
    df.loc[cond, "HomePlanet"] = "Earth"

    cond = missing & df["Cabin_Deck"].isin(["D", "E"])
    df.loc[cond, "HomePlanet"] = "Mars"

    cond = missing & df["Destination"].eq("55 Cancri e")
    df.loc[cond, "HomePlanet"] = "Europa"

    cond = missing & df["Destination"].eq("PSO J318.5-22")
    df.loc[cond, "HomePlanet"] = "Earth"

    cond = df["HomePlanet"].isna()
    df.loc[cond, "HomePlanet"] = "Earth"  # global mode


    # ---- CryoSleep rules ----
    amenities = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    total_spending = df[amenities].sum(axis=1)

    cond = df["CryoSleep"].isna() & (total_spending > 0)
    df.loc[cond, "CryoSleep"] = False

    cond = df["CryoSleep"].isna() & (total_spending == 0)
    df.loc[cond, "CryoSleep"] = True

    cond = df["CryoSleep"].isna() & df["Cabin_Deck"].isin(["B", "G"])
    df.loc[cond, "CryoSleep"] = True

    cond = df["CryoSleep"].isna() & df["Cabin_Deck"].isin(["E", "F", "T"])
    df.loc[cond, "CryoSleep"] = False


    # ---- Destination rules ----
    cond = df["Destination"].isna() & df["HomePlanet"].eq("Europa")
    df.loc[cond, "Destination"] = "55 Cancri e"

    cond = df["Destination"].isna() & df["HomePlanet"].eq("Mars")
    df.loc[cond, "Destination"] = "TRAPPIST-1e"

    cond = df["Destination"].isna() & df["HomePlanet"].eq("Earth")
    df.loc[cond, "Destination"] = "TRAPPIST-1e"

    cond = df["Destination"].isna()
    df.loc[cond, "Destination"] = "TRAPPIST-1e"  # global mode


    # ---- VIP rules ----
    cond = df["VIP"].isna() & (total_spending > total_spending.quantile(0.75))
    df.loc[cond, "VIP"] = True
    #mple, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, t
    df['VIP'] = df['VIP'].fillna(False)


    # ---- Cabin Deck/Side/Num ----
    df["Cabin_Deck"] =  df["Cabin_Deck"].fillna(df["Cabin_Deck"].mode()[0])
    df["Cabin_Side"] = df["Cabin_Side"].fillna(df["Cabin_Side"].mode()[0])

    # Cabin_Num → group median → deck median → global
    grp_med = df.groupby("Group")["Cabin_Num"].transform("median")
    df["Cabin_Num"] = df["Cabin_Num"].fillna(grp_med)
    deck_med = df.groupby("Cabin_Deck")["Cabin_Num"].transform("median")
    df["Cabin_Num"] = df["Cabin_Num"].fillna(deck_med)
    df["Cabin_Num"] = df["Cabin_Num"].fillna(df["Cabin_Num"].median())

    return df

def ml_impute_age_and_spending(df):
    amenities = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

    # Set spending = 0 for CryoSleep == True
    for col in amenities:
        df.loc[df["CryoSleep"] == True, col] = df.loc[df["CryoSleep"] == True, col].fillna(0)

    # Train ML model for Age
    features = ["HomePlanet", "Destination", "VIP", "CryoSleep", "Cabin_Deck", "Cabin_Num"]
    df_ml = pd.get_dummies(df[features], drop_first=True)

    train_mask = df["Age"].notna()
    model = RandomForestRegressor(n_estimators=250, random_state=42)
    model.fit(df_ml[train_mask], df.loc[train_mask, "Age"])
    pred = model.predict(df_ml[df["Age"].isna()])
    df.loc[df["Age"].isna(), "Age"] = pred

    # Final fill for spending values (group median > 0)
    grp_med = df.groupby("Group")[amenities].transform("median")
    for col in amenities:
        df[col] = df[col].fillna(grp_med[col])
        df[col] = df[col].fillna(df[col].median())

    return df


In [5]:
def impute_pipeline(df):
    df = preprocess(df)

    # Group-based first
    for col in ["HomePlanet", "Destination", "Cabin_Deck", "Cabin_Side", "CryoSleep", "VIP", "LastName"]:
        df = fill_from_group(df, col)

    df = impute_rule_based(df)
    df = ml_impute_age_and_spending(df)

    return df

In [None]:
train_processed = impute_pipeline(train_df.copy())
test_processed = impute_pipeline(test_df.copy())

In [11]:
def add_engineered_features(df):
    # ---------- 1. Total / Category Spending ----------
    df["TotalSpending"] = df[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)

    df["LuxurySpending"] = df[["FoodCourt", "ShoppingMall", "Spa"]].sum(axis=1)
    df["EntertainmentSpending"] = df["VRDeck"]

    # Spending flags
    df["SpentNothing"] = (df["TotalSpending"] == 0).astype(int)
    df["LuxuryUser"] = (df["LuxurySpending"] > 0).astype(int)

    # ---------- 2. Group Statistics ----------
    # Group size
    group_size = df.groupby("Group")["PassengerId"].transform("count")
    df["GroupSize"] = group_size

    # Group high spender trait
    group_max_spend = df.groupby("Group")["TotalSpending"].transform("max")
    df["GroupMaxSpending"] = group_max_spend

    # CryoSleep ratio in group
    df["GroupCryoRatio"] = df.groupby("Group")["CryoSleep"].transform("mean")

    # Spending difference vs group median
    df["GroupMedianSpending"] = df.groupby("Group")["TotalSpending"].transform("median")
    df["SpendingDiff"] = df["TotalSpending"] - df["GroupMedianSpending"]

    # ---------- 3. Deck rank / cabin location ----------
    deck_order = {"A": 7, "B": 6, "C": 5, "D": 4, "E": 3, "F": 2, "G": 1, "T": 0}
    df["DeckRank"] = df["Cabin_Deck"].map(deck_order)

    # Forward (low Cabin_Num) vs Rear (high Cabin_Num)
    median_cabin_num = df["Cabin_Num"].median()
    df["IsRearCabin"] = (df["Cabin_Num"] >= median_cabin_num).astype(int)
    df["IsPort"] = (df["Cabin_Side"] == "P").astype(int)
    df["IsStar"] = (df["Cabin_Side"] == "S").astype(int)

    # ---------- 4. Age categories ----------
    df["Child"] = (df["Age"] < 12).astype(int)
    df["Teen"] = ((df["Age"] >= 12) & (df["Age"] < 20)).astype(int)
    df["Adult"] = ((df["Age"] >= 20) & (df["Age"] < 60)).astype(int)
    df["Senior"] = (df["Age"] >= 60).astype(int)


    # ---------- 5. CryoSleep interactions ----------
    df["Awake"] = (~df["CryoSleep"]).astype(int)
    df["SpentNothingAwake"] = ((df["TotalSpending"] == 0) & (df["CryoSleep"] == False)).astype(int)
    df["Awake_Luxury"] = ((df["LuxurySpending"] > 0) & (df["CryoSleep"] == False)).astype(int)

    # ---------- 6. Cross-feature: Deck × Destination ----------
    df["Deck_Destination"] = df["Cabin_Deck"] + "_" + df["Destination"]


    # ---------- 7. Group categorical support features ----------
    group_hp = df.groupby("Group")["HomePlanet"].agg(lambda x: x.mode().iloc[0])
    df["GroupHomePlanet"] = df["Group"].map(group_hp)

    group_dest = df.groupby("Group")["Destination"].agg(lambda x: x.mode().iloc[0])
    df["GroupDestination"] = df["Group"].map(group_dest)

    group_vip = df.groupby("Group")["VIP"].transform("max")
    df["GroupHasVIP"] = group_vip.astype(int)

    return df


In [66]:
train_fe = add_engineered_features(train_processed.copy())
test_fe = add_engineered_features(test_processed.copy())

In [68]:
y = train_fe['Transported'].astype(int)   # Convert True/False to 1/0
X_train_cat = train_fe.drop(columns=['Transported', 'PassengerId', 'Name', 'LastName', 'Deck_Destination'])
X_test_cat = test_fe.drop(columns=['PassengerId', 'Name', 'LastName', 'Deck_Destination'])

# Define categorical columns explicitly
categorical_cols = [
    'HomePlanet',
    'CryoSleep',
    'Cabin',
    'Destination',       # <-- include this!
    'VIP',
    'Cabin_Deck',
    'Cabin_Side',
    'GroupHomePlanet',
    'GroupDestination'
]

# Cast to string
for col in categorical_cols:
    if col in X_train_cat.columns:
        X_train_cat[col] = X_train_cat[col].astype(str)
        X_test_cat[col] = X_test_cat[col].astype(str)

# Get indices for CatBoost
cat_feature_indices = [
    X_train_cat.columns.get_loc(col)
    for col in categorical_cols
    if col in X_train_cat.columns
]

In [69]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
from optuna.samplers import TPESampler

In [73]:
catboost_params = {
    'iterations': 3000,
    'learning_rate': 0.03,
    'depth': 7,
    'l2_leaf_reg': 5,
    'loss_function': 'Logloss',
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'verbose': False,
    'early_stopping_rounds': 100,
    'task_type': 'CPU',          # Switch to 'GPU' if available
    'bootstrap_type': 'Bayesian',
    #'subsample': 0.8,
    'rsm': 0.9,
    'border_count': 128,
    'grow_policy': 'SymmetricTree'
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
catboost_cv_scores = []
catboost_models = []

In [74]:
print("Categorical indices:", cat_feature_indices)
print("Categorical columns:", [X_train_cat.columns[i] for i in cat_feature_indices])


Categorical indices: [0, 1, 2, 3, 5, 12, 14, 36, 37]
Categorical columns: ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Cabin_Deck', 'Cabin_Side', 'GroupHomePlanet', 'GroupDestination']


In [75]:
for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_cat, y), 1):
    X_tr, X_val = X_train_cat.iloc[train_idx], X_train_cat.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_feature_indices)
    val_pool = Pool(X_val, y_val, cat_features=cat_feature_indices)

    model = CatBoostClassifier(**catboost_params)
    model.fit(train_pool, eval_set=val_pool, verbose=False)

    y_pred = model.predict(X_val).flatten()
    fold_score = accuracy_score(y_val, y_pred)
    catboost_cv_scores.append(fold_score)
    catboost_models.append(model)

    print(f"Fold {fold_num}: Accuracy = {fold_score:.4f} (Best iteration: {model.best_iteration_})")

print(f"Mean CV Accuracy: {np.mean(catboost_cv_scores):.4f}")

Fold 1: Accuracy = 0.8125 (Best iteration: 161)
Fold 2: Accuracy = 0.8045 (Best iteration: 264)
Fold 3: Accuracy = 0.8177 (Best iteration: 218)
Fold 4: Accuracy = 0.8176 (Best iteration: 319)
Fold 5: Accuracy = 0.7940 (Best iteration: 66)
Mean CV Accuracy: 0.8093


In [76]:
train_pool_full = Pool(X_train_cat, y, cat_features=cat_feature_indices)
test_pool_cat = Pool(X_test_cat, cat_features=cat_feature_indices)

catboost_final_model = CatBoostClassifier(**catboost_params)
catboost_final_model.fit(train_pool_full, verbose=100)

# Training accuracy
y_train_pred_cat = catboost_final_model.predict(X_train_cat).flatten()
catboost_train_accuracy = accuracy_score(y, y_train_pred_cat)
print(f"Train Accuracy: {catboost_train_accuracy:.4f}")

0:	learn: 0.7579662	total: 25.1ms	remaining: 1m 15s
100:	learn: 0.8154837	total: 2.66s	remaining: 1m 16s
200:	learn: 0.8273323	total: 5.19s	remaining: 1m 12s
300:	learn: 0.8424019	total: 7.82s	remaining: 1m 10s
400:	learn: 0.8522949	total: 10.5s	remaining: 1m 8s
500:	learn: 0.8662142	total: 13.1s	remaining: 1m 5s
600:	learn: 0.8763373	total: 16.1s	remaining: 1m 4s
700:	learn: 0.8849649	total: 18.9s	remaining: 1m 2s
800:	learn: 0.8942828	total: 21.5s	remaining: 59s
900:	learn: 0.9016450	total: 24.2s	remaining: 56.3s
1000:	learn: 0.9078569	total: 26.8s	remaining: 53.6s
1100:	learn: 0.9140688	total: 29.5s	remaining: 50.9s
1200:	learn: 0.9203957	total: 32.1s	remaining: 48.1s
1300:	learn: 0.9258024	total: 34.7s	remaining: 45.3s
1400:	learn: 0.9302887	total: 37.4s	remaining: 42.6s
1500:	learn: 0.9362706	total: 40s	remaining: 39.9s
1600:	learn: 0.9414471	total: 42.6s	remaining: 37.2s
1700:	learn: 0.9447832	total: 45.2s	remaining: 34.5s
1800:	learn: 0.9488094	total: 47.8s	remaining: 31.8s
1900

In [77]:
y_test_pred_cat = catboost_final_model.predict(X_test_cat).flatten()
y_test_pred_cat_bool = y_test_pred_cat.astype(bool)

In [78]:
# Build submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test_fe['PassengerId'],
    'Transported': y_test_pred_cat_bool
})
submission.to_csv("submission_catboost_03.csv", index=False)