In [1]:
# ======================================================
#  MAX POWER STACKING ENSEMBLE for Spend Category
# Models: CatBoost + GMM + LGBM + XGB + MLP â†’ Logistic Meta
# ======================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

# Additional models
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier


# =============================
# Load Data
# =============================
train_raw = pd.read_csv("/kaggle/input/travel-behavior-insights/train.csv")
test_raw = pd.read_csv("/kaggle/input/travel-behavior-insights/test.csv")

train_proc = pd.read_csv("/kaggle/input/preprocess-travel/train_processed.csv")
test_proc  = pd.read_csv("/kaggle/input/preprocess-travel/test_processed.csv")

TARGET = "spend_category"
IDCOL = "trip_id"

# Drop rows where target missing
train_raw = train_raw.dropna(subset=[TARGET]).reset_index(drop=True)

y = train_raw[TARGET]


In [2]:
# =============================
#  CatBoost feature setup
# =============================
X_cat = train_raw.drop(columns=[TARGET])
X_test_cat = test_raw.copy()

cat_cols = X_cat.select_dtypes(include='object').columns.tolist()
num_cols = X_cat.select_dtypes(include=['int64','float64']).columns.tolist()

# Handle categorical NaNs
for col in cat_cols:
    X_cat[col] = X_cat[col].fillna("Unknown")
    X_test_cat[col] = X_test_cat[col].fillna("Unknown")

# Handle numeric NaNs
for col in num_cols:
    median = X_cat[col].median()
    X_cat[col] = X_cat[col].fillna(median)
    X_test_cat[col] = X_test_cat[col].fillna(median)

In [3]:
# =============================
#  Processed data for other models
# =============================
X_num = train_proc.drop(columns=[TARGET, IDCOL])
X_test_num = test_proc.drop(columns=[IDCOL])

scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)
X_test_num = scaler.transform(X_test_num)

In [4]:
# =============================
#  GMM Clustering on processed data
# =============================
best_k = 6
gmm = GaussianMixture(n_components=best_k, random_state=42)
gmm.fit(X_num)

train_gmm = gmm.predict_proba(X_num)
test_gmm  = gmm.predict_proba(X_test_num)

In [5]:

# =============================
# K-Fold Stacking
# =============================
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = []
test_preds = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_cat, y)):
    print(f"\n Fold {fold+1}/5")

    # CATBOOST
    model_cb = CatBoostClassifier(
        iterations=1200,
        learning_rate=0.04,
        depth=8,
        cat_features=cat_cols,
        loss_function="MultiClass",
        eval_metric="TotalF1",
        random_seed=42,
        verbose=False
    )
    
    model_cb.fit(X_cat.loc[tr_idx], y.loc[tr_idx])
    
    # Out-of-fold probs
    val_cb = model_cb.predict_proba(X_cat.loc[val_idx])
    
    # Other Models input: numeric + GMM
    X_fold_train = np.hstack([X_num[tr_idx], train_gmm[tr_idx]])
    X_fold_val = np.hstack([X_num[val_idx], train_gmm[val_idx]])
    X_test_stack = np.hstack([X_test_num, test_gmm])
    
    # LightGBM
    model_lgb = LGBMClassifier(random_state=42)
    model_lgb.fit(X_fold_train, y.loc[tr_idx])
    val_lgb = model_lgb.predict_proba(X_fold_val)
    
    # XGBoost
    model_xgb = XGBClassifier(
        objective="multi:softprob",
        learning_rate=0.05,
        max_depth=8,
        eval_metric="mlogloss",
        random_state=42
    )
    model_xgb.fit(X_fold_train, y.loc[tr_idx])
    val_xgb = model_xgb.predict_proba(X_fold_val)

    # MLP
    model_mlp = MLPClassifier(hidden_layer_sizes=(256,128),
                              max_iter=200,
                              verbose=False,
                              random_state=42)
    model_mlp.fit(X_fold_train, y.loc[tr_idx])
    val_mlp = model_mlp.predict_proba(X_fold_val)

    # STACK OOF
    fold_meta = np.hstack([
        val_cb, val_lgb, val_xgb, val_mlp
    ])
    oof_preds.append((val_idx, fold_meta))

    # test preds stack
    test_preds.append(np.hstack([
        model_cb.predict_proba(X_test_cat),
        model_lgb.predict_proba(X_test_stack),
        model_xgb.predict_proba(X_test_stack),
        model_mlp.predict_proba(X_test_stack)
    ]))


 Fold 1/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004562 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1011
[LightGBM] [Info] Number of data points in the train set: 10096, number of used features: 58
[LightGBM] [Info] Start training from score -0.703502
[LightGBM] [Info] Start training from score -0.944009
[LightGBM] [Info] Start training from score -2.153428

 Fold 2/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1008
[LightGBM] [Info] Number of data points in the train set: 10096, number of used features: 57
[LightGBM] [Info] Start training from score -0.703502
[LightGBM] [Info] Start training from score -0.943754
[L

In [6]:
# =============================
#  Build meta-train matrix
# =============================
oof_matrix = np.zeros((len(train_raw), fold_meta.shape[1]))
for idx, preds in oof_preds:
    oof_matrix[idx] = preds


# =============================
#  Logistic Regression Meta Model
# =============================
meta = LogisticRegression(
    max_iter=5000,
    class_weight='balanced',
    multi_class='multinomial'
)
meta.fit(oof_matrix, y)

oof_final = meta.predict(oof_matrix)
stack_f1 = f1_score(y, oof_final, average="macro")

print("\n FINAL STACKING OOF Macro-F1:", round(stack_f1, 4))


 FINAL STACKING OOF Macro-F1: 0.6987


In [7]:

# =============================
# Final Test Predictions
# =============================
test_stack_matrix = np.mean(test_preds, axis=0)
final_test_pred = meta.predict(test_stack_matrix)

submission = pd.DataFrame({
    IDCOL: test_raw[IDCOL],
    TARGET: final_test_pred
})

save_path = "/kaggle/working/submission.csv"
submission.to_csv(save_path, index=False)

print("\n Submission saved to:", save_path)
submission.head()


 Submission saved to: /kaggle/working/submission.csv


Unnamed: 0,trip_id,spend_category
0,tour_id8gzpck76,2.0
1,tour_idow1zxkou,0.0
2,tour_idue7esfqz,0.0
3,tour_idnj3mjzpb,0.0
4,tour_ida3us5yk2,0.0
