In [1]:
# ======================================================
# STACKING ENSEMBLE → CatBoost + Logistic Regression + SVM
# ======================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# =====================
# Load Datasets
# =====================
raw_train = pd.read_csv("/kaggle/input/travel-behavior-insights/train.csv")
raw_test  = pd.read_csv("/kaggle/input/travel-behavior-insights/test.csv")

proc_train = pd.read_csv("/kaggle/input/preprocess-travel/train_processed.csv")
proc_test  = pd.read_csv("/kaggle/input/preprocess-travel/test_processed.csv")

TARGET = "spend_category"
IDCOL = "trip_id"

In [2]:
# =====================
# Split RAW for CatBoost
# =====================
raw_train = raw_train.dropna(subset=[TARGET]).reset_index(drop=True)

y_raw = raw_train[TARGET]
X_raw = raw_train.drop(columns=[TARGET])
X_raw_test = raw_test.copy()

cat_cols = X_raw.select_dtypes(include='object').columns.tolist()
num_cols = X_raw.select_dtypes(include=['int64','float64']).columns.tolist()

# Fill missing categorical
for col in cat_cols:
    X_raw[col] = X_raw[col].astype(str).replace(["", " ", "nan"], "Unknown")
    X_raw_test[col] = X_raw_test[col].astype(str).replace(["", " ", "nan"], "Unknown")

# Fill missing numeric
for col in num_cols:
    med = X_raw[col].median()
    X_raw[col] = X_raw[col].fillna(med)
    X_raw_test[col] = X_raw_test[col].fillna(med)

Xr_train, Xr_val, yr_train, yr_val = train_test_split(
    X_raw, y_raw, test_size=0.2, stratify=y_raw, random_state=42
)

In [3]:
# =====================
# Split PROCESSED for LR + SVM
# =====================
Xp = proc_train.drop(columns=[TARGET, IDCOL])
yp = proc_train[TARGET]
Xp_test = proc_test.drop(columns=[IDCOL])

Xp_train, Xp_val, yp_train, yp_val = train_test_split(
    Xp, yp, test_size=0.2, stratify=yp, random_state=42
)

In [4]:
# =====================
# Train Individual Models
# =====================

# CatBoost
cat = CatBoostClassifier(
    iterations=1000, learning_rate=0.04, depth=8,
    loss_function="MultiClass",
    eval_metric="TotalF1",
    cat_features=cat_cols,
    verbose=False
)
cat.fit(Xr_train, yr_train)
cat_val_pred = cat.predict_proba(Xr_val)

# Logistic Regression
lr = LogisticRegression(max_iter=2000, class_weight='balanced', multi_class='multinomial')
lr.fit(Xp_train, yp_train)
lr_val_pred = lr.predict_proba(Xp_val)

# SVM
svm = SVC(kernel="rbf", C=3, class_weight="balanced", probability=True)
svm.fit(Xp_train, yp_train)
svm_val_pred = svm.predict_proba(Xp_val)

In [5]:
# =====================
# STACKING
# =====================

# Align rows → only use LR/SVM validation rows that match raw validation index
stack_val = pd.DataFrame(
    np.hstack([cat_val_pred, lr_val_pred, svm_val_pred]),
    index=yp_val.index
)

stacker = LogisticRegression(max_iter=2000, class_weight='balanced', multi_class='multinomial')
stacker.fit(stack_val, yp_val)

stack_val_pred = stacker.predict(stack_val)
stack_f1 = f1_score(yp_val, stack_val_pred, average="macro")

print(f"\n Stacked Ensemble Validation F1: {stack_f1:.4f}")


 Stacked Ensemble Validation F1: 0.7030


In [6]:
# =====================
# FINAL SUBMISSION
# =====================

# Retrain CatBoost on FULL RAW TRAIN
cat.fit(X_raw, y_raw)
cat_test_pred = cat.predict_proba(X_raw_test)

# Predict LR and SVM on FULL PROCESSED TRAIN
lr.fit(Xp, yp)
lr_test_pred = lr.predict_proba(Xp_test)

svm.fit(Xp, yp)
svm_test_pred = svm.predict_proba(Xp_test)

# Combine test meta features
stack_test = np.hstack([cat_test_pred, lr_test_pred, svm_test_pred])

# Final prediction
final_preds = stacker.predict(stack_test)

submission = pd.DataFrame({
    IDCOL: raw_test[IDCOL],
    TARGET: final_preds
})

save_path = "/kaggle/working/submission.csv"
submission.to_csv(save_path, index=False)

print("\n Final Stacked Submission saved:", save_path)
submission.head()


 Final Stacked Submission saved: /kaggle/working/submission.csv


Unnamed: 0,trip_id,spend_category
0,tour_id8gzpck76,2.0
1,tour_idow1zxkou,0.0
2,tour_idue7esfqz,0.0
3,tour_idnj3mjzpb,0.0
4,tour_ida3us5yk2,0.0
