In [1]:
# =================================================
# OOF STACKING: CatBoost + Logistic Regression
# =================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

# Load RAW data
train_df = pd.read_csv("/kaggle/input/travel-behavior-insights/train.csv")
test_df  = pd.read_csv("/kaggle/input/travel-behavior-insights/test.csv")

TARGET = "spend_category"
IDCOL = "trip_id"

In [2]:
train_df = train_df.dropna(subset=[TARGET]).reset_index(drop=True)
y = train_df[TARGET]
X = train_df.drop(columns=[TARGET])
X_test = test_df.copy()

cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()

# Fill missing values (same as your good version)
for col in cat_cols:
    X[col] = X[col].astype(str).replace(["", " ", "nan"], "Unknown")
    X_test[col] = X_test[col].astype(str).replace(["", " ", "nan"], "Unknown")
for col in num_cols:
    med = X[col].median()
    X[col] = X[col].fillna(med)
    X_test[col] = X_test[col].fillna(med)

In [3]:
# ============================
# OOF Setup
# ============================
FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_probs = np.zeros((len(X), len(y.unique())))
test_probs = np.zeros((len(X_test), len(y.unique())))

# ============================
# K-fold CatBoost training
# ============================
for fold, (tr_i, va_i) in enumerate(skf.split(X, y)):
    print(f"\n===== Fold {fold+1}/{FOLDS} =====")

    X_tr, X_va = X.iloc[tr_i], X.iloc[va_i]
    y_tr, y_va = y.iloc[tr_i], y.iloc[va_i]

    model = CatBoostClassifier(
        iterations=1500,
        learning_rate=0.04,
        depth=8,
        eval_metric="TotalF1",
        loss_function="MultiClass",
        cat_features=cat_cols,
        random_seed=fold,
        verbose=200
    )
    
    model.fit(X_tr, y_tr, eval_set=(X_va, y_va))

    # OOF Predictions
    oof_probs[va_i] = model.predict_proba(X_va)
    test_probs += model.predict_proba(X_test) / FOLDS


===== Fold 1/5 =====
0:	learn: 0.7278088	test: 0.7135737	best: 0.7135737 (0)	total: 162ms	remaining: 4m 3s
200:	learn: 0.7812964	test: 0.7457695	best: 0.7462513 (193)	total: 34.9s	remaining: 3m 45s
400:	learn: 0.8119873	test: 0.7483053	best: 0.7483053 (400)	total: 1m 9s	remaining: 3m 10s
600:	learn: 0.8381841	test: 0.7507795	best: 0.7526223 (557)	total: 1m 44s	remaining: 2m 36s
800:	learn: 0.8575902	test: 0.7505927	best: 0.7527481 (765)	total: 2m 19s	remaining: 2m 1s
1000:	learn: 0.8767032	test: 0.7498370	best: 0.7531768 (923)	total: 2m 53s	remaining: 1m 26s
1200:	learn: 0.8944805	test: 0.7515766	best: 0.7531768 (923)	total: 3m 28s	remaining: 51.8s
1400:	learn: 0.9103364	test: 0.7485250	best: 0.7531768 (923)	total: 4m 2s	remaining: 17.2s
1499:	learn: 0.9171696	test: 0.7518924	best: 0.7531768 (923)	total: 4m 19s	remaining: 0us

bestTest = 0.7531768468
bestIteration = 923

Shrink model to first 924 iterations.

===== Fold 2/5 =====
0:	learn: 0.7274185	test: 0.7238045	best: 0.7238045 (0)

In [4]:
# ============================
# Level-2 Model (Logistic Regression)
# ============================
meta = LogisticRegression(
    max_iter=3000, class_weight='balanced', multi_class='multinomial'
)
meta.fit(oof_probs, y)

stack_oof_preds = meta.predict(oof_probs)
stack_f1 = f1_score(y, stack_oof_preds, average='macro')

print("\n Final STACKING Macro-F1:", round(stack_f1, 4))


 Final STACKING Macro-F1: 0.6966


In [5]:
# ============================
# Final test prediction
# ============================
final_preds = meta.predict(test_probs)

submission = pd.DataFrame({
    IDCOL: X_test[IDCOL],
    TARGET: final_preds
})

path = "/kaggle/working/submission.csv"
submission.to_csv(path, index=False)

print("\n Submission saved:", path)
submission.head()


 Submission saved: /kaggle/working/submission.csv


Unnamed: 0,trip_id,spend_category
0,tour_id8gzpck76,2.0
1,tour_idow1zxkou,0.0
2,tour_idue7esfqz,0.0
3,tour_idnj3mjzpb,0.0
4,tour_ida3us5yk2,0.0
