In [1]:
# ======================================================
# CatBoost Model â†’ Validation + Submission
# ======================================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier

# Load RAW data
train_df = pd.read_csv("/kaggle/input/travel-behavior-insights/train.csv")
test_df  = pd.read_csv("/kaggle/input/travel-behavior-insights/test.csv")

TARGET = "spend_category"
IDCOL = "trip_id"
# Drop rows where target missing
train_df = train_df.dropna(subset=[TARGET]).reset_index(drop=True)

In [2]:
y = train_df[TARGET]
X = train_df.drop(columns=[TARGET])
X_test = test_df.copy()

# Identify categorical + numeric
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
# Fix categorical missing / blank values
for col in cat_cols:
    X[col] = X[col].astype(str).replace(["", " ", "nan"], "Unknown")
    X_test[col] = X_test[col].astype(str).replace(["", " ", "nan"], "Unknown")

# Fix numeric missing
for col in num_cols:
    median_val = X[col].median()
    X[col] = X[col].fillna(median_val)
    X_test[col] = X_test[col].fillna(median_val)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [3]:
# CatBoost Model
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.04,
    depth=8,
    loss_function="MultiClass",
    eval_metric="TotalF1",
    cat_features=cat_cols,
    random_seed=42,
    verbose=False
)

cat_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7c8340c40bd0>

In [4]:
# Validation predictions
val_preds = cat_model.predict(X_val)
f1 = f1_score(y_val, val_preds, average="macro")
print("Validation Macro-F1:", round(f1, 4))

Validation Macro-F1: 0.6865


In [5]:
# ==========================
# FINAL TRAIN ON FULL DATA
# ==========================
cat_model.fit(X, y)
# Predict on test set
test_preds = cat_model.predict(X_test)

# FIX: Convert to 1D
test_preds = test_preds.ravel()

# Build submission file
submission = pd.DataFrame({
    IDCOL: test_df[IDCOL],
    TARGET: test_preds
})

save_path = "/kaggle/working/submission.csv"
submission.to_csv(save_path, index=False)

print("\n Submission saved:", save_path)
submission.head()


 Submission saved: /kaggle/working/submission.csv


Unnamed: 0,trip_id,spend_category
0,tour_id8gzpck76,1.0
1,tour_idow1zxkou,0.0
2,tour_idue7esfqz,0.0
3,tour_idnj3mjzpb,0.0
4,tour_ida3us5yk2,0.0
