# CatBoost Model for TDE Classification

This notebook trains a CatBoost model with Optuna hyperparameter tuning.

**Key Features:**
- Uses the same cross-validation folds as all other models
- Handles class imbalance with `scale_pos_weight`
- Saves OOF and test predictions for ensemble

In [9]:
import os
import re
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
import optuna
from sklearn.metrics import precision_recall_curve
import warnings

warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [10]:
# Configuration
N_OPTUNA_TRIALS = 30  # CatBoost is slower, fewer trials
RANDOM_STATE = 15
MODEL_NAME = 'cat'

# Paths
DATA_DIR = os.path.join('..', 'data', 'processed')
MODEL_DIR = os.path.join('..', 'models')
TRAIN_PATH = os.path.join(DATA_DIR, '2dgp_train_features.parquet')
TEST_PATH = os.path.join(DATA_DIR, '2dgp_test_features.parquet')
FOLDS_PATH = os.path.join(DATA_DIR, 'train_folds.csv')

os.makedirs(MODEL_DIR, exist_ok=True)

In [11]:
# Load data
print("Loading data...")
train = pd.read_parquet(TRAIN_PATH)
test = pd.read_parquet(TEST_PATH)
folds = pd.read_csv(FOLDS_PATH)

# Merge folds with training data
train = train.merge(folds[['object_id', 'kfold']], on='object_id', how='left')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Class distribution: {train['target'].value_counts().to_dict()}")

Loading data...
Train shape: (3043, 291)
Test shape: (7135, 289)
Class distribution: {0: 2895, 1: 148}


In [12]:
# Prepare features
drop_cols = ['object_id', 'target', 'split', 'SpecType', 'kfold']
feature_cols = [c for c in train.columns if c not in drop_cols]

X = train[feature_cols].copy()
y = train['target']
kfold = train['kfold']

X_test = test[feature_cols].copy()
object_ids_test = test['object_id']

# Calculate scale_pos_weight for imbalance
scale_pos_weight = (y == 0).sum() / (y == 1).sum()
print(f"Feature count: {len(feature_cols)}")
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

Feature count: 288
scale_pos_weight: 19.56


In [17]:
def objective(trial):
    params = {
        'iterations': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15),
        'depth': trial.suggest_int('depth', 3, 6),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 10, 40),
        'eval_metric': 'PRAUC',
        'early_stopping_rounds': 50,
        'verbose': 0,
        'allow_writing_files': False,
        'random_seed': RANDOM_STATE
    }
    
    f1_scores = []
    for fold in range(5):
        train_idx = kfold != fold
        val_idx = kfold == fold
        
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]
        
        clf = CatBoostClassifier(**params)
        clf.fit(X_tr, y_tr, eval_set=(X_val, y_val))
        
        preds = clf.predict_proba(X_val)[:, 1]
        prec, rec, _ = precision_recall_curve(y_val, preds)
        f1 = 2 * (prec * rec) / (prec + rec + 1e-9)
        f1_scores.append(np.max(f1))
    
    return np.mean(f1_scores)

In [18]:
# Run Optuna optimization
print(f"Running Optuna with {N_OPTUNA_TRIALS} trials...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_OPTUNA_TRIALS, show_progress_bar=True)

print(f"\nBest F1 Score: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")

Running Optuna with 30 trials...


  0%|          | 0/30 [00:00<?, ?it/s]


Best F1 Score: 0.6081
Best params: {'learning_rate': 0.030962546779134854, 'depth': 4, 'l2_leaf_reg': 0.7418556543816056, 'random_strength': 0.22720577533754555, 'scale_pos_weight': 14.024393649955671}


In [19]:
# Train final model with best params
print("\nTraining final model with best params...")

best_params = study.best_params.copy()
best_params.update({
    'iterations': 3000,
    'eval_metric': 'PRAUC',
    'early_stopping_rounds': 150,
    'verbose': 0,
    'allow_writing_files': False,
    'random_seed': RANDOM_STATE
})

oof_preds = np.zeros(len(y))
test_preds = np.zeros(len(X_test))

for fold in range(5):
    train_idx = kfold != fold
    val_idx = kfold == fold
    
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    clf = CatBoostClassifier(**best_params)
    clf.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    
    oof_preds[val_idx] = clf.predict_proba(X_val)[:, 1]
    test_preds += clf.predict_proba(X_test)[:, 1] / 5
    
    print(f"Fold {fold} complete.")

# Calculate final OOF F1
prec, rec, thresh = precision_recall_curve(y, oof_preds)
f1 = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-9)
best_thresh = thresh[np.argmax(f1)]
print(f"\nOOF F1 Score: {np.max(f1):.4f} at threshold {best_thresh:.4f}")


Training final model with best params...
Fold 0 complete.
Fold 1 complete.
Fold 2 complete.
Fold 3 complete.
Fold 4 complete.

OOF F1 Score: 0.5570 at threshold 0.5666


In [20]:
# Save predictions
oof_df = pd.DataFrame({
    'object_id': train['object_id'],
    'target': y,
    f'pred_{MODEL_NAME}': oof_preds
})
oof_df.to_csv(os.path.join(MODEL_DIR, f'oof_{MODEL_NAME}.csv'), index=False)

test_df = pd.DataFrame({
    'object_id': object_ids_test,
    f'pred_{MODEL_NAME}': test_preds
})
test_df.to_csv(os.path.join(MODEL_DIR, f'preds_{MODEL_NAME}.csv'), index=False)

print(f"\nSaved OOF predictions to: models/oof_{MODEL_NAME}.csv")
print(f"Saved test predictions to: models/preds_{MODEL_NAME}.csv")


Saved OOF predictions to: models/oof_cat.csv
Saved test predictions to: models/preds_cat.csv
