In [1]:
"""
CRITEO ATTRIBUTION MODELING - CATBOOST ONLY
===========================================

CatBoost: Auto-handles categories, excellent out-of-the-box performance.
Trained on COMPLETE dataset.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from catboost import CatBoostClassifier
import time
import json
import warnings
warnings.filterwarnings('ignore')

print("CRITEO ATTRIBUTION MODELING - CATBOOST")
print("=" * 65)

# Load COMPLETE dataset (ALL ROWS)
print("\nLoading COMPLETE data...")
try:
    df = pd.read_csv('E:\\PROJECT_F\\F\\criteo_production_ready_data.csv')  # ALL ROWS
    print(f"Loaded {len(df):,} impressions")
except Exception as e:
    print(f"Error: {e}")
    exit(1)

# Basic info
print(f"\nDataset Overview:")
print(f"  Total impressions: {len(df):,}")
print(f"  Attribution rate: {df['attribution'].mean():.1%}")
print(f"  Challenge: Highly imbalanced dataset")

# Feature selection (production-ready)
print(f"\nFeature Engineering...")
features = [
    'campaign', 'cost', 'cpo', 'click',
    'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat8', 'cat9'  # Exclude cat7
]

X = df[features].copy()
y = df['attribution'].copy()

print(f"  Selected {len(features)} features")
# print(f"  Excluded cat7: {df['cat7'].nunique():,} unique values (too high)")

# Simple feature engineering
campaign_perf = df.groupby('campaign')['attribution'].mean()
X['campaign_perf'] = X['campaign'].map(campaign_perf).fillna(df['attribution'].mean())
X['cost_quartile'] = pd.qcut(X['cost'], q=4, labels=[1,2,3,4]).astype(int)

print(f"  Added campaign performance and cost quartiles")
print(f"  Total features: {X.shape[1]}")

# Split data (stratified for evaluation)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"  Data split: {X_train.shape} train, {X_test.shape} test")

# CATBOOST (Auto-handles Categories - NO ENCODING NEEDED!)
print(f"\nCATBOOST (Auto-handles Categories)")
print("-" * 65)

# CatBoost categorical features indices
cat_features = [i for i, col in enumerate(X.columns) 
                if col in ['campaign', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat8', 'cat9']]

catboost_model = CatBoostClassifier(
    iterations=100,          # Balanced for full dataset
    random_seed=42,
    verbose=False,
    auto_class_weights='Balanced',
    cat_features=cat_features,
    depth=8,
    learning_rate=0.1
)

print("Training CatBoost...")
start_time = time.time()

# Train on TRAIN data
catboost_model.fit(X_train, y_train)

# Predictions on TEST
y_pred = catboost_model.predict(X_test)
y_prob = catboost_model.predict_proba(X_test)[:, 1]

training_time = time.time() - start_time

# Calculate metrics
roc_auc = roc_auc_score(y_test, y_prob)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Feature importance (top 10)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': catboost_model.get_feature_importance()
}).sort_values('importance', ascending=False).head(10)

# Results
print(f"\nRESULTS - CATBOOST (COMPLETE DATASET)")
print("-" * 65)
print(f"ROC-AUC:        {roc_auc:.3f}")
print(f"Precision:      {precision:.3f} ({precision:.1%})")
print(f"Recall:         {recall:.3f} ({recall:.1%})")
print(f"F1-Score:       {f1:.3f}")
print(f"Training Time:  {training_time:.1f} seconds")


tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# --------------------------------------------------
# CONFUSION MATRIX (CUSTOM FORMAT)
# --------------------------------------------------
total = tp + tn + fp + fn
correct = tp + tn
accuracy = correct / total

print("\nConfusion Matrix:")
print(f"True Positives:  {tp:,}")
print(f"False Positives: {fp:,}")
print(f"True Negatives:  {tn:,}")
print(f"False Negatives: {fn:,}")
print(f"Correct:         {correct:,} ({accuracy:.1%})")


print(f"\nTOP 10 FEATURE IMPORTANCE:")
print("-" * 35)
for _, row in feature_importance.iterrows():
    print(f"{row['feature']:<20} {row['importance']:.1f}")

print(f"\nFiles ready for next cell (model saved below)...")


CRITEO ATTRIBUTION MODELING - CATBOOST

Loading COMPLETE data...
Loaded 16,468,027 impressions

Dataset Overview:
  Total impressions: 16,468,027
  Attribution rate: 2.7%
  Challenge: Highly imbalanced dataset

Feature Engineering...
  Selected 12 features
  Added campaign performance and cost quartiles
  Total features: 14
  Data split: (13174421, 14) train, (3293606, 14) test

CATBOOST (Auto-handles Categories)
-----------------------------------------------------------------
Training CatBoost...

RESULTS - CATBOOST (COMPLETE DATASET)
-----------------------------------------------------------------
ROC-AUC:        0.947
Precision:      0.128 (12.8%)
Recall:         0.913 (91.3%)
F1-Score:       0.224
Training Time:  325.2 seconds

Confusion Matrix:
True Positives:  80,821
False Positives: 552,734
True Negatives:  2,652,387
False Negatives: 7,664
Correct:         2,733,208 (83.0%)

TOP 10 FEATURE IMPORTANCE:
-----------------------------------
click                90.6
campaign_perf 

In [2]:
# =============================================================================
# SAVE TRAINED CATBOOST MODEL FOR PRODUCTION
# =============================================================================

import pickle
from catboost import CatBoost

print("SAVING YOUR TRAINED CATBOOST MODEL...")
print("=" * 50)

# 1. Native CatBoost format (FASTEST)
catboost_model.save_model('catboost_trained_model.cbm')
print("✅ Saved: catboost_trained_model.cbm (native FAST)")

# 2. Pickle format
with open('catboost_trained_model.pkl', 'wb') as f:
    pickle.dump(catboost_model, f)
print("✅ Saved: catboost_trained_model.pkl")

# 3. Model info (NO encoders needed!)
model_info = {
    'features': X.columns.tolist(),
    'cat_features_indices': cat_features,
    'target': 'attribution',
    'trained_on_rows': len(df),
    'roc_auc': float(roc_auc),
    'precision': float(precision),
    'recall': float(recall)
}

import json
with open('catboost_model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)
print("✅ Saved: catboost_model_info.json")

print("\nPRODUCTION READY!")
print("Files created:")
print("  catboost_trained_model.cbm")
print("  catboost_trained_model.pkl") 
print("  catboost_model_info.json")
print(f"Model: ROC-AUC {roc_auc:.3f} on {len(df):,} rows")


SAVING YOUR TRAINED CATBOOST MODEL...
✅ Saved: catboost_trained_model.cbm (native FAST)
✅ Saved: catboost_trained_model.pkl
✅ Saved: catboost_model_info.json

PRODUCTION READY!
Files created:
  catboost_trained_model.cbm
  catboost_trained_model.pkl
  catboost_model_info.json
Model: ROC-AUC 0.947 on 16,468,027 rows
