In [None]:
#!/usr/bin/env python3
"""
CRITEO ATTRIBUTION MODELING - XGBOOST ONLY
==========================================

XGBoost: State-of-the-art gradient boosting
Competition winner, excellent for structured data, production favorite.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
import xgboost as xgb
import time
import json
import warnings
warnings.filterwarnings('ignore')

print("CRITEO ATTRIBUTION MODELING - XGBOOST")
print("=" * 65)

# Load COMPLETE dataset (ALL ROWS)
print("\nLoading COMPLETE data...")
try:
    df = pd.read_csv('pcb_dataset_final.csv')  # ALL ROWS
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    # Only a missing/unreadable file or a malformed CSV is an expected
    # failure here; any other exception is a bug and should surface with its
    # traceback instead of being reduced to a one-line message.
    print(f"Error: {e}")
    # `exit()` is a helper the `site` module injects for interactive shells
    # (and notebook kernels rebind it), so raise SystemExit directly.
    raise SystemExit(1)
print(f"Loaded {len(df):,} impressions")

# Basic info: row count and share of positive labels in the `attribution`
# column (its mean is printed as a percentage).
print(f"\nDataset Overview:")
print(f"  Total impressions: {len(df):,}")
print(f"  Attribution rate: {df['attribution'].mean():.1%}")
print(f"  Challenge: Highly imbalanced dataset")

# Feature selection (production-ready)
print(f"\nFeature Engineering...")
features = [
    'campaign', 'cost', 'cpo', 'click',
    'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat8', 'cat9'  # Exclude cat7
]

X = df[features].copy()
y = df['attribution'].copy()

print(f"  Selected {len(features)} features")
print(f"  Excluded cat7: {df['cat7'].nunique():,} unique values (too high)")

# Decide train/test membership BEFORE building any feature that is derived
# from the label. Splitting row positions (stratified on y, same seed and test
# fraction as the final split) lets the label-based statistics below be fitted
# on training rows only.
train_pos, test_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=y
)

# Simple feature engineering
# campaign_perf is a target encoding: the mean attribution per campaign.
# It must be computed from training rows only; computing it over the whole
# frame would leak held-out labels into a model input and inflate the test
# metrics. Campaigns absent from the training rows fall back to the training
# attribution rate.
train_labels = df[['campaign', 'attribution']].iloc[train_pos]
campaign_perf = train_labels.groupby('campaign')['attribution'].mean()
X['campaign_perf'] = X['campaign'].map(campaign_perf).fillna(train_labels['attribution'].mean())
# Cost quartiles use no label information, so they are bucketed over all rows.
X['cost_quartile'] = pd.qcut(X['cost'], q=4, labels=[1,2,3,4]).astype(int)

print(f"  Added campaign performance and cost quartiles")
print(f"  Total features: {X.shape[1]}")

# Prepare data - encode categoricals for XGBoost
# 'campaign' plus every column whose name starts with 'cat' is mapped to
# integer codes. The encoder is fitted on all rows so train and test share one
# code table; it only looks at feature values, never at the label.
categorical_cols = ['campaign'] + [col for col in X.columns if col.startswith('cat')]
X_encoded = X.copy()
for col in categorical_cols:
    X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col].astype(str))

# Split data (stratified for balance) using the positions chosen above
X_train, X_test = X_encoded.iloc[train_pos], X_encoded.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

print(f"  Data split: {X_train.shape} train, {X_test.shape} test")

# XGBOOST (State-of-the-art Gradient Boosting)
print(f"\nXGBOOST (State-of-the-art Gradient Boosting)")
print("-" * 65)

# Class weight for the positive class: negatives per positive, taken from the
# TRAINING labels only so that no held-out label influences the model.
# max(..., 1) guards the division when the training split has no positives.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

xgb_model = xgb.XGBClassifier(
    n_estimators=100,  # Balanced for full dataset
    random_state=42,
    eval_metric='logloss',
    verbosity=0,
    scale_pos_weight=n_negative / max(n_positive, 1),  # Handle imbalance
    max_depth=8,
    learning_rate=0.1,
    n_jobs=-1
)

print("Training XGBoost...")
start_time = time.time()

# Train on encoded data. The eval_set is passed for logloss monitoring only:
# no early-stopping option is set on the estimator or on fit().
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# Stop the clock as soon as fitting returns, so the reported training time
# does not include scoring the test set below.
training_time = time.time() - start_time

# Predictions: hard labels and positive-class probabilities for the test rows
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

# Calculate metrics
# ROC-AUC is scored on the predicted probabilities; the remaining metrics are
# scored on the hard predictions, with zero_division=0 so an undefined ratio
# is reported as 0.
roc_auc = roc_auc_score(y_test, y_prob)
precision, recall, f1 = (
    scorer(y_test, y_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

# Confusion matrix, flattened row by row: (tn, fp, fn, tp)
confusion = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = confusion.ravel()

# Feature importance (top 10), highest importance first
importance_table = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': xgb_model.feature_importances_
})
ranked_importance = importance_table.sort_values('importance', ascending=False)
feature_importance = ranked_importance.head(10)

# Results
# Each section is assembled as a list of lines and emitted with one print call
# (newline-separated), which produces the same console text as one print per
# line.
headline_lines = [
    "\nRESULTS - XGBOOST (COMPLETE DATASET)",
    "-" * 65,
    f"ROC-AUC:        {roc_auc:.3f}",
    f"Precision:      {precision:.3f} ({precision:.1%})",
    f"Recall:         {recall:.3f} ({recall:.1%})",
    f"F1-Score:       {f1:.3f}",
    f"Training Time:  {training_time:.1f} seconds",
]
print(*headline_lines, sep="\n")

# Accuracy is derived from the confusion-matrix cells.
n_correct = tp + tn
n_scored = tp + tn + fp + fn
confusion_lines = [
    "\nConfusion Matrix:",
    f"True Positives:  {tp:,}",
    f"False Positives: {fp:,}",
    f"True Negatives:  {tn:,}",
    f"False Negatives: {fn:,}",
    f"Correct:         {n_correct:,} ({n_correct / n_scored:.1%})",
]
print(*confusion_lines, sep="\n")

print("\nTOP 10 FEATURE IMPORTANCE:")
print("-" * 35)
for feat_name, feat_weight in zip(feature_importance['feature'],
                                  feature_importance['importance']):
    print(f"{feat_name:<20} {feat_weight:.3f}")

rationale_lines = [
    "\nWHY XGBOOST IS INDUSTRY STANDARD:",
    "  Competition winner (Kaggle champion)",
    "  Gradient boosting with regularization",
    "  Handles imbalanced data (scale_pos_weight)",
    "  Parallel training (n_jobs=-1)",
    "  Built-in early stopping and validation",
]
print(*rationale_lines, sep="\n")

# Save results
# Three artefacts are written to the working directory: a one-row metrics CSV,
# the top-feature importance CSV, and a JSON project summary.
results_path = 'xgboost_complete_results.csv'
importance_path = 'xgboost_feature_importance.csv'
summary_path = 'xgboost_complete_summary.json'

# Key order is the column order of the CSV.
result_row = {
    'Model': 'XGBoost (Complete Dataset)',
    'ROC-AUC': roc_auc,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Training_Time': training_time,
}
result_row.update({'TP': tp, 'FP': fp, 'TN': tn, 'FN': fn})
result_row['Dataset_Size'] = len(df)
results_df = pd.DataFrame([result_row])
results_df.to_csv(results_path, index=False)

feature_importance.to_csv(importance_path, index=False)

# Project summary: the metrics are stored as pre-formatted strings.
rows_text = f"{len(df):,}"
auc_text = f"{roc_auc:.3f}"
seconds_text = f"{training_time:.1f}"

dataset_section = {
    'size': f"{rows_text} impressions",
    'attribution_rate': f"{y.mean():.1%}",
    'features': X.shape[1]
}
performance_section = {
    'roc_auc': auc_text,
    'precision': f"{precision:.1%}",
    'recall': f"{recall:.1%}",
    'training_time': f"{seconds_text}s"
}
achievements = [
    f"Trained on COMPLETE dataset ({rows_text} rows)",
    f"ROC-AUC: {auc_text}",
    f"Fast execution: {seconds_text} seconds (parallelized)",
    "Feature importance analysis included",
    "State-of-the-art gradient boosting"
]
summary = {
    'project_title': 'Criteo Attribution Modeling - XGBoost (Complete Dataset)',
    'model': 'XGBoost',
    'dataset': dataset_section,
    'performance': performance_section,
    'top_features': feature_importance.head(5).to_dict('records'),
    'key_achievements': achievements
}

with open(summary_path, 'w') as summary_file:
    summary_file.write(json.dumps(summary, indent=2))

print("\nFiles saved:")
for saved_path in (results_path, importance_path, summary_path):
    print(f"  {saved_path}")

# Closing recap: a headline banner followed by a short project blurb. The
# values that appear in both sections are formatted once and reused.
banner_rule = "=" * 65
rows_text = f"{len(df):,}"
auc_text = f"{roc_auc:.3f}"

recap_lines = [
    "\nXGBOOST COMPLETE!",
    banner_rule,
    f"Dataset: {rows_text} COMPLETE impressions",
    f"ROC-AUC: {auc_text}",
    f"Precision: {precision:.1%}",
    f"Recall: {recall:.1%}",
    f"Training Time: {training_time:.1f}s",
    banner_rule,
]
print(*recap_lines, sep="\n")

project_lines = [
    "\nFOR YOUR PROJECT:",
    "  Problem: Predict advertising attribution success",
    f"  Data: {rows_text} COMPLETE Criteo impressions ({y.mean():.1%} success rate)",
    "  Model: XGBoost (state-of-the-art gradient boosting)",
    f"  ROC-AUC: {auc_text}",
    "  Value: Competition-winning performance, production scalable",
]
print(*project_lines, sep="\n")

CRITEO ATTRIBUTION MODELING - XGBOOST

Loading COMPLETE data...
Loaded 16,468,027 impressions

Dataset Overview:
  Total impressions: 16,468,027
  Attribution rate: 2.7%
  Challenge: Highly imbalanced dataset

Feature Engineering...
  Selected 12 features
  Excluded cat7: 57,196 unique values (too high)
  Added campaign performance and cost quartiles
  Total features: 14
  Data split: (13174421, 14) train, (3293606, 14) test

XGBOOST (State-of-the-art Gradient Boosting)
-----------------------------------------------------------------
Training XGBoost...


In [None]:
# =============================================================================
# SAVE TRAINED XGBOOST MODEL FOR PRODUCTION
# =============================================================================
#
# Persists the fitted classifier (pickle and XGBoost-native JSON) together with
# the preprocessing state needed to score new, unlabeled impressions, then
# prints a loader/predictor snippet that consumes exactly those files.
# Relies on names created by the training cell: xgb_model, df, X, X_encoded,
# y_train, campaign_perf, categorical_cols, roc_auc, precision, recall.

import json
import pickle
import joblib

print("SAVING YOUR TRAINED XGBOOST MODEL...")
print("=" * 50)

# 1. Save the trained model (xgb_model from previous cell)
with open('xgboost_trained_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)
print("✅ Saved: xgboost_trained_model.pkl")

# 2. Alternative: XGBoost native format (faster)
xgb_model.save_model('xgboost_trained_model.json')
print("✅ Saved: xgboost_trained_model.json (native FAST)")

# 3. Save feature names and preprocessing info
# The model was fitted on engineered and integer-encoded columns, so the
# mappings that produced them must be stored next to it; re-deriving them from
# a new batch would yield different codes, buckets and rates.
engineered_cols = ('campaign_perf', 'cost_quartile')


def _label_codes(column):
    """Return {str(value): code} with codes assigned in sorted-string order.

    This is the ordering LabelEncoder produces when fitted on the column cast
    to str, which is how the training cell encoded its categoricals.
    """
    labels = sorted(set(column.drop_duplicates().astype(str)))
    return {label: code for code, label in enumerate(labels)}


model_info = {
    # Column order the model was fitted with (raw + engineered).
    'features': X_encoded.columns.tolist(),
    # Columns that must be present in incoming data.
    'raw_features': [c for c in X_encoded.columns if c not in engineered_cols],
    'target': 'attribution',
    'trained_on_rows': len(df),
    'roc_auc': float(roc_auc),
    'precision': float(precision),
    'recall': float(recall),
    # Per-campaign attribution rate used for the campaign_perf feature; keys
    # are stringified because JSON object keys are always strings.
    'campaign_perf': {str(k): float(v) for k, v in campaign_perf.items()},
    # Prior for campaigns that have no stored rate.
    'campaign_perf_default': float(y_train.mean()),
    # Inner quartile boundaries of `cost`; bucket i covers (edge[i-1], edge[i]],
    # the right-closed intervals pd.qcut(q=4) uses.
    'cost_quartile_edges': [float(q) for q in X['cost'].quantile([0.25, 0.5, 0.75])],
    # Integer code table for every label-encoded column.
    'category_codes': {col: _label_codes(X[col]) for col in categorical_cols},
}

with open('xgboost_model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)
print("✅ Saved: xgboost_model_info.json")

# 4. PRODUCTION PREDICTION FUNCTION (Ready to copy!)
print("\n" + "="*60)
print("PRODUCTION PREDICTION CODE (Copy this for deployment):")
print("="*60)

# The snippet needs no label column in the incoming data and applies only the
# stored training-time mappings from xgboost_model_info.json.
pred_code = '''
# === PRODUCTION USE ===
import json
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb

# Load model (only unpickle files you produced yourself: pickle can run code)
with open("xgboost_trained_model.pkl", "rb") as f:
    model = pickle.load(f)
# OR FASTER: model = xgb.XGBClassifier(); model.load_model("xgboost_trained_model.json")

# Load feature list + preprocessing fitted at training time
with open("xgboost_model_info.json") as f:
    info = json.load(f)

def predict_new_data(new_df):
    """Predict attribution probability for NEW impressions (no label needed)"""
    X_new = new_df[info["raw_features"]].copy()

    # Engineered features, rebuilt from the stored TRAINING statistics
    X_new["campaign_perf"] = (
        new_df["campaign"].astype(str)
        .map(info["campaign_perf"])
        .fillna(info["campaign_perf_default"])
    )
    X_new["cost_quartile"] = np.searchsorted(
        info["cost_quartile_edges"], new_df["cost"].to_numpy(), side="left"
    ) + 1

    # Encode categoricals with the training-time code tables; values never
    # seen in training become NaN, which XGBoost treats as missing
    for col, codes in info["category_codes"].items():
        X_new[col] = X_new[col].astype(str).map(codes)

    # Predict with the same column order as training
    probs = model.predict_proba(X_new[info["features"]])[:, 1]
    return probs

# Example:
# new_ads = pd.read_csv("new_campaign_data.csv")
# predictions = predict_new_data(new_ads)
# print(f"Avg attribution probability: {predictions.mean():.3f}")
'''

print(pred_code)
print("="*60)

print(f"\n🎉 MODEL SAVED SUCCESSFULLY!")
print(f"Files created:")
print(f"  xgboost_trained_model.pkl")
print(f"  xgboost_trained_model.json (native)")
print(f"  xgboost_model_info.json")
print(f"Model trained on: {len(df):,} rows | ROC-AUC: {roc_auc:.3f}")
