In [1]:
#!/usr/bin/env python3
"""
CRITEO ATTRIBUTION MODELING - RANDOM FOREST ONLY
================================================

Random Forest: Robust ensemble method
Production-ready, handles feature interactions, provides feature importance.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import time
import json
import warnings
warnings.filterwarnings('ignore')

print("CRITEO ATTRIBUTION MODELING - RANDOM FOREST")
print("=" * 65)

# Load COMPLETE dataset (ALL ROWS)
print("\nLoading COMPLETE data...")
try:
    df = pd.read_csv('pcb_dataset_final.csv')  # ALL ROWS
    print(f"Loaded {len(df):,} impressions")
except Exception as e:
    print(f"Error: {e}")
    exit(1)

# Basic info
print(f"\nDataset Overview:")
print(f"  Total impressions: {len(df):,}")
print(f"  Attribution rate: {df['attribution'].mean():.1%}")
print(f"  Challenge: Highly imbalanced dataset")

# Feature selection (production-ready)
print(f"\nFeature Engineering...")
features = [
    'campaign', 'cost', 'cpo', 'click',
    'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat8', 'cat9'  # Exclude cat7
]

X = df[features].copy()
y = df['attribution'].copy()

print(f"  Selected {len(features)} features")
print(f"  Excluded cat7: {df['cat7'].nunique():,} unique values (too high)")

# Simple feature engineering
campaign_perf = df.groupby('campaign')['attribution'].mean()
X['campaign_perf'] = X['campaign'].map(campaign_perf).fillna(df['attribution'].mean())
X['cost_quartile'] = pd.qcut(X['cost'], q=4, labels=[1,2,3,4]).astype(int)

print(f"  Added campaign performance and cost quartiles")
print(f"  Total features: {X.shape[1]}")

# Prepare data - encode categoricals for Random Forest
categorical_cols = ['campaign'] + [col for col in X.columns if col.startswith('cat')]
X_encoded = X.copy()
for col in categorical_cols:
    if col in X_encoded.columns:
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))

# Split data (stratified for balance)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

print(f"  Data split: {X_train.shape} train, {X_test.shape} test")

# RANDOM FOREST (Robust Ensemble)
print(f"\nRANDOM FOREST (Robust Ensemble)")
print("-" * 65)

rf = RandomForestClassifier(
    n_estimators=100,  # Balanced for full dataset
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
    max_depth=12
)

print("Training Random Forest...")
start_time = time.time()

# Train on encoded data
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

training_time = time.time() - start_time

# Calculate metrics
roc_auc = roc_auc_score(y_test, y_prob)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Feature importance (top 10)
feature_importance = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False).head(10)

# Results
print(f"\nRESULTS - RANDOM FOREST (COMPLETE DATASET)")
print("-" * 65)
print(f"ROC-AUC:        {roc_auc:.3f}")
print(f"Precision:      {precision:.3f} ({precision:.1%})")
print(f"Recall:         {recall:.3f} ({recall:.1%})")
print(f"F1-Score:       {f1:.3f}")
print(f"Training Time:  {training_time:.1f} seconds")

print(f"\nConfusion Matrix:")
print(f"True Positives:  {tp:,}")
print(f"False Positives: {fp:,}")
print(f"True Negatives:  {tn:,}")
print(f"False Negatives: {fn:,}")
print(f"Correct:         {tp + tn:,} ({(tp + tn) / (tp + tn + fp + fn):.1%})")

print(f"\nTOP 10 FEATURE IMPORTANCE:")
print("-" * 35)
for _, row in feature_importance.iterrows():
    print(f"{row['feature']:<20} {row['importance']:.3f}")

print(f"\nWHY RANDOM FOREST IS INDUSTRY STANDARD:")
print(f"  Robust ensemble: 100 decision trees")
print(f"  Handles interactions between features")
print(f"  Resistant to overfitting (max_depth=12)")
print(f"  Built-in feature importance")
print(f"  Parallel training (n_jobs=-1)")

# Save results
results_df = pd.DataFrame([{
    'Model': 'Random Forest (Complete Dataset)',
    'ROC-AUC': roc_auc,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Training_Time': training_time,
    'TP': tp, 'FP': fp, 'TN': tn, 'FN': fn,
    'Dataset_Size': len(df)
}])
results_df.to_csv('random_forest_complete_results.csv', index=False)

feature_importance.to_csv('random_forest_feature_importance.csv', index=False)

# Project summary
summary = {
    'project_title': 'Criteo Attribution Modeling - Random Forest (Complete Dataset)',
    'model': 'Random Forest',
    'dataset': {
        'size': f"{len(df):,} impressions",
        'attribution_rate': f"{y.mean():.1%}",
        'features': X.shape[1]
    },
    'performance': {
        'roc_auc': f"{roc_auc:.3f}",
        'precision': f"{precision:.1%}",
        'recall': f"{recall:.1%}",
        'training_time': f"{training_time:.1f}s"
    },
    'top_features': feature_importance.head(5).to_dict('records'),
    'key_achievements': [
        f"Trained on COMPLETE dataset ({len(df):,} rows)",
        f"ROC-AUC: {roc_auc:.3f}",
        f"Fast execution: {training_time:.1f} seconds (parallelized)",
        "Feature importance analysis included",
        "Production-ready ensemble model"
    ]
}

with open('random_forest_complete_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\nFiles saved:")
print(f"  random_forest_complete_results.csv")
print(f"  random_forest_feature_importance.csv")
print(f"  random_forest_complete_summary.json")

print(f"\nRANDOM FOREST COMPLETE!")
print("=" * 65)
print(f"Dataset: {len(df):,} COMPLETE impressions")
print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Precision: {precision:.1%}")
print(f"Recall: {recall:.1%}")
print(f"Training Time: {training_time:.1f}s")
print("=" * 65)

print(f"\nFOR YOUR PROJECT:")
print(f"  Problem: Predict advertising attribution success")
print(f"  Data: {len(df):,} COMPLETE Criteo impressions ({y.mean():.1%} success rate)")
print(f"  Model: Random Forest (robust ensemble)")
print(f"  ROC-AUC: {roc_auc:.3f}")
print(f"  Value: Handles complex interactions, production-ready")


CRITEO ATTRIBUTION MODELING - RANDOM FOREST

Loading COMPLETE data...
Loaded 16,468,027 impressions

Dataset Overview:
  Total impressions: 16,468,027
  Attribution rate: 2.7%
  Challenge: Highly imbalanced dataset

Feature Engineering...
  Selected 12 features
  Excluded cat7: 57,196 unique values (too high)
  Added campaign performance and cost quartiles
  Total features: 14
  Data split: (13174421, 14) train, (3293606, 14) test

RANDOM FOREST (Robust Ensemble)
-----------------------------------------------------------------
Training Random Forest...

RESULTS - RANDOM FOREST (COMPLETE DATASET)
-----------------------------------------------------------------
ROC-AUC:        0.948
Precision:      0.132 (13.2%)
Recall:         0.912 (91.2%)
F1-Score:       0.231
Training Time:  399.3 seconds

Confusion Matrix:
True Positives:  80,709
False Positives: 528,560
True Negatives:  2,676,561
False Negatives: 7,776
Correct:         2,757,270 (83.7%)

TOP 10 FEATURE IMPORTANCE:
---------------