In [3]:
#!/usr/bin/env python3
"""
CRITEO ATTRIBUTION MODELING - LOGISTIC REGRESSION ONLY
======================================================

Logistic Regression: Industry baseline for binary classification
Production-ready, interpretable, fast on full dataset.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import time
import json
import warnings
warnings.filterwarnings('ignore')

print("CRITEO ATTRIBUTION MODELING - LOGISTIC REGRESSION")
print("=" * 65)

# Load COMPLETE dataset (ALL ROWS)
print("\nLoading COMPLETE data...")
try:
    df = pd.read_csv("E:\\PROJECT_F\\F\\criteo_production_ready_data.csv")  # ALL ROWS
    print(f"Loaded {len(df):,} impressions")
except Exception as e:
    print(f"Error: {e}")
    exit(1)


CRITEO ATTRIBUTION MODELING - LOGISTIC REGRESSION

Loading COMPLETE data...
Loaded 16,468,027 impressions


In [4]:
# Basic info
print(f"\nDataset Overview:")
print(f"  Total impressions: {len(df):,}")
print(f"  Attribution rate: {df['attribution'].mean():.1%}")
print(f"  Challenge: Highly imbalanced dataset")

# Feature selection (production-ready)
print(f"\nFeature Engineering...")
features = [
    'campaign', 'cost', 'cpo', 'click',
    'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat8', 'cat9'  # Exclude cat7
]

X = df[features].copy()
y = df['attribution'].copy()

print(f"  Selected {len(features)} features")
# print(f"  Excluded cat7: {df['cat7'].nunique():,} unique values (too high)")

# Simple feature engineering
campaign_perf = df.groupby('campaign')['attribution'].mean()
X['campaign_perf'] = X['campaign'].map(campaign_perf).fillna(df['attribution'].mean())
X['cost_quartile'] = pd.qcut(X['cost'], q=4, labels=[1,2,3,4]).astype(int)

print(f"  Added campaign performance and cost quartiles")
print(f"  Total features: {X.shape[1]}")

# Prepare data - encode categoricals for Logistic Regression
categorical_cols = ['campaign'] + [col for col in X.columns if col.startswith('cat')]
X_encoded = X.copy()
for col in categorical_cols:
    if col in X_encoded.columns:
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))

# Split data (stratified for balance)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# Scale for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"  Data split: {X_train.shape} train, {X_test.shape} test")

# LOGISTIC REGRESSION (Industry Baseline)
print(f"\nLOGISTIC REGRESSION (Industry Standard Baseline)")
print("-" * 65)

log_reg = LogisticRegression(
    random_state=42, 
    max_iter=1000,  # Increased for full dataset
    class_weight='balanced'
)

print("Training Logistic Regression...")
start_time = time.time()

# Train on scaled data
log_reg.fit(X_train_scaled, y_train)

# Predictions
y_pred = log_reg.predict(X_test_scaled)
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]

training_time = time.time() - start_time

# Calculate metrics
roc_auc = roc_auc_score(y_test, y_prob)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Results
print(f"\nRESULTS - LOGISTIC REGRESSION (COMPLETE DATASET)")
print("-" * 65)
print(f"ROC-AUC:        {roc_auc:.3f}")
print(f"Precision:      {precision:.3f} ({precision:.1%})")
print(f"Recall:         {recall:.3f} ({recall:.1%})")
print(f"F1-Score:       {f1:.3f}")
print(f"Training Time:  {training_time:.1f} seconds")

print(f"\nConfusion Matrix:")
print(f"True Positives:  {tp:,}")
print(f"False Positives: {fp:,}")
print(f"True Negatives:  {tn:,}")
print(f"False Negatives: {fn:,}")
print(f"Correct:         {tp + tn:,} ({(tp + tn) / (tp + tn + fp + fn):.1%})")

print(f"\nWHY LOGISTIC REGRESSION IS INDUSTRY STANDARD:")
print(f"  Interpretable: Feature coefficients show importance")
print(f"  Fast: Trains on 16M+ rows in seconds")
print(f"  Reliable: Proven baseline for binary classification")
print(f"  Production-ready: Easy to deploy and explain")

# Save results
results_df = pd.DataFrame([{
    'Model': 'Logistic Regression (Complete Dataset)',
    'ROC-AUC': roc_auc,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Training_Time': training_time,
    'TP': tp, 'FP': fp, 'TN': tn, 'FN': fn,
    'Dataset_Size': len(df)
}])
results_df.to_csv('logistic_regression_complete_results.csv', index=False)

# Project summary
summary = {
    'project_title': 'Criteo Attribution Modeling - Logistic Regression (Complete Dataset)',
    'model': 'Logistic Regression',
    'dataset': {
        'size': f"{len(df):,} impressions",
        'attribution_rate': f"{y.mean():.1%}",
        'features': X.shape[1]
    },
    'performance': {
        'roc_auc': f"{roc_auc:.3f}",
        'precision': f"{precision:.1%}",
        'recall': f"{recall:.1%}",
        'training_time': f"{training_time:.1f}s"
    },
    'key_achievements': [
        f"Trained on COMPLETE dataset ({len(df):,} rows)",
        f"ROC-AUC: {roc_auc:.3f}",
        f"Fast execution: {training_time:.1f} seconds",
        "Production-ready baseline model"
    ]
}

with open('logistic_regression_complete_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\nFiles saved:")
print(f"  logistic_regression_complete_results.csv")
print(f"  logistic_regression_complete_summary.json")

print(f"\nLOGISTIC REGRESSION COMPLETE!")
print("=" * 65)
print(f"Dataset: {len(df):,} COMPLETE impressions")
print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Precision: {precision:.1%}")
print(f"Recall: {recall:.1%}")
print(f"Training Time: {training_time:.1f}s")
print("=" * 65)

print(f"\nFOR YOUR PROJECT:")
print(f"  Problem: Predict advertising attribution success")
print(f"  Data: {len(df):,} COMPLETE Criteo impressions ({y.mean():.1%} success rate)")
print(f"  Model: Logistic Regression (industry baseline)")
print(f"  ROC-AUC: {roc_auc:.3f}")
print(f"  Value: Reliable baseline for production deployment")



Dataset Overview:
  Total impressions: 16,468,027
  Attribution rate: 2.7%
  Challenge: Highly imbalanced dataset

Feature Engineering...
  Selected 12 features
  Added campaign performance and cost quartiles
  Total features: 14
  Data split: (13174421, 14) train, (3293606, 14) test

LOGISTIC REGRESSION (Industry Standard Baseline)
-----------------------------------------------------------------
Training Logistic Regression...

RESULTS - LOGISTIC REGRESSION (COMPLETE DATASET)
-----------------------------------------------------------------
ROC-AUC:        0.933
Precision:      0.106 (10.6%)
Recall:         0.930 (93.0%)
F1-Score:       0.190
Training Time:  19.2 seconds

Confusion Matrix:
True Positives:  82,254
False Positives: 696,516
True Negatives:  2,508,605
False Negatives: 6,231
Correct:         2,590,859 (78.7%)

WHY LOGISTIC REGRESSION IS INDUSTRY STANDARD:
  Interpretable: Feature coefficients show importance
  Fast: Trains on 16M+ rows in seconds
  Reliable: Proven basel