In [1]:
#!/usr/bin/env python3
"""
CRITEO ATTRIBUTION MODELING - LIGHTGBM ONLY
===========================================

LightGBM: Fast, memory-efficient gradient boosting.
Production favorite, handles large datasets perfectly.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
import lightgbm as lgb
import time
import json
import warnings
warnings.filterwarnings('ignore')

print("CRITEO ATTRIBUTION MODELING - LIGHTGBM")
print("=" * 65)

# Load COMPLETE dataset (ALL ROWS)
print("\nLoading COMPLETE data...")
try:
    df = pd.read_csv('E:\\PROJECT_F\\F\\criteo_production_ready_data.csv')  # ALL ROWS
    print(f"Loaded {len(df):,} impressions")
except Exception as e:
    print(f"Error: {e}")
    exit(1)

# Basic info
print(f"\nDataset Overview:")
print(f"  Total impressions: {len(df):,}")
print(f"  Attribution rate: {df['attribution'].mean():.1%}")
print(f"  Challenge: Highly imbalanced dataset")

# Feature selection (production-ready)
print(f"\nFeature Engineering...")
features = [
    'campaign', 'cost', 'cpo', 'click',
    'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat8', 'cat9'  # Exclude cat7
]

X = df[features].copy()
y = df['attribution'].copy()

print(f"  Selected {len(features)} features")
#print(f"  Excluded cat7: {df['cat7'].nunique():,} unique values (too high)")

# Simple feature engineering
campaign_perf = df.groupby('campaign')['attribution'].mean()
X['campaign_perf'] = X['campaign'].map(campaign_perf).fillna(df['attribution'].mean())
X['cost_quartile'] = pd.qcut(X['cost'], q=4, labels=[1,2,3,4]).astype(int)

print(f"  Added campaign performance and cost quartiles")
print(f"  Total features: {X.shape[1]}")

# Encode categoricals for LightGBM
categorical_cols = ['campaign'] + [col for col in X.columns if col.startswith('cat')]
X_encoded = X.copy()
for col in categorical_cols:
    if col in X_encoded.columns:
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))

# Split data (stratified for evaluation)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

print(f"  Data split: {X_train.shape} train, {X_test.shape} test")

# LIGHTGBM (Fast & Memory Efficient)
print(f"\nLIGHTGBM (Fast Gradient Boosting)")
print("-" * 65)

lgb_model = lgb.LGBMClassifier(
    n_estimators=100,        # Balanced for full dataset
    random_state=42,
    verbose=-1,
    class_weight='balanced',
    max_depth=8,
    learning_rate=0.1,
    num_leaves=31,
    n_jobs=-1,
    force_col_wise=True      # Memory optimization
)

print("Training LightGBM...")
start_time = time.time()

# Train
lgb_model.fit(X_train, y_train)

# Predictions
y_pred = lgb_model.predict(X_test)
y_prob = lgb_model.predict_proba(X_test)[:, 1]

training_time = time.time() - start_time

# Calculate metrics
roc_auc = roc_auc_score(y_test, y_prob)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Feature importance (top 10)
feature_importance = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

# Results
print(f"\nRESULTS - LIGHTGBM (COMPLETE DATASET)")
print("-" * 65)
print(f"ROC-AUC:        {roc_auc:.3f}")
print(f"Precision:      {precision:.3f} ({precision:.1%})")
print(f"Recall:         {recall:.3f} ({recall:.1%})")
print(f"F1-Score:       {f1:.3f}")
print(f"Training Time:  {training_time:.1f} seconds")

# --------------------------------------------------
# CONFUSION MATRIX (CUSTOM FORMAT)
# --------------------------------------------------
total = tp + tn + fp + fn
correct = tp + tn
accuracy = correct / total

print("\nConfusion Matrix:")
print(f"True Positives:  {tp:,}")
print(f"False Positives: {fp:,}")
print(f"True Negatives:  {tn:,}")
print(f"False Negatives: {fn:,}")
print(f"Correct:         {correct:,} ({accuracy:.1%})")


print(f"\nTOP 10 FEATURE IMPORTANCE:")
print("-" * 35)
for _, row in feature_importance.iterrows():
    print(f"{row['feature']:<20} {row['importance']:.1f}")

print(f"\nFiles ready for next cell (model saved below)...")


CRITEO ATTRIBUTION MODELING - LIGHTGBM

Loading COMPLETE data...
Loaded 16,468,027 impressions

Dataset Overview:
  Total impressions: 16,468,027
  Attribution rate: 2.7%
  Challenge: Highly imbalanced dataset

Feature Engineering...
  Selected 12 features
  Added campaign performance and cost quartiles
  Total features: 14
  Data split: (13174421, 14) train, (3293606, 14) test

LIGHTGBM (Fast Gradient Boosting)
-----------------------------------------------------------------
Training LightGBM...

RESULTS - LIGHTGBM (COMPLETE DATASET)
-----------------------------------------------------------------
ROC-AUC:        0.949
Precision:      0.136 (13.6%)
Recall:         0.911 (91.1%)
F1-Score:       0.236
Training Time:  36.4 seconds

Confusion Matrix:
True Positives:  80,608
False Positives: 513,230
True Negatives:  2,691,891
False Negatives: 7,877
Correct:         2,772,499 (84.2%)

TOP 10 FEATURE IMPORTANCE:
-----------------------------------
cat1                 408.0
cat3           

In [2]:
# =============================================================================
# SAVE TRAINED LIGHTGBM MODEL FOR PRODUCTION
# =============================================================================

import pickle
import joblib
import lightgbm as lgb

print("SAVING YOUR TRAINED LIGHTGBM MODEL...")
print("=" * 50)

# 1. Save as pickle
with open('lightgbm_trained_model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)
print("✅ Saved: lightgbm_trained_model.pkl")

# 2. Save LightGBM native booster (FASTEST)
booster = lgb_model.booster_
booster.save_model('lightgbm_trained_model.txt')
print("✅ Saved: lightgbm_trained_model.txt (native FAST)")

# 3. Model info
model_info = {
    'features': X_encoded.columns.tolist(),
    'target': 'attribution',
    'trained_on_rows': len(df),
    'roc_auc': float(roc_auc),
    'precision': float(precision),
    'recall': float(recall)
}

import json
with open('lightgbm_model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)
print("✅ Saved: lightgbm_model_info.json")

print("\nPRODUCTION READY!")
print("Files created:")
print("  lightgbm_trained_model.pkl")
print("  lightgbm_trained_model.txt (native)")
print("  lightgbm_model_info.json")
print(f"Model: ROC-AUC {roc_auc:.3f} on {len(df):,} rows")


SAVING YOUR TRAINED LIGHTGBM MODEL...
✅ Saved: lightgbm_trained_model.pkl
✅ Saved: lightgbm_trained_model.txt (native FAST)
✅ Saved: lightgbm_model_info.json

PRODUCTION READY!
Files created:
  lightgbm_trained_model.pkl
  lightgbm_trained_model.txt (native)
  lightgbm_model_info.json
Model: ROC-AUC 0.949 on 16,468,027 rows
