In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, auc, classification_report

import tensorflow as tf

from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score




from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt

from sklearn.metrics import precision_score, recall_score, f1_score



In [2]:
# Load the merged dataset; the path is relative to the notebook's working directory.
merged_csv_path = 'processed_data/merged_df.csv'
df = pd.read_csv(merged_csv_path)

In [None]:
# --- Hold-out split and un-weighted logistic-regression baseline ---

target_column = 'netcontractsigned'
y = df[target_column]
X = df.drop(columns=target_column)

# Stratified 80/20 split so train and test keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features; scaling statistics are learned from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Plain logistic regression with no imbalance handling: this is the reference model.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Column 1 of predict_proba is used as the score for each test row.
y_pred_proba = lr.predict_proba(X_test_scaled)[:, 1]

# ROC-AUC above 0.5 is treated as evidence that the features carry some signal.
auc_score = roc_auc_score(y_test, y_pred_proba)

print("\n=== SIGNAL TEST RESULTS ===")
print(f"AUC-ROC Score: {auc_score:.3f}")
has_signal = auc_score > 0.5
verdict_lines = (
    [
        "✅ GOOD NEWS: Your features contain useful signal!",
        "We can proceed with imbalance fixing strategies.",
    ]
    if has_signal
    else ["❌ BAD NEWS: No signal detected. Features need work first."]
)
for line in verdict_lines:
    print(line)



In [None]:


# Area under the precision-recall curve for the baseline scores.
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auc_pr = auc(recall, precision)

# Mean of the test labels, used here as the reference level for AUC-PR.
random_baseline = y_test.mean()
lift_over_random = auc_pr / random_baseline

print("\n=== DETAILED SIGNAL ANALYSIS ===")
print(
    f"AUC-ROC: {auc_score:.3f}",
    f"AUC-PR: {auc_pr:.3f}",
    f"Random baseline AUC-PR: {random_baseline:.3f}",
    f"Improvement over random: {lift_over_random:.1f}x",
    sep="\n",
)

# Hard predictions at the conventional 0.5 cut-off.
y_pred = (y_pred_proba >= 0.5).astype(int)
print("\nWith 0.5 threshold:")
print(f"Predicted positives: {y_pred.sum()}")
print(f"Actual positives: {y_test.sum()}")

# Label rate among the rows whose score is at or above the 90th percentile.
top_10_percent_threshold = np.percentile(y_pred_proba, 90)
top_10_mask = y_pred_proba >= top_10_percent_threshold
top_10_rate = y_test[top_10_mask].mean()

print("\nTop 10% analysis:")
print(f"Threshold for top 10%: {top_10_percent_threshold:.3f}")
print(f"Conversion rate in top 10%: {top_10_rate:.3f}")
print(f"That's {top_10_rate/y_test.mean():.1f}x better than average!")

#### First Strategy

Class Weights

In [None]:

# Strategy 1: re-fit the logistic regression with a heavier weight on class 1.
print("\n=== TESTING CLASS WEIGHTS ===")

# Weight for class 1 = (1 - mean label) / mean label of the training targets
# (the negative-to-positive ratio, assuming 0/1 labels — TODO confirm).
positive_rate = y_train.mean()
pos_weight = (1 - positive_rate) / positive_rate
print(f"Calculated positive class weight: {pos_weight:.1f}")

lr_weighted = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight={0: 1, 1: pos_weight},
).fit(X_train_scaled, y_train)

# Column 1 of predict_proba is used as the score for each test row.
y_pred_weighted = lr_weighted.predict_proba(X_test_scaled)[:, 1]

# Same two ranking metrics as for the un-weighted baseline.
auc_roc_weighted = roc_auc_score(y_test, y_pred_weighted)
precision_w, recall_w, _ = precision_recall_curve(y_test, y_pred_weighted)
auc_pr_weighted = auc(recall_w, precision_w)

print("\nClass Weights Results:")
print(f"AUC-ROC: {auc_roc_weighted:.3f}")
print(f"AUC-PR: {auc_pr_weighted:.3f}")
print(f"Improvement over baseline: {auc_pr_weighted/auc_pr:.2f}x")

# Precision at a few cut-offs; thresholds that flag nothing are skipped silently.
for thresh in (0.1, 0.2, 0.3):
    pred_at_thresh = (y_pred_weighted >= thresh).astype(int)
    if pred_at_thresh.sum() <= 0:
        continue
    # Mean label among flagged rows == precision at this threshold.
    precision_at_thresh = y_test[pred_at_thresh == 1].mean()
    print(f"At {thresh} threshold: {pred_at_thresh.sum()} predicted positive, {precision_at_thresh:.3f} precision")

NameError: name 'recall' is not defined

Class weights actually made things slightly worse: AUC-PR dropped from 0.174 (unweighted baseline) to 0.158.

In [None]:
# Sweep low decision thresholds over the un-weighted baseline scores and record
# precision / recall / F1 for each threshold that flags at least one row.

print("=== THRESHOLD OPTIMIZATION ===")
print("Using original model (no class weights)")

thresholds = [
    0.01, 0.02, 0.03, 0.04, 0.05,
    0.06, 0.07, 0.08, 0.09, 0.1,
]
results = []

for thresh in thresholds:
    pred_at_thresh = (y_pred_proba >= thresh).astype(int)

    # Nothing flagged at this cut-off: precision is undefined, so skip it.
    if pred_at_thresh.sum() <= 0:
        continue

    # Precision = mean label among flagged rows; recall = flagged positives / all positives.
    precision = y_test[pred_at_thresh == 1].mean()
    recall = (y_test * pred_at_thresh).sum() / y_test.sum()

    # F1 is reported as 0 when either component is 0.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision > 0 and recall > 0) else 0

    results.append(dict(
        threshold=thresh,
        predicted_positive=pred_at_thresh.sum(),
        precision=precision,
        recall=recall,
        f1=f1,
        percent_flagged=pred_at_thresh.sum() / len(y_test) * 100,
    ))

# One summary line per recorded threshold.
for r in results:
    print(f"Threshold {r['threshold']:.2f}: {r['predicted_positive']:4d} flagged ({r['percent_flagged']:4.1f}%), "
          f"Precision {r['precision']:.3f}, Recall {r['recall']:.3f}, F1 {r['f1']:.3f}")

#### Second Strategy

Focal Loss

In [None]:

class FocalLoss(tf.keras.losses.Loss):
    """Binary focal loss for probability outputs (e.g. a sigmoid head).

    Per element the loss is ``alpha_t * (1 - p_t) ** gamma * -log(p_t)``, where
    ``p_t`` is the predicted probability of the true class and ``alpha_t`` is
    ``alpha`` for positive labels and ``1 - alpha`` for negative labels.

    Args:
        alpha: Weight applied to positive examples; negatives get ``1 - alpha``.
        gamma: Exponent on ``(1 - p_t)``; larger values shrink the contribution
            of examples the model already predicts confidently.
        **kwargs: Forwarded to ``tf.keras.losses.Loss`` (e.g. ``name``,
            ``reduction``).
    """

    def __init__(self, alpha=0.25, gamma=2.0, **kwargs):
        super().__init__(**kwargs)
        self.alpha = alpha
        self.gamma = gamma

    def call(self, y_true, y_pred):
        """Return the focal loss averaged over the last axis of ``y_pred``.

        Args:
            y_true: Binary labels with the same number of elements as ``y_pred``.
            y_pred: Predicted probabilities in [0, 1].

        Returns:
            Loss tensor with the last axis of ``y_pred`` averaged out; the base
            class applies the configured reduction on top of it.
        """
        y_pred = tf.convert_to_tensor(y_pred)
        # Labels often arrive as integers and/or as shape (batch,) while the
        # predictions are (batch, 1); align dtype and shape before any arithmetic
        # so the products below stay element-wise.
        y_true = tf.cast(y_true, y_pred.dtype)
        y_true = tf.reshape(y_true, tf.shape(y_pred))

        # Keep probabilities strictly inside (0, 1) so log() stays finite.
        eps = 1e-7
        y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)

        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        alpha_t = y_true * self.alpha + (1 - y_true) * (1 - self.alpha)

        # Element-wise cross-entropy. tf.keras.losses.binary_crossentropy would
        # average over the last axis and return shape (batch,), which then
        # broadcasts against the (batch, 1) factors into a (batch, batch) matrix.
        ce_loss = -tf.math.log(p_t)
        focal_loss = alpha_t * tf.pow(1 - p_t, self.gamma) * ce_loss
        return tf.reduce_mean(focal_loss, axis=-1)

    def get_config(self):
        """Include ``alpha`` and ``gamma`` so the loss can be serialised with a model."""
        config = super().get_config()
        config.update({"alpha": self.alpha, "gamma": self.gamma})
        return config

In [None]:
# NOTE(review): this cell reads `history`, `model`, `X_val` and `y_val`, none of which
# are defined in the visible part of the notebook. Confirm that the cell which builds
# and trains that model exists and runs before this one on a fresh kernel.

# Check what metrics are actually available
print("Available metrics:", list(history.history.keys()))

# Get predictions and binarise them at the chosen decision threshold
val_threshold = 0.07
val_pred_probs = model.predict(X_val)
val_pred_binary = (val_pred_probs > val_threshold).astype(int)

f1 = f1_score(y_val, val_pred_binary)
precision = precision_score(y_val, val_pred_binary)
recall = recall_score(y_val, val_pred_binary)
# Keep the ROC-AUC under its own name. Binding it to `auc` would replace the
# sklearn.metrics `auc` function that other cells call as auc(recall, precision).
val_roc_auc = roc_auc_score(y_val, val_pred_probs)

print(f"F1 Score: {f1:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"AUC: {val_roc_auc:.3f}")

#### Third Strategy

SMOTE-ENN

In [None]:
# Build the combined resampler from an explicit SMOTE step and an explicit
# EditedNearestNeighbours step, both using 3 neighbours.
smote_step = SMOTE(random_state=42, k_neighbors=3)
enn_step = EditedNearestNeighbours(n_neighbors=3)
smote_enn = SMOTEENN(smote=smote_step, enn=enn_step, random_state=42)

# Resample the scaled training data only; the test set is left untouched.
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

resampled_labels = pd.Series(y_train_resampled)
print(f"Before SMOTE-ENN: {X_train_scaled.shape}, {y_train.value_counts().to_dict()}")
print(f"After SMOTE-ENN: {X_train_resampled.shape}, {resampled_labels.value_counts().to_dict()}")

# Per-class counts before and after resampling (sum of labels = number of positives,
# assuming 0/1 labels — TODO confirm).
original_pos = y_train.sum()
original_neg = len(y_train) - original_pos
resampled_pos = resampled_labels.sum()
resampled_neg = len(y_train_resampled) - resampled_pos

print(f"\nPositive samples: {original_pos} → {resampled_pos} (added {resampled_pos - original_pos})")
print(f"Negative samples: {original_neg} → {resampled_neg} (removed {original_neg - resampled_neg})")

# 5. CREATE MODEL
def create_model(input_dim, dropout_rate=0.3):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense 128 -> 64 -> 32 (ReLU), each followed by Dropout, then a
    single sigmoid output unit. Compiled with Adam and binary cross-entropy.

    Args:
        input_dim: Number of input features per sample.
        dropout_rate: Dropout rate applied after every hidden layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # Explicit Input instead of the `input_dim=` layer argument, which newer
        # Keras releases deprecate.
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),
        Dense(32, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        # Metric objects rather than lowercase strings: 'precision' / 'recall' are
        # not resolvable by name on every Keras version, while the classes always
        # are. The explicit names keep the history keys stable.
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.AUC(name='auc'),
        ]
    )

    return model

# 6. TRAIN MODEL WITH SMOTE-ENN DATA
print("\n" + "="*50)
print("TRAINING MODEL WITH SMOTE-ENN")
print("="*50)

# Stop when validation loss has not improved for 15 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

# Halve the learning rate after 10 stagnant epochs, down to a floor of 1e-7.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=10,
    min_lr=1e-7,
    verbose=1
)

# Carve the validation set out with a shuffled, stratified split.
# Keras' `validation_split` takes the LAST fraction of the arrays without shuffling,
# and the resampler output is not shuffled, so that tail can be dominated by one class.
# val_loss would then be a poor signal for early stopping and LR scheduling.
# NOTE(review): this validation data is still resampled; validating on held-out
# real (un-resampled) rows would track test behaviour more closely.
X_res_fit, X_res_val, y_res_fit, y_res_val = train_test_split(
    X_train_resampled,
    y_train_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_train_resampled,
)

# Seed TensorFlow so weight initialisation and dropout are repeatable across runs.
tf.random.set_seed(42)

model_smoteenn = create_model(X_train_resampled.shape[1])

# Train on resampled data; labels are passed as plain arrays so a pandas index
# left over from the split cannot affect how Keras reads them.
history_smoteenn = model_smoteenn.fit(
    X_res_fit, np.asarray(y_res_fit),
    epochs=100,
    batch_size=32,
    validation_data=(X_res_val, np.asarray(y_res_val)),
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# 7. EVALUATE ON ORIGINAL TEST SET
banner = "=" * 50
print("\n" + banner)
print("EVALUATION ON ORIGINAL TEST SET")
print(banner)

# Score the original (non-resampled) test rows and flatten the predictions to 1-D.
y_test_pred_proba = model_smoteenn.predict(X_test_scaled).flatten()

# Ranking metrics plus the label rate of the test set as the AUC-PR reference level.
roc_auc = roc_auc_score(y_test, y_test_pred_proba)
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_test_pred_proba)
pr_auc = auc(recall_vals, precision_vals)
random_baseline = y_test.sum() / len(y_test)

print(
    f"AUC-ROC: {roc_auc:.3f}",
    f"AUC-PR: {pr_auc:.3f}",
    f"Random baseline AUC-PR: {random_baseline:.3f}",
    f"Improvement over random: {pr_auc/random_baseline:.1f}x",
    sep="\n",
)

# 8. THRESHOLD ANALYSIS (recall-focused sweep over the SMOTE-ENN model's test scores)
print("\n=== THRESHOLD ANALYSIS (RECALL-FOCUSED) ===")
thresholds = [
    0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07,
    0.08, 0.09, 0.10, 0.15, 0.20, 0.30, 0.50,
]

best_recall, best_threshold = 0, 0
results = []

for threshold in thresholds:
    y_pred = (y_test_pred_proba >= threshold).astype(int)

    # Thresholds that flag nothing are skipped (no metrics recorded or printed).
    if y_pred.sum() <= 0:
        continue

    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    flagged_count = y_pred.sum()
    flagged_pct = flagged_count / len(y_test) * 100

    results.append(dict(
        threshold=threshold,
        flagged=flagged_count,
        flagged_pct=flagged_pct,
        precision=precision,
        recall=recall,
        f1=f1,
    ))

    print(f"Threshold {threshold:.2f}: {flagged_count:4d} flagged ({flagged_pct:4.1f}%), "
          f"Precision {precision:.3f}, Recall {recall:.3f}, F1 {f1:.3f}")

    # Track the first threshold that reaches the highest recall seen so far.
    if recall > best_recall:
        best_recall, best_threshold = recall, threshold

print(f"\nBest threshold for recall: {best_threshold} (Recall: {best_recall:.3f})")

# 9. COMPARE WITH YOUR PREVIOUS RESULTS
# The reference figures in these messages are literals typed into the strings,
# not values recomputed from earlier cells.
print("\n=== COMPARISON WITH PREVIOUS APPROACHES ===")
print("SMOTE-ENN vs Previous Results:")
print(f"Best recall achieved: {best_recall:.3f} (vs ~0.35 from class weights)")
print(f"AUC-PR: {pr_auc:.3f} (vs 0.174 from no modification)")

# 10. TOP PERFORMERS ANALYSIS
results_df = pd.DataFrame(results)
report_columns = ['threshold', 'flagged_pct', 'precision', 'recall', 'f1']

# Five thresholds with the highest recall.
top_recall = results_df.nlargest(5, 'recall')
print("\nTop 5 thresholds by RECALL (your priority):")
print(top_recall[report_columns].to_string(index=False))

# Five thresholds with the highest recall x precision product.
results_df['recall_precision_product'] = results_df['recall'].mul(results_df['precision'])
balanced_options = results_df.nlargest(5, 'recall_precision_product')
print("\nTop 5 balanced recall-precision options:")
print(balanced_options[report_columns].to_string(index=False))

banner = "=" * 50
print("\n" + banner)
print("SMOTE-ENN ANALYSIS COMPLETE")
print(banner)
print("Key takeaways:")
print(f"1. SMOTE-ENN created {resampled_pos - original_pos} synthetic positive samples")
print(f"2. Best recall achieved: {best_recall:.3f}")
print(f"3. For your use case (capture more buyers), consider threshold around {best_threshold}")
print("4. Ready to compare with ensemble methods next")