In [None]:
import pandas as pd
import json
import numpy as np
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from pathlib import Path
import os  # Added for directory creation
import lightgbm as lgb

# Load the validation split and the trained model.
val = pd.read_csv('data/processed/val_features.csv')
# feature_columns.json supplies the names of the columns selected from `val`
# as model inputs.
with open('data/processed/feature_columns.json', 'r', encoding='utf-8') as f:
    features = json.load(f)
X_val = val[features]
y_val = val['Class']

# Resolve the model relative to the working directory, like every other path
# in this script, rather than through a machine-specific absolute Windows path
# (whose un-escaped backslashes were also invalid string escape sequences).
# Assumes the script is run from the project root, next to data/ and app/
# -- TODO confirm.
# NOTE(review): despite the .pkl suffix, Booster(model_file=...) reads
# LightGBM's own model file format; confirm the file was written with
# Booster.save_model and not pickled.
model_path = Path('models') / 'lgbm_model.pkl'
model = lgb.Booster(model_file=str(model_path))
y_scores = model.predict(X_val)

# Precision-Recall curve: compute the curve points from the validation
# labels and model scores, then save the plot to disk.
precision, recall, thresholds = precision_recall_curve(y_val, y_scores)

# Draw on an explicit figure/axes pair instead of pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
fig.savefig('data/pr_curve.png')
plt.close(fig)  # release the figure once it has been written

# Optimal thresholds
# precision_recall_curve returns one more precision/recall point than it
# returns thresholds: the final point (precision=1, recall=0) has no
# threshold. Drop it so every index computed below is valid for `thresholds`.
prec_at_thr = precision[:-1]
rec_at_thr = recall[:-1]

# F1 per threshold. Where precision + recall == 0 the score is defined as 0
# instead of 0/0 = NaN; np.argmax would otherwise return the first NaN index
# and select a threshold with zero recall.
pr_sum = prec_at_thr + rec_at_thr
f1_scores = np.divide(
    2 * prec_at_thr * rec_at_thr,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
best_f1_idx = np.argmax(f1_scores)
threshold_f1 = thresholds[best_f1_idx]

# High precision: the most precise threshold that still keeps recall > 0.1.
# The argmax runs over the eligible subset only, so map its position back to
# an index into the full arrays before looking up the threshold.
eligible_idx = np.flatnonzero(rec_at_thr > 0.1)
if eligible_idx.size == 0:
    raise ValueError('No threshold achieves recall > 0.1')
high_prec_idx = eligible_idx[np.argmax(prec_at_thr[eligible_idx])]
threshold_high_prec = thresholds[high_prec_idx]

# Balanced (e.g., where precision ≈ recall)
balanced_idx = np.argmin(np.abs(prec_at_thr - rec_at_thr))
threshold_balanced = thresholds[balanced_idx]

# Cast to built-in float: NumPy scalars are not JSON serialisable.
thresholds_dict = {
    'best_f1': float(threshold_f1),
    'high_precision': float(threshold_high_prec),
    'balanced': float(threshold_balanced),
}

with open('data/threshold.json', 'w', encoding='utf-8') as f:
    json.dump(thresholds_dict, f, indent=2)

# Create app directory if it doesn't exist
os.makedirs('app', exist_ok=True)

# Simple rule (using best F1)
# precision_recall_curve evaluates each threshold as "score >= threshold", so
# the label rule must use >= as well. A strict > would mark the samples that
# score exactly at the threshold as negative and miss the operating point the
# threshold was chosen for.
val['predicted_score'] = y_scores
val['predicted_label'] = (y_scores >= threshold_f1).astype(int)
val['transaction_id'] = range(len(val))  # Proxy ID: row position in val
results_df = val[['transaction_id', 'Amount', 'Class', 'predicted_score', 'predicted_label']]
results_df.to_csv('app/results.csv', index=False)  # Save as results.csv for Streamlit consistency

print("Thresholding complete. Thresholds saved to data/threshold.json, results to app/results.csv")
print(f"Best F1 Threshold: {threshold_f1:.4f}")
print(f"Predictions shape: {results_df.shape}")

LightGBMError: Could not open data/models/lgbm_model.pkl