In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost fpdf

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from xgboost import XGBClassifier
from fpdf import FPDF
import os

# === Load Dataset ===
# The CSV location can be overridden with the FRAUD_DATASET_PATH environment variable
# so the notebook can run on machines other than the author's; when the variable is
# unset, the original absolute path is used unchanged.
DATASET_PATH = os.environ.get(
    "FRAUD_DATASET_PATH",
    r"C:\Users\AmlanC\OneDrive\Desktop\new database\credit_card_fraud_dataset.csv",
)
df = pd.read_csv(DATASET_PATH)

# Cast the target to int and parse the timestamp so the .dt accessor can be used on it.
df['fraud_flag'] = df['fraud_flag'].astype(int)
df['transaction_timestamp'] = pd.to_datetime(df['transaction_timestamp'])

# Snapshot taken before any engineered columns are added or identifier columns dropped.
df_full = df.copy()  # For merging later

# === Feature Engineering ===
# Derive calendar features (hour of day, day of month, weekday number) from the
# parsed transaction timestamp.
_ts = df['transaction_timestamp'].dt
df['hour'] = _ts.hour
df['day'] = _ts.day
df['weekday'] = _ts.weekday

# === Encode Categorical Features ===
# Build the modelling frame: a copy of df without identifier/raw-timestamp columns.
drop_cols = [
    'transaction_id',
    'transaction_timestamp',
    'ip_address',
    'card_number',
    'user_id',
    'merchant_id',
    'device_id',
]
model_df = df.drop(columns=drop_cols)

# Replace each categorical column with integer codes; a fresh encoder is fitted per
# column (only the last one remains bound to `le` after the loop).
categorical_cols = [
    'currency',
    'merchant_category_code',
    'geo_location',
    'channel',
    'txn_type',
    'txn_status',
]
for col in categorical_cols:
    le = LabelEncoder().fit(model_df[col])
    model_df[col] = le.transform(model_df[col])

# === Train-Test Split ===
# Separate the target from the features, then hold out 25% of the rows.
# Stratifying on y keeps the fraud/non-fraud ratio the same in both partitions;
# the fixed random_state makes the split repeatable.
y = model_df['fraud_flag']
X = model_df.drop(columns='fraud_flag')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# === Model Training ===
# XGBoost classifier with log-loss as its evaluation metric, fitted on the training split.
# The former `use_label_encoder=False` argument was removed: XGBoost no longer uses it and
# only reports 'Parameters: { "use_label_encoder" } are not used.' when it is passed.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# === Prediction & Threshold Optimization ===
# Column 1 of predict_proba is used as the per-transaction fraud score.
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, xgb_probs)

# F1 at every point of the PR curve; the small epsilon avoids a 0/0 division.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)

# precision/recall have one more element than thresholds: the final point
# (precision=1, recall=0) has no associated threshold. Restrict the argmax to the
# points that do have one, so the index can never run past the end of `thresholds`.
best_idx = int(np.argmax(f1_scores[:-1]))
best_threshold = thresholds[best_idx]
# NOTE(review): the threshold is tuned on the same test split that is later used for
# reporting, so reported metrics at this threshold are likely optimistic — confirm
# whether a separate validation split is wanted.
print(f"✅ Optimal Threshold for F1: {best_threshold:.3f}")

# === ROC Curve ===
# Compute the (FPR, TPR) points for the test-set scores and the area under that curve.
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=xgb_probs)
roc_auc = auc(x=fpr, y=tpr)

# === Results DataFrame ===
# Attach the model score, the thresholded prediction and the true label to the
# test-set features.
results = X_test.assign(
    Risk_Score=xgb_probs,
    Predicted_Label=lambda scored: (scored['Risk_Score'] >= best_threshold).astype(int),
    Actual_Label=y_test.values,
)

# Index-aligned merge with the untouched snapshot brings back the original columns;
# column names present on both sides receive pandas' default _x / _y suffixes.
full_results = pd.merge(results, df_full, left_index=True, right_index=True)

# Keep the 20 rows with the highest risk score.
top_risky_full = full_results.sort_values(by='Risk_Score', ascending=False).iloc[:20]

# === Save CSV ===
# NOTE(review): the exported rows include the merged-in original columns (e.g. card
# number, IP address, user id) — confirm this file may be shared as-is.
csv_path = "top_high_risk_predictions_full.csv"
top_risky_full.to_csv(csv_path, index=False)
print(f"📁 Exported top risky transactions to {csv_path}")

# === Visualizations ===
# Figures are written as PNG files under ./plots; the directory is created if missing.
os.makedirs("plots", exist_ok=True)

# 1. ROC Curve
roc_path = "plots/roc_curve.png"
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal reference line
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
fig.savefig(roc_path)
plt.close(fig)  # release the figure once it is on disk

# 2. Precision-Recall Curve
# Precision plotted against recall for the test-set scores, saved as a PNG.
pr_path = "plots/precision_recall_curve.png"
fig, ax = plt.subplots()
ax.plot(recall, precision, label='Precision-Recall')
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend()
fig.savefig(pr_path)
plt.close(fig)  # release the figure once it is on disk

# 3. Fraud Rate Over Time
# Daily fraud rate = mean of fraud_flag over all transactions sharing a calendar date.
# Note: this adds a 'date' column to df_full.
df_full['date'] = df_full['transaction_timestamp'].dt.date
fraud_rate = df_full.groupby('date')['fraud_flag'].mean()

ts_path = "plots/fraud_rate_timeseries.png"
fig, ax = plt.subplots(figsize=(10, 4))
fraud_rate.plot(ax=ax, marker='o')
ax.set_title("Fraud Rate Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Fraud Rate")
ax.grid(True)
fig.savefig(ts_path)
plt.close(fig)  # release the figure once it is on disk

# === Build PDF Report ===
# Report layout: centred title, the two headline metrics, then the saved figures.
pdf = FPDF()
pdf.add_page()

# Title
pdf.set_font("Arial", 'B', 16)
pdf.cell(200, 10, "Fraud Detection Report", ln=True, align="C")

# Headline metrics, one per line
pdf.set_font("Arial", '', 12)
pdf.ln(10)
for metric_line in (
    f"Optimal F1 Threshold: {best_threshold:.3f}",
    f"ROC AUC Score: {roc_auc:.3f}",
):
    pdf.cell(200, 10, metric_line, ln=True)

# Section heading for the figures
pdf.set_font("Arial", 'B', 14)
pdf.ln(10)
pdf.cell(200, 10, "Visualizations", ln=True)

# Embed each saved plot in turn, followed by a line break of height 10
for img in (roc_path, pr_path, ts_path):
    pdf.image(img, x=10, w=180)
    pdf.ln(10)

pdf_path = "fraud_detection_report.pdf"
pdf.output(pdf_path)
print(f"📄 Report saved to {pdf_path}")


Parameters: { "use_label_encoder" } are not used.



✅ Optimal Threshold for F1: 0.718
📁 Exported top risky transactions to top_high_risk_predictions_full.csv
📄 Report saved to fraud_detection_report.pdf
