In [None]:
import pandas as pd
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import networkx as nx
from sklearn.metrics import classification_report, accuracy_score, log_loss,average_precision_score,confusion_matrix,ConfusionMatrixDisplay

import plotly.express as px
import seaborn as sns

#used for calling models from other notebooks
from joblib import load

In [None]:
# Load the two input tables: the feature table used for modelling and the
# payment records (which carry the USD_amount and Label columns used below).
final_features = pd.read_csv("total_features", sep=",", header=0)
total_data = pd.read_csv("fraud_payment_data", sep=",", header=0)

In [None]:
# Keep only payments with a strictly positive amount and renumber the rows
# 0..n-1 so later positional and index-based operations agree.
has_positive_amount = total_data["USD_amount"] > 0
total_data = total_data.loc[has_positive_amount].reset_index(drop=True)
total_data


In [None]:
# Row counts of the feature table and the payment table; they should match
# before the two are combined.
final_features.shape[0], total_data.shape[0]


In [None]:
# Sanity check: the two frames must share an identical index, because assigning
# a Series into a DataFrame column aligns rows by index label.
# Index.equals also copes with indexes of different lengths (returns False),
# and the assert stops the notebook instead of merely displaying the result.
indexes_aligned = final_features.index.equals(total_data.index)
assert indexes_aligned, "final_features and total_data are not row-aligned"
indexes_aligned

In [None]:
# USD_amount was not part of the modelling features, so copy it over from the
# payment table (pandas aligns the assigned Series on the row index).
usd_amount = total_data["USD_amount"]
final_features["USD_amount"] = usd_amount

In [None]:
## Split for train, validate and test data
# Positional row boundaries of the three splits, named once so features and
# labels cannot drift apart.
TRAIN_END = 1_000_000
VALIDATE_END = 1_250_000

train_features = final_features.iloc[:TRAIN_END]
validate_features = final_features.iloc[TRAIN_END:VALIDATE_END]
# Open-ended slice: the earlier `[1250000:-1]` silently dropped the last row.
test_features = final_features.iloc[VALIDATE_END:]

y_train = total_data["Label"].iloc[:TRAIN_END]
y_validate = total_data["Label"].iloc[TRAIN_END:VALIDATE_END]
y_test = total_data["Label"].iloc[VALIDATE_END:]

# Every feature row must land in exactly one split, and the test labels must
# line up one-to-one with the test features.
assert len(train_features) + len(validate_features) + len(test_features) == len(final_features)
assert len(y_test) == len(test_features)

## Scaler for eventual Logistic regression
X = StandardScaler().fit_transform(train_features.values)

In [None]:
# Fit the XGBoost classifier whose scores are used for the threshold analysis.
from xgboost import XGBClassifier

# Hyperparameters gathered in one mapping; scale_pos_weight handles the
# fraud class imbalance.
xgb_params = {
    "max_depth": 70,
    "learning_rate": 0.01,
    "n_estimators": 700,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 10,
}

# Train model
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(train_features, y_train)




## Business KPIs

In [None]:

# Score the held-out features first, while test_features still contains only
# the columns the model was fitted on.
test_scores = xgb_model.predict_proba(test_features)[:, 1]

# Build the KPI frame on an explicit copy. pd.DataFrame(test_features) does not
# copy the data, so (depending on the pandas version) columns added below could
# leak back into test_features or raise SettingWithCopyWarning.
df_test = test_features.copy()

# Add the true fraud labels
df_test["is_fraud"] = y_test.values

# Add the model's continuous score (probability between 0 and 1)
df_test["model_score"] = test_scores

# Sanity check: the fraudulent rows, used to eyeball the upcoming calculations
fraud_df = df_test[df_test["is_fraud"] == 1]

fraud_df.head()

In [None]:
# Model score: a continuous 0-1 fraud probability from the model.
# Score only the columns the model was fitted on, taken from df_test itself;
# test_features is not guaranteed to hold just the model inputs at this point
# because df_test may have been built from it without a copy.
feature_columns = list(train_features.columns)
df_test["model_score"] = xgb_model.predict_proba(df_test[feature_columns])[:, 1]

In [None]:
# Model flag: 1 when the score reaches the decision threshold, otherwise 0.
threshold = 0.5  # the ideal threshold determined in our analysis
reaches_threshold = df_test["model_score"].ge(threshold)
df_test["model_flag"] = reaches_threshold.astype(int)

df_test


In [None]:
# Per-transaction dollar KPIs:
#   synthetic_loss - amount at stake when the transaction is fraud (0 otherwise)
#   missed_risk    - that amount when the model did NOT flag the transaction
#   loss_avoided   - that amount when the model DID flag it, so it only
#                    accumulates when fraud is caught
flagged = df_test["model_flag"]
df_test["synthetic_loss"] = df_test["USD_amount"].mul(df_test["is_fraud"])
df_test["missed_risk"] = df_test["synthetic_loss"].mul(1 - flagged)
df_test["loss_avoided"] = df_test["synthetic_loss"].mul(flagged)


## Synthetic Loss, Missed Risk and Loss Avoided

In [None]:
# Dollar totals over the whole test set, one sum per KPI column.
total_loss, missed_risk, loss_avoided = (
    df_test[column].sum()
    for column in ("synthetic_loss", "missed_risk", "loss_avoided")
)

# Share of the total fraud exposure that was missed vs. avoided.
missed_risk_pct = missed_risk / total_loss
loss_avoided_pct = loss_avoided / total_loss

print(
    "total loss $:", total_loss,
    "loss avoided", loss_avoided,
    "missed risk", missed_risk,
)

In [None]:
# Visualizing model performance on fraud losses.
# The axis is labelled in millions of USD, so the raw dollar totals are scaled
# to millions before plotting (previously raw dollars were drawn under that label).
USD_PER_MILLION = 1_000_000
fig, ax = plt.subplots()
ax.bar(
    ["Loss avoided", "Missed risk"],
    [loss_avoided / USD_PER_MILLION, missed_risk / USD_PER_MILLION],
    color=["#42A2B9", "crimson"],
)
ax.set_title("Model Performance on Fraud Losses")
ax.set_ylabel("USD (Millions)")
plt.show()


## Alert Cost Simulation – The Cost of Analysts Reviewing Alerts

Realistically, the estimated cost of an analyst reviewing a single alert ranges from:  
$1–$5 for FinTech firms  
$3–$12 for mid-size banks  
$20–$70 for larger banks  

Since our data is synthetic data from **JPMorgan Chase**, we use the **average cost** from larger banks.  

**Assumption:**  
The average analyst at JPMorgan Chase earns **$50/hour**.  
Given approximately **12 minutes** to review the average case,  
we estimate a **Synthetic Average Review Cost** of:  


$$
\$10 = \left(\frac{\$50}{60\ \text{minutes}}\right) \times 12\ \text{minutes}
$$



In [None]:
# Synthetic average cost of reviewing one alert: an analyst paid $50/hour
# spending 12 minutes on the case.
analyst_hourly_rate_usd = 50
review_minutes_per_case = 12
synthetic_Avg_review_cost = (analyst_hourly_rate_usd / 60) * review_minutes_per_case

In [None]:
# Review cost: every flagged transaction costs one analyst review; unflagged
# transactions cost nothing.
alerts = df_test["model_flag"]
df_test["review_cost"] = alerts * synthetic_Avg_review_cost
total_review_cost = df_test["review_cost"].sum()
print("Total review cost $ :", total_review_cost)

In [None]:
# Total loss avoided (dollars): sum over the rows where some loss was avoided.
has_loss_avoided = df_test["loss_avoided"] > 0
total_loss_avoided = df_test.loc[has_loss_avoided, "loss_avoided"].sum()

print("Total loss avoided in $:", total_loss_avoided)

In [None]:
# Number of transactions with a positive loss avoided, i.e. frauds the model caught.
fraud_caught = df_test["loss_avoided"].gt(0).sum()
fraud_caught

In [None]:
# Number of truly fraudulent transactions in the test set, for comparison
# with the number caught.
total_fraud = df_test["is_fraud"].eq(1).sum()
total_fraud

In [None]:
# Rate of fraud caught: caught frauds as a fraction of all frauds in the test set.
# NOTE(review): undefined when total_fraud is 0 - confirm the test split always contains fraud.
fraud_catch_rate = fraud_caught/total_fraud
fraud_catch_rate

In [None]:
# Row masks for the true class and the model decision.
legit_mask = df_test["is_fraud"] == 0
fraud_mask = df_test["is_fraud"] == 1
flagged_mask = df_test["model_flag"] == 1
unflagged_mask = df_test["model_flag"] == 0

# False positive rate: share of legitimate transactions incorrectly flagged as fraud.
false_positive_count = int((legit_mask & flagged_mask).sum())
fpr = false_positive_count / int(legit_mask.sum())

# False negative rate: share of fraudulent transactions the model missed.
false_negative_count = int((fraud_mask & unflagged_mask).sum())
fnr = false_negative_count / int(fraud_mask.sum())

print("False Positive Rate is:", fpr)
print("False Negative Rate is:", fnr)