In [81]:
from sklearn.metrics import accuracy_score
import xgboost as xgb
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [19]:
df = pd.read_csv('data/ecommerce_fraud_data.csv',  parse_dates=["order_date"])

In [20]:
user_avg = df.groupby("user_id")["order_amount"].mean().rename("user_avg_order_value")
df = df.merge(user_avg, on="user_id", how="left")
df["order_value_deviation"] = df["order_amount"] - df["user_avg_order_value"]
pm_freq = df.groupby(["user_id", "payment_type"]).size().rename("payment_method_freq").reset_index()
df = df.merge(pm_freq, on=["user_id", "payment_type"], how="left")
df["device_change_count"] = df.groupby("user_id")["device_type"].transform("nunique")
df["delivery_mismatch"] = np.where(df["billing_region"] != df["shipping_region"], 1, 0)
df["is_cod_flag"] = np.where(df["is_cod"].str.lower()=="yes", 1, 0)
df["order_hour"] = df["order_date"].dt.hour.fillna(0).astype(int)
df["is_night_order"] = ((df["order_hour"] < 6) | (df["order_hour"] > 22)).astype(int)

In [25]:
feature_cols = [
    "order_amount","user_avg_order_value","order_value_deviation","payment_method_freq",
    "device_change_count","delivery_mismatch","ip_address_risk_score","device_trust_score",
    "num_failed_payments","is_cod_flag","loyalty_score","num_prev_orders"
]

categorical = ["payment_type","device_type","browser","billing_region"]

df_model = pd.get_dummies(df[feature_cols+categorical], drop_first=True)
df_model.fillna(0, inplace=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_model)
y = np.where(df["is_fraud"].str.lower()=="yes",1,0)

X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.2,random_state=42,stratify=y)


In [27]:
clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42, n_estimators=100)
clf.fit(X_train,y_train)

In [30]:
preds = clf.predict(X_test)
preds = np.round(preds)
accuracy= accuracy_score(y_test,preds)
print('Accuracy of the model is:', accuracy*100)

Accuracy of the model is: 99.7584541062802


In [70]:

model =  IsolationForest(n_estimators=100, random_state=42)
model.fit(X_train)

In [76]:
scores_prediction = model.decision_function(X_scaled)
iso_norm = (iso_scores - iso_scores.min()) / (iso_scores.max() - iso_scores.min())

# Convert to binary (top 35% = fraud)
threshold = np.percentile(iso_norm, 65)
y_pred_iso = (iso_norm >= threshold).astype(int)


#y_pred_iso = model.predict(X_scaled)
#y_pred_iso[y_pred_iso == 1] = 0
#y_pred_iso[y_pred_iso == -1] = 1
#print("Accuracy in finding anomaly:",accuracy_score(y,y_pred_iso))


In [67]:
# XGBoost (supervised)
y_pred_xgb = clf.predict(X_test)
y_prob_xgb = clf.predict_proba(X_test)[:, 1]


In [82]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)

# --- XGBoost Metrics ---
print("=== XGBoost (Supervised) ===")
xgb_auc = roc_auc_score(y_test, y_prob_xgb)
xgb_prec = precision_score(y_test, y_pred_xgb)
xgb_rec = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
print(f"AUC: {xgb_auc:.3f}, Precision: {xgb_prec:.3f}, Recall: {xgb_rec:.3f}, F1: {xgb_f1:.3f}")


roc_iso = roc_auc_score(y_test, iso_norm)
prec_iso = precision_score(y_test, y_pred_iso)
rec_iso = recall_score(y_test, y_pred_iso)
f1_iso = f1_score(y_test, y_pred_iso)

print("=== Isolation Forest (Unsupervised) ===")
print(f"ROC-AUC:   {roc_iso:.3f}")
print(f"Precision: {prec_iso:.3f}")
print(f"Recall:    {rec_iso:.3f}")
print(f"F1-Score:  {f1_iso:.3f}")


=== XGBoost (Supervised) ===
AUC: 1.000, Precision: 1.000, Recall: 0.994, F1: 0.997
=== Isolation Forest (Unsupervised) ===
ROC-AUC:   0.337
Precision: 0.303
Recall:    0.250
F1-Score:  0.274


In [83]:
import joblib

# Save both models
joblib.dump(clf, "xgboost_fraud_model.pkl")
joblib.dump(model, "isolation_forest_model.pkl")

print("✅ Models saved successfully!")


✅ Models saved successfully!


In [84]:
xgb_loaded = joblib.load("xgboost_fraud_model.pkl")
iso_loaded = joblib.load("isolation_forest_model.pkl")

print("✅ Models loaded successfully!")

✅ Models loaded successfully!


In [85]:
new_order = {
    "order_amount": 8450,
    "user_avg_order_value": 4200,
    "order_value_deviation": 4250,
    "payment_method_freq": 2,
    "device_change_count": 3,
    "delivery_mismatch": 1,
    "ip_address_risk_score": 0.8,
    "device_trust_score": 0.25,
    "num_failed_payments": 2,
    "is_cod_flag": 1,
    "loyalty_score": 0.3,
    "num_prev_orders": 5,
    "device_consistency": 0.4,
    "payment_type": "Credit Card",
    "device_type": "Mobile",
    "browser": "Chrome",
    "billing_region": "Delhi"
}


In [86]:
import pandas as pd

sample_df = pd.DataFrame([new_order])
categorical = ["payment_type", "device_type", "browser", "billing_region"]
sample_encoded = pd.get_dummies(sample_df, columns=categorical, drop_first=True)

for col in df_model.columns:
    if col not in sample_encoded.columns:
        sample_encoded[col] = 0
sample_encoded = sample_encoded[df_model.columns]

sample_scaled = scaler.transform(sample_encoded)

In [89]:

xgb_prob = xgb_loaded.predict_proba(sample_scaled)[:, 1]
xgb_pred = (xgb_prob >= 0.5).astype(int)

print(f"XGBoost Fraud Probability: {xgb_prob[0]:.2f}")
print("XGBoost Prediction:", "FRAUD" if xgb_pred[0] == 1 else "NOT FRAUD")


XGBoost Fraud Probability: 1.00
XGBoost Prediction: FRAUD


In [90]:
# Predict anomaly score and label
iso_score = iso_loaded.decision_function(sample_scaled)
iso_norm = (iso_score - iso_loaded.offset_) / iso_loaded.offset_
iso_pred = iso_loaded.predict(sample_scaled)
iso_pred = np.where(iso_pred == -1, 1, 0) 

print(f"IsolationForest Anomaly Score: {iso_score[0]:.2f}")
print("IsolationForest Prediction:", "FRAUD" if iso_pred[0] == 1 else "NOT FRAUD")


IsolationForest Anomaly Score: 0.02
IsolationForest Prediction: NOT FRAUD
