In [5]:
# Mount Google Drive at /content/drive. The `google.colab` package is imported
# here, so this cell is intended to run inside Google Colab. The dataset paths
# configured in a later cell (FRAUD_PATH, CC_PATH) point under this mount.
from google.colab import drive
drive.mount('/content/drive')

# Core Python & Data Science Libraries
import numpy as np
import pandas as pd

# Preprocessing & Model Tools
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import (
    average_precision_score, roc_auc_score, f1_score, confusion_matrix,
    precision_recall_curve
)

# Ensemble Model
from xgboost import XGBClassifier


Mounted at /content/drive


In [6]:
# EDIT DRIVE_DIR (and the file names, if yours differ) to match your
# Google Drive folder structure. Both dataset paths are derived from it.
DRIVE_DIR  = "/content/drive/MyDrive"
FRAUD_PATH = f"{DRIVE_DIR}/fraud_df.csv"
CC_PATH    = f"{DRIVE_DIR}/creditcard.csv"


In [7]:
def evaluate_scores(y_true, y_score, name):
    """
    Score positive-class predictions for a binary classifier and print a report.

    Parameters
    ----------
    y_true : array-like
        1D true binary labels.
    y_score : array-like
        1D predicted scores/probabilities for the positive class. Lists,
        pandas Series and ndarrays are all accepted.
    name : str
        Label used in the printed report.

    Returns
    -------
    dict
        Keys: ``ap`` (average precision), ``roc`` (ROC-AUC), ``f1_05`` and
        ``cm_05`` (F1 / confusion matrix at threshold 0.5), ``t_best`` (the
        threshold maximising F1 on the precision-recall curve), ``f1_best``
        and ``cm_best`` (F1 / confusion matrix at ``t_best``).
    """
    # Coerce once so the vectorised `>=` comparisons below also work when the
    # caller passes a plain Python list.
    y_score = np.asarray(y_score)

    ap  = average_precision_score(y_true, y_score)
    roc = roc_auc_score(y_true, y_score)

    # Default 0.5 threshold
    y_pred05 = (y_score >= 0.5).astype(int)
    # zero_division=0 keeps the value (0) but avoids a warning when nothing
    # is predicted positive at this threshold.
    f1_05 = f1_score(y_true, y_pred05, zero_division=0)
    cm_05 = confusion_matrix(y_true, y_pred05)

    # Best F1 threshold
    prec, rec, thr = precision_recall_curve(y_true, y_score)
    # precision_recall_curve returns one more (precision, recall) point than
    # thresholds; the final point has no threshold of its own, so drop it
    # instead of inventing a 1.0 threshold for it.
    prec, rec = prec[:-1], rec[:-1]
    f1s = 2 * (prec * rec) / (prec + rec + 1e-12)  # epsilon guards 0/0
    i = int(np.nanargmax(f1s))
    t_best = float(thr[i])
    y_pred_best = (y_score >= t_best).astype(int)
    f1_best = f1_score(y_true, y_pred_best, zero_division=0)
    cm_best = confusion_matrix(y_true, y_pred_best)

    print(f"\n[{name}] AUC-PR={ap:.4f} | ROC-AUC={roc:.4f}")
    print(f"  @0.5:   F1={f1_05:.4f}  CM=\n{cm_05}")
    print(f"  @best:  thr={t_best:.3f}  F1={f1_best:.4f}  CM=\n{cm_best}")
    return dict(ap=ap, roc=roc, f1_05=f1_05, cm_05=cm_05, t_best=t_best, f1_best=f1_best, cm_best=cm_best)

def pick_best(name, m_log, m_xgb):
    """
    Print which of two models wins on AUC-PR, using F1@best as the tie-break.

    Parameters
    ----------
    name : str
        Dataset label used in the printed line.
    m_log, m_xgb : dict
        Result dicts for LogReg and XGBoost; the keys ``"ap"`` and
        ``"f1_best"`` are read from each.

    Returns
    -------
    None
        The verdict is printed only.
    """
    a = m_log["ap"]; b = m_xgb["ap"]
    f1_log = m_log["f1_best"]; f1_xgb = m_xgb["f1_best"]

    if abs(a - b) < 1e-12:
        # AUC-PR is tied: apply the tie-break that the message advertises.
        # LogReg keeps the win only if XGBoost is not strictly better on F1.
        best = "XGBoost" if f1_xgb > f1_log else "LogReg"
    else:
        best = "XGBoost" if b > a else "LogReg"

    print(f"\n>>> {name}: Best by AUC-PR = {best} "
          f"(LogReg AUC-PR {a:.4f} vs XGB {b:.4f}). "
          f"Tie-break F1@best: LogReg {f1_log:.4f}, XGB {f1_xgb:.4f}.")


In [8]:
# Load the fraud dataset from the path configured earlier.
fraud_df = pd.read_csv(FRAUD_PATH)

# Feature columns, split by how they are preprocessed below
# (numeric -> StandardScaler, categorical -> OneHotEncoder).
num_cols = ['purchase_hour','purchase_day','time_since_signup_hrs',
            'user_transaction_count','user_avg_purchase_value',
            'user_purchase_std','is_first_transaction','device_freq','browser_freq']
cat_cols = ['country','browser']

# Drop missing target or feature values
fraud_df = fraud_df.dropna(subset=num_cols + cat_cols + ['class'])

X_f = fraud_df[num_cols + cat_cols]
y_f = fraud_df['class'].astype(int)

# Stratified 70/30 split so both parts keep the same class ratio.
Xf_tr, Xf_te, yf_tr, yf_te = train_test_split(X_f, y_f, test_size=0.30, random_state=42, stratify=y_f)

pre_fraud = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

# Negatives-to-positives ratio on the training split; max(1, ...) avoids a
# division by zero if the split contains no positive rows.
pos_weight_f = (len(yf_tr) - yf_tr.sum()) / max(1, yf_tr.sum())

logreg_f = Pipeline([
    ("pre", pre_fraud),
    ("clf", LogisticRegression(class_weight="balanced", max_iter=500, solver="liblinear", random_state=42))
])

xgb_f = Pipeline([
    # Pipeline does not copy its steps, so reusing `pre_fraud` here would make
    # both pipelines share (and refit) one transformer object. Give XGBoost
    # its own unfitted copy with the same configuration.
    ("pre", clone(pre_fraud)),
    ("clf", XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        n_jobs=-1, random_state=42, eval_metric="logloss",
        scale_pos_weight=float(pos_weight_f)
    ))
])

# Train
logreg_f.fit(Xf_tr, yf_tr)
xgb_f.fit(Xf_tr, yf_tr)

# Predict & Evaluate (column 1 of predict_proba is the positive class)
m_log_f = evaluate_scores(yf_te, logreg_f.predict_proba(Xf_te)[:, 1], "LogReg (Fraud)")
m_xgb_f = evaluate_scores(yf_te, xgb_f.predict_proba(Xf_te)[:, 1], "XGBoost (Fraud)")



[LogReg (Fraud)] AUC-PR=0.6602 | ROC-AUC=0.8367
  @0.5:   F1=0.6095  CM=
[[38869  2220]
 [ 1411  2834]]
  @best:  thr=0.765  F1=0.6732  CM=
[[40854   235]
 [ 1972  2273]]

[XGBoost (Fraud)] AUC-PR=0.7061 | ROC-AUC=0.8392
  @0.5:   F1=0.6113  CM=
[[38607  2482]
 [ 1284  2961]]
  @best:  thr=0.872  F1=0.6940  CM=
[[41085     4]
 [ 1987  2258]]


In [9]:
# Load the credit-card dataset and discard every row with a missing value.
cc = pd.read_csv(CC_PATH).dropna()

# Target is the integer-cast "Class" column; all remaining columns are features.
y_c = cc["Class"].astype(int)
X_c = cc.drop(columns=["Class"])

# Stratified 70/30 split so both parts keep the same class ratio.
Xc_tr, Xc_te, yc_tr, yc_te = train_test_split(
    X_c,
    y_c,
    test_size=0.30,
    random_state=42,
    stratify=y_c,
)

# Fit the scaler on the training split only, then reuse it for the test split.
sc_cc = StandardScaler()
Xc_tr_sc = sc_cc.fit_transform(Xc_tr)
Xc_te_sc = sc_cc.transform(Xc_te)

# Negatives-to-positives ratio on the training split; max(1, ...) avoids a
# division by zero if the split contains no positive rows.
pos_weight_c = (len(yc_tr) - yc_tr.sum()) / max(1, yc_tr.sum())

# Logistic regression baseline.
logreg_c = LogisticRegression(
    class_weight="balanced",
    max_iter=500,
    solver="liblinear",
    random_state=42,
)
logreg_c.fit(Xc_tr_sc, yc_tr)

# Gradient-boosted trees, with positives re-weighted by the ratio above.
xgb_c = XGBClassifier(
    n_estimators=600,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
    eval_metric="logloss",
    scale_pos_weight=float(pos_weight_c),
)
xgb_c.fit(Xc_tr_sc, yc_tr)

# Evaluate on the held-out split (column 1 of predict_proba is the positive class).
m_log_c = evaluate_scores(
    yc_te, logreg_c.predict_proba(Xc_te_sc)[:, 1], "LogReg (CreditCard)"
)
m_xgb_c = evaluate_scores(
    yc_te, xgb_c.predict_proba(Xc_te_sc)[:, 1], "XGBoost (CreditCard)"
)



[LogReg (CreditCard)] AUC-PR=0.7040 | ROC-AUC=0.9680
  @0.5:   F1=0.1245  CM=
[[83484  1811]
 [   18   130]]
  @best:  thr=1.000  F1=0.8129  CM=
[[85278    17]
 [   35   113]]

[XGBoost (CreditCard)] AUC-PR=0.8462 | ROC-AUC=0.9739
  @0.5:   F1=0.8459  CM=
[[85282    13]
 [   30   118]]
  @best:  thr=0.896  F1=0.8507  CM=
[[85289     6]
 [   34   114]]


In [21]:
# ===== Metrics helpers (paste once) =====
import numpy as np
from sklearn.metrics import (
    average_precision_score, roc_auc_score, f1_score, confusion_matrix,
    precision_recall_curve, accuracy_score, precision_score
)

def evaluate_scores(y_true, y_score, name):
    """
    Score positive-class predictions for a binary classifier and print a report.

    Parameters
    ----------
    y_true : array-like
        1D array of true labels.
    y_score : array-like
        1D array of predicted probabilities for the positive class. Lists,
        pandas Series and ndarrays are all accepted.
    name : str
        Label for printing.

    Returns
    -------
    dict
        ``ap`` and ``roc`` (AUC-PR and ROC-AUC), then Accuracy / Precision /
        F1 / confusion matrix at the 0.5 threshold (``acc_05``, ``prec_05``,
        ``f1_05``, ``cm_05``) and at the best-F1 threshold (``acc_best``,
        ``prec_best``, ``f1_best``, ``cm_best``), plus ``t_best`` itself.
    """
    # Coerce once so the vectorised `>=` comparisons below also work when the
    # caller passes a plain Python list.
    y_score = np.asarray(y_score)

    # Ranking metrics
    ap  = average_precision_score(y_true, y_score)
    roc = roc_auc_score(y_true, y_score)

    # --- @ 0.5 ---
    y_pred05 = (y_score >= 0.5).astype(int)
    acc_05  = accuracy_score(y_true, y_pred05)
    prec_05 = precision_score(y_true, y_pred05, zero_division=0)
    # Same zero_division policy as precision above: value 0, no warning.
    f1_05   = f1_score(y_true, y_pred05, zero_division=0)
    cm_05   = confusion_matrix(y_true, y_pred05)

    # --- @ best-F1 threshold ---
    prec_curve, rec_curve, thr = precision_recall_curve(y_true, y_score)
    # precision_recall_curve returns one more (precision, recall) point than
    # thresholds; the final point has no threshold of its own, so drop it
    # instead of inventing a 1.0 threshold for it.
    prec_curve, rec_curve = prec_curve[:-1], rec_curve[:-1]
    f1s = 2 * (prec_curve * rec_curve) / (prec_curve + rec_curve + 1e-12)  # epsilon guards 0/0
    i_best = int(np.nanargmax(f1s))
    t_best = float(thr[i_best])

    y_pred_best = (y_score >= t_best).astype(int)
    acc_best  = accuracy_score(y_true, y_pred_best)
    prec_best = precision_score(y_true, y_pred_best, zero_division=0)
    f1_best   = f1_score(y_true, y_pred_best, zero_division=0)
    cm_best   = confusion_matrix(y_true, y_pred_best)

    # pretty print
    print(f"\n[{name}]  AUC-PR={ap:.4f} | ROC-AUC={roc:.4f}")
    print(f"@0.5 :  Accuracy={acc_05:.4f}  Precision={prec_05:.4f}  F1={f1_05:.4f}  CM=\n{cm_05}")
    print(f"@best:  thr={t_best:.3f}  Accuracy={acc_best:.4f}  Precision={prec_best:.4f}  F1={f1_best:.4f}  CM=\n{cm_best}")

    return dict(
        ap=ap, roc=roc,
        acc_05=acc_05, prec_05=prec_05, f1_05=f1_05, cm_05=cm_05,
        acc_best=acc_best, prec_best=prec_best, f1_best=f1_best, cm_best=cm_best,
        t_best=t_best
    )

def pick_best(name, m_log, m_xgb):
    """
    Print a head-to-head report for two result dicts from evaluate_scores.

    The headline section names the winner ("LogReg", "XGBoost" or "Tie") for
    AUC-PR, ROC-AUC, and Accuracy / Precision at both the 0.5 and best-F1
    thresholds. A per-model section with Accuracy, Precision and F1 follows.

    Parameters
    ----------
    name : str
        Dataset label printed in the section header.
    m_log, m_xgb : dict
        Metrics for LogReg and XGBoost respectively.

    Returns
    -------
    None
        Everything is printed.
    """
    tie_tol = 1e-12  # differences below this count as a tie

    def _verdict(log_val, xgb_val):
        # Name the model with the larger value, or "Tie" within tolerance.
        if abs(log_val - xgb_val) < tie_tol:
            return "Tie"
        if xgb_val > log_val:
            return "XGBoost"
        return "LogReg"

    # (label exactly as printed, key in the results dicts)
    headline_spec = (
        ("Best by AUC-PR     ", "ap"),
        ("Best by ROC-AUC    ", "roc"),
        ("Best Accuracy @0.5 ", "acc_05"),
        ("Best Precision @0.5", "prec_05"),
        ("Best Accuracy @best", "acc_best"),
        ("Best Precision @best", "prec_best"),
    )
    # Look up every headline value before printing anything, so a missing key
    # fails before any output is produced.
    headline_rows = [(label, m_log[key], m_xgb[key]) for label, key in headline_spec]

    print(f"\n=== {name} ===")
    for label, log_val, xgb_val in headline_rows:
        print(f"{label}: {_verdict(log_val, xgb_val)} (LogReg={log_val:.4f}, XGBoost={xgb_val:.4f})")

    # Detailed per-model section
    for title, metrics in (("LogReg", m_log), ("XGBoost", m_xgb)):
        print(f"\n{title}:")
        print(f"  @0.5 :  Accuracy={metrics['acc_05']:.4f}  Precision={metrics['prec_05']:.4f}  F1={metrics['f1_05']:.4f}")
        print(f"  @best:  Accuracy={metrics['acc_best']:.4f} Precision={metrics['prec_best']:.4f} F1={metrics['f1_best']:.4f} (thr={metrics['t_best']:.3f})")


In [22]:
# Re-score the models fitted in earlier cells with the evaluate_scores defined
# in the preceding cell, then print the side-by-side comparison. This cell
# depends on logreg_f, xgb_f, Xf_te, yf_te, logreg_c, xgb_c, Xc_te_sc and yc_te
# already existing in the kernel.

# ---- Fraud_Data ----
# Column 1 of predict_proba is the positive-class probability.
p_log_f = logreg_f.predict_proba(Xf_te)[:, 1]
p_xgb_f = xgb_f.predict_proba(Xf_te)[:, 1]
m_log_f = evaluate_scores(yf_te, p_log_f, "LogReg (Fraud)")
m_xgb_f = evaluate_scores(yf_te, p_xgb_f, "XGBoost (Fraud)")

# ---- CreditCard ----
# These models were fitted on scaled features, so score the scaled test matrix.
p_log_c = logreg_c.predict_proba(Xc_te_sc)[:, 1]
p_xgb_c = xgb_c.predict_proba(Xc_te_sc)[:, 1]
m_log_c = evaluate_scores(yc_te, p_log_c, "LogReg (CreditCard)")
m_xgb_c = evaluate_scores(yc_te, p_xgb_c, "XGBoost (CreditCard)")

# ---- Side-by-side summary per dataset ----
pick_best("Fraud_Data", m_log_f, m_xgb_f)
pick_best("CreditCard", m_log_c, m_xgb_c)



[LogReg (Fraud)]  AUC-PR=0.6602 | ROC-AUC=0.8367
@0.5 :  Accuracy=0.9199  Precision=0.5607  F1=0.6095  CM=
[[38869  2220]
 [ 1411  2834]]
@best:  thr=0.765  Accuracy=0.9513  Precision=0.9063  F1=0.6732  CM=
[[40854   235]
 [ 1972  2273]]

[XGBoost (Fraud)]  AUC-PR=0.7061 | ROC-AUC=0.8392
@0.5 :  Accuracy=0.9169  Precision=0.5440  F1=0.6113  CM=
[[38607  2482]
 [ 1284  2961]]
@best:  thr=0.872  Accuracy=0.9561  Precision=0.9982  F1=0.6940  CM=
[[41085     4]
 [ 1987  2258]]

[LogReg (CreditCard)]  AUC-PR=0.7040 | ROC-AUC=0.9680
@0.5 :  Accuracy=0.9786  Precision=0.0670  F1=0.1245  CM=
[[83484  1811]
 [   18   130]]
@best:  thr=1.000  Accuracy=0.9994  Precision=0.8692  F1=0.8129  CM=
[[85278    17]
 [   35   113]]

[XGBoost (CreditCard)]  AUC-PR=0.8462 | ROC-AUC=0.9739
@0.5 :  Accuracy=0.9995  Precision=0.9008  F1=0.8459  CM=
[[85282    13]
 [   30   118]]
@best:  thr=0.896  Accuracy=0.9995  Precision=0.9500  F1=0.8507  CM=
[[85289     6]
 [   34   114]]

=== Fraud_Data ===
Best by AUC-