In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score,
    f1_score
)
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------------------------
# Data preparation: load the processed CSV, remove unused columns, one-hot
# encode what is left, and carve out a stratified hold-out set.
# ---------------------------------------------------------------------------
fraud_data = pd.read_csv('../content/fraud_data_processed.csv')
print("✅ Dataset loaded.")

# Columns excluded from modelling (identifiers and raw timestamps, judging by
# their names).
drop_cols = ['user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address']

# Drop the excluded columns, then expand the remaining categorical columns into
# indicator columns; drop_first=True omits the first level of each category.
fraud_data = pd.get_dummies(fraud_data.drop(columns=drop_cols), drop_first=True)

# Target is the 'class' column; every other column is a feature.
y = fraud_data['class']
X = fraud_data.drop(columns='class')

# 70/30 split, stratified on the target so both partitions keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Evaluation function
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Print evaluation metrics for a fitted binary classifier and plot its PR curve.

    Prints the confusion matrix, the classification report, the F1 score and
    the average precision (reported as "AUC-PR"), then shows a
    precision-recall curve in a new matplotlib figure.

    Args:
        model: Fitted estimator exposing ``predict`` and either
            ``predict_proba`` or ``decision_function``.
        X_test: Feature matrix to evaluate on.
        y_test: True labels aligned with ``X_test``.
        model_name: Label used in the printed header, the plot title and the
            legend entry.

    Returns:
        dict: ``{"f1": <F1 score>, "auc_pr": <average precision>}`` so that
        several models can be compared programmatically instead of only by
        reading the printed output.
    """
    y_pred = model.predict(X_test)

    # Ranking metrics need a continuous score for the positive class. Prefer
    # the probability of the second class column; fall back to the raw decision
    # scores for estimators that do not implement predict_proba.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    f1 = f1_score(y_test, y_pred)
    avg_precision = average_precision_score(y_test, y_score)

    print(f"\n📊 Evaluation for {model_name}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-PR: {avg_precision:.4f}")

    # Precision-Recall Curve (thresholds are not needed for the plot).
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision, label=f'{model_name} (AUC-PR={avg_precision:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve: {model_name}')
    plt.legend()
    plt.grid()
    plt.show()

    return {"f1": f1, "auc_pr": avg_precision}

# Train three baseline classifiers on the training split and report each one on
# the held-out split, printing a separator banner after every report.
def _fit_and_report(estimator, label, footer):
    """Fit ``estimator`` on the training split, evaluate it, then print ``footer``."""
    estimator.fit(X_train, y_train)
    evaluate_model(estimator, X_test, y_test, label)
    print(footer)


# Logistic regression with a raised iteration cap (max_iter=1000).
log_reg = LogisticRegression(max_iter=1000)
_fit_and_report(
    log_reg,
    "Logistic Regression",
    "End of logistic regression ####################################################################",
)

# Random forest with 100 trees and a fixed seed.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
_fit_and_report(
    rf_clf,
    "Random Forest",
    "End of Random Forest ####################################################################",
)

# Gradient-boosted trees via XGBoost, evaluated internally with log-loss.
# NOTE(review): `use_label_encoder` appears to be deprecated in newer XGBoost
# releases — confirm against the installed version before removing it.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
_fit_and_report(
    xgb_clf,
    "XGBoost Classifier",
    "End of xgBoost ####################################################################",
)
