In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import mlflow
import mlflow.xgboost



### Load Parquet dataset from feature store

In [0]:
feature_df = spark.read.format('parquet').load("/mnt/final-dataset/")

### Convert to Pandas DataFrame

In [0]:
pdf = feature_df.select("TransactionAmount", "AnomalyScore", "HourOfTransaction", "RecentLoginGapDays", "IsHighAmount", "FraudIndicator").dropna().toPandas()

### Split data

In [0]:
X = pdf.drop("FraudIndicator", axis=1)
y = pdf["FraudIndicator"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Train and evaluate model (tracked via MLflow)

In [0]:
with mlflow.start_run():
    model = XGBClassifier(eval_metric='logloss')
    model.fit(X_train, y_train) 
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]



In [0]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[188   3]
 [  9   0]]


In [0]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       191
           1       0.00      0.00      0.00         9

    accuracy                           0.94       200
   macro avg       0.48      0.49      0.48       200
weighted avg       0.91      0.94      0.93       200



In [0]:
print("\nROC AUC Score:", roc_auc_score(y_test, y_proba))


ROC AUC Score: 0.3973240255962769
