In [0]:

# Load Data
from pyspark.sql import SparkSession

# getOrCreate() returns the session that is already running when there is one,
# otherwise it starts a new session.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Source table, referenced by its catalog name. This only builds a DataFrame
# handle; no action that materialises rows is triggered in this cell.
source_table = "default.creditcard_cleaned"
df = spark.table(source_table)
print("Data loaded successfully.")


Data loaded successfully.


In [0]:
# Preprocessing
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Columns that must never be used as model inputs: the label, plus the two
# columns this cell itself produces (they are only present in `df` if the
# source table already carries them).
LABEL_COL = "Class"
DERIVED_COLS = ["features_unscaled", "features"]

# Drop stale derived columns BEFORE choosing the assembler inputs. Doing it the
# other way round would list "features_unscaled" as an input and then remove it,
# so transform() would fail on a missing column. DataFrame.drop() ignores names
# that are not in the schema, so this is safe to re-run and leaves `df` itself
# untouched.
df_base = df.drop(*DERIVED_COLS)
feature_cols = [c for c in df_base.columns if c != LABEL_COL]

# Combine the remaining columns into one vector column.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_unscaled"
)
df_vec = assembler.transform(df_base)

# Scale each feature to unit standard deviation (no mean-centering).
# NOTE(review): the scaler is fitted on ALL rows, i.e. before the train/test
# split performed in a later cell, so test-set statistics influence the scaling.
# Fitting on the training split only would avoid that leakage, but it needs a
# coordinated change to the split cell.
scaler = StandardScaler(inputCol="features_unscaled", outputCol="features", withStd=True, withMean=False)
# Keep the fitted scaler: it is required to transform new data the same way.
scaler_model = scaler.fit(df_vec)
df_scaled = scaler_model.transform(df_vec)

print("Data scaled successfully.")


Data scaled successfully.


In [0]:
# Train-Test Split
# Weights [0.8, 0.2] request an approximately 80/20 random partition of the rows;
# seed=42 fixes the random seed used for the sampling.
split_weights = [0.8, 0.2]
splits = df_scaled.randomSplit(split_weights, seed=42)
train_df, test_df = splits


In [0]:
# Train Model
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Configure the estimator and the evaluator up front.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Class",
)
evaluator = BinaryClassificationEvaluator(
    labelCol="Class",
    metricName="areaUnderROC",
)

# Fit on the training split, then score the held-out split.
model = lr.fit(train_df)
preds = model.transform(test_df)

# Area under the ROC curve, computed on the test-split predictions.
auc = evaluator.evaluate(preds)
print(f" Model trained. AUC = {auc:.4f}")


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Model trained. AUC = 0.9560


In [0]:
# Save Model
# Only the fitted logistic-regression model is written here; the AUC metric is
# not saved by this cell.

model_path = "/mnt/models/fraud_detection_lr"

# overwrite() lets the save replace whatever already exists at model_path.
model_writer = model.write().overwrite()
model_writer.save(model_path)
print("Model saved successfully.")


Model saved successfully.


In [0]:
# Log with MLflow
import mlflow

# Use the run as a context manager so it is always closed, even when one of the
# logging calls raises. With separate start_run()/end_run() calls, a failure in
# between leaves the run active and it leaks into later cells and re-runs.
with mlflow.start_run():
    # Test-split AUC computed in the training cell.
    mlflow.log_metric("AUC", auc)
    # Store the fitted Spark ML model as a run artifact under this path.
    mlflow.spark.log_model(model, "fraud_detection_model")


2025/10/27 21:51:09 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

2025/10/27 21:51:10 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Print confirmation
# Purely informational: this line prints unconditionally and does not verify
# that the earlier cells actually ran or succeeded.
done_msg = " Fraud Detection Pipeline executed successfully!"
print(done_msg)


 Fraud Detection Pipeline executed successfully!
