In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml import Pipeline
from preprocessing.preprocessor import split_data, class_imbalance

%load_ext autoreload
%autoreload 2

In [6]:
spark = SparkSession.builder.appName("Pipeline").getOrCreate()

24/12/06 19:17:23 WARN Utils: Your hostname, Nikhils-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.206 instead (on interface en0)
24/12/06 19:17:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/06 19:17:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/06 19:17:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/06 19:17:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/12/06 19:17:24 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/12/06 19:17:24 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/12/06 19:17:24 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempti

In [7]:
# Load data
data = spark.read.csv("dataset/creditcard.csv", header=True, inferSchema=True).cache()
data.show(5)

24/12/06 19:17:28 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

In [8]:
# Preprocessing

# Create an assembler as the first stage of the pipeline
assembler = VectorAssembler(inputCols=data.columns[:-1], outputCol="features")

# Create a scaler as the second stage of the pipeline
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

## Model Saving and Loading
Since we have already trained a model and saved it, we can load it instead of training a new model. However, we will check if the model exists in the models directory. If it does not exist, we will train a new model and save it.

In [9]:
# Check if we already have a model saved in models directory
# If not, create a new model
# If we have a model saved, load it
rf_model = None 
try:
    rf_model = RandomForestClassificationModel.load("models/credit_card_fraud_detection_model")
except Exception as e:
    print("Model not found. Training a new model.")
    rf_model = RandomForestClassifier(featuresCol="scaled_features", labelCol="Class") 



In [10]:
# Create a pipeline
pipeline = Pipeline(stages=[assembler, scaler, rf_model])

# Pipeline restriction
Now that we are inside the pipeline, the assembling and scaling will happen to the data we feed in

In [11]:
# Split training and testing data
df_balanced = class_imbalance(data)
train_split, test_split = split_data(df_balanced, 42)
train_unsplit, test_unsplit = split_data(data, 42)



Fraudulent transactions: 492
Non-fraudulent transactions: 284315 before balancing


                                                                                

In [12]:
print("Split data")
print(f"Train data count: {train_split.count()}")
print(f"Test data count: {test_split.count()}")

print("\nUnsplit data")
print(f"Train data count: {train_unsplit.count()}")
print(f"Test data count: {test_unsplit.count()}")

Split data
Train data count: 836
Test data count: 188

Unsplit data
Train data count: 228045
Test data count: 56762


In [21]:
# Fit the pipeline with unbalanced data
import time
start = time.perf_counter()
pipeline_model = pipeline.fit(train_unsplit)
end = time.perf_counter()
print(f"Time taken for training: {end-start:.6f}s")

[Stage 46:>                                                         (0 + 8) / 8]

Time taken for training: 1.126900s


                                                                                

In [20]:
# Make predictions
start = time.perf_counter()
predictions = pipeline_model.transform(test_unsplit)
end = time.perf_counter()
print(f"Time taken for prediction: {end-start:.6f}s")

Time taken for prediction: 0.092956s


In [None]:
predictions.show(5)

In [15]:
# Save the pipeline
# pipeline_model.write().overwrite().save("pipeline/credit_card_fraud_detection_pipeline")

In [16]:
predictions.printSchema()

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [17]:
# Evaluate the model

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="Class")
print(f"Area under ROC: {evaluator.evaluate(predictions)}")

Area under ROC: 0.9847613183879208


In [18]:
# Plot ROC curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Convert the predictions to a Pandas DataFrame
predictions_df = predictions.select('Class', 'probability').toPandas()

# Get the false positive rate and true positive rate
y_true = predictions_df.select("Class").rdd.flatMap(lambda x: x).collect()
y_scores = predictions_df.select("probability").rdd.flatMap(lambda x: x[0][1]).collect()

fpr, tpr, _ = roc_curve(y_true, y_scores)

# Calculate the AUC
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label=f'ROC curve (area = {roc_auc:.2f})')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

AttributeError: 'DataFrame' object has no attribute 'select'