In [1]:
# Import Library
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, mean,expr, avg, stddev
from pyspark.sql.functions import lag, coalesce, lit
from pyspark.sql.functions import to_date, date_format
from pyspark.sql.window import Window
from pyspark.sql.functions import col, unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
spark = SparkSession.builder \
    .appName("Streaming Fraud Detection") \
    .getOrCreate()

In [3]:
data_path = r"credit_card_transactions.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)
df.show(5)

+---+---------------------+----------------+--------+--------+-----+------+-------+---------+--------+---+----------+----------+---------+-----------+--------+---+----+---+-----+
|_c0|trans_date_trans_time|          cc_num|merchant|category|  amt|gender|    lat|     long|city_pop|job|       dob| unix_time|merch_lat| merch_long|is_fraud|age|hour|day|month|
+---+---------------------+----------------+--------+--------+-----+------+-------+---------+--------+---+----------+----------+---------+-----------+--------+---+----+---+-----+
|  0|  2020-06-13 07:35:03|6511349151405438|      29|       4|166.8|     1|39.3426|-114.8859|     450|254|1946-08-24|1371108903|40.088507|-113.895268|       0| 79|   7|  5|    6|
|  1|  2019-09-12 19:09:06|3566094707272327|     536|      11|28.86|     0|34.3795| -118.523|   34882|219|1971-04-25|1347476946|35.356925|-119.348148|       0| 54|  19|  3|    9|
|  2|  2020-02-14 05:31:05|3573030041201292|     153|       2|37.93|     0|40.3207| -110.436|     302|406

In [4]:
df.printSchema()
print(f"Rows: {df.count()}, Columns: {len(df.columns)}")

root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: integer (nullable = true)
 |-- category: integer (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: integer (nullable = true)
 |-- dob: date (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)

Rows: 11, Columns: 20


In [5]:
df.describe().show()

+-------+---------------+--------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+--------------------+------------------+------------------+-------------------+------------------+-----------------+------------------+------------------+
|summary|            _c0|              cc_num|          merchant|         category|               amt|            gender|               lat|              long|         city_pop|               job|           unix_time|         merch_lat|        merch_long|           is_fraud|               age|             hour|               day|             month|
+-------+---------------+--------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+--------------------+------------------+------------------+-------------------+------------------+----

In [6]:
missing_values = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
missing_values.show()
df = df.dropna()

+---+---------------------+------+--------+--------+---+------+---+----+--------+---+---+---------+---------+----------+--------+---+----+---+-----+
|_c0|trans_date_trans_time|cc_num|merchant|category|amt|gender|lat|long|city_pop|job|dob|unix_time|merch_lat|merch_long|is_fraud|age|hour|day|month|
+---+---------------------+------+--------+--------+---+------+---+----+--------+---+---+---------+---------+----------+--------+---+----+---+-----+
|  0|                    0|     0|       0|       0|  0|     0|  0|   0|       0|  0|  0|        0|        0|         0|       0|  0|   0|  0|    0|
+---+---------------------+------+--------+--------+---+------+---+----+--------+---+---+---------+---------+----------+--------+---+----+---+-----+



In [7]:
from pyspark.sql.functions import col, to_date

# Convert to timestamp 
df = df.withColumn("trans_date_trans_time", col("trans_date_trans_time").cast("timestamp"))

# Create a new column containing only the date (while keeping the original column).
df = df.withColumn("trans_date", to_date(col("trans_date_trans_time")))
df.show(5)

+---+---------------------+----------------+--------+--------+-----+------+-------+---------+--------+---+----------+----------+---------+-----------+--------+---+----+---+-----+----------+
|_c0|trans_date_trans_time|          cc_num|merchant|category|  amt|gender|    lat|     long|city_pop|job|       dob| unix_time|merch_lat| merch_long|is_fraud|age|hour|day|month|trans_date|
+---+---------------------+----------------+--------+--------+-----+------+-------+---------+--------+---+----------+----------+---------+-----------+--------+---+----+---+-----+----------+
|  0|  2020-06-13 07:35:03|6511349151405438|      29|       4|166.8|     1|39.3426|-114.8859|     450|254|1946-08-24|1371108903|40.088507|-113.895268|       0| 79|   7|  5|    6|2020-06-13|
|  1|  2019-09-12 19:09:06|3566094707272327|     536|      11|28.86|     0|34.3795| -118.523|   34882|219|1971-04-25|1347476946|35.356925|-119.348148|       0| 54|  19|  3|    9|2019-09-12|
|  2|  2020-02-14 05:31:05|3573030041201292|     1

In [8]:
# order the dates in ascending order
df = df.orderBy('trans_date_trans_time')
# Calculate split index
split_index = int(df.count() * 0.8)

# Split the dataset into training and testing sets
train = df.limit(split_index)  # Take the first 80% of rows
test = df.subtract(train)      # Subtract the training set from the original DataFrame to get the test set
test_copy = test.select("*") 
# Display row counts of the resulting DataFrames to verify the split
print(f"Training set row count: {train.count()}")
print(f"Testing set row count: {test.count()}")
test.show(5)

Training set row count: 8
Testing set row count: 3
+---+---------------------+----------------+--------+--------+-----+------+-------+---------+--------+---+----------+----------+---------+-----------+--------+---+----+---+-----+----------+
|_c0|trans_date_trans_time|          cc_num|merchant|category|  amt|gender|    lat|     long|city_pop|job|       dob| unix_time|merch_lat| merch_long|is_fraud|age|hour|day|month|trans_date|
+---+---------------------+----------------+--------+--------+-----+------+-------+---------+--------+---+----------+----------+---------+-----------+--------+---+----+---+-----+----------+
|  7|  2020-08-23 10:10:29|4147608975828480|     548|       2|46.64|     1|44.1111| -94.9134|     914|218|1944-07-26|1377252629|43.255179| -94.744586|       0| 81|  10|  6|    8|2020-08-23|
|  6|  2020-11-29 09:34:32|3560697798177746|     518|       2|75.57|     1|33.7163|-116.3381|    4677|265|1955-05-06|1385717672|34.696349|-115.852896|       0| 70|   9|  6|   11|2020-11-29|

In [9]:
feature_columns = [ "merchant", "category", "amt", "gender", "lat", "long", "city_pop", "job", "unix_time", "merch_lat", "merch_long", "age", "hour", "day", "month"]

In [10]:
# Split data chronologically (80% train, 20% test)
total_rows = df.count()
train_rows = int(total_rows * 0.8)
train_df = df.orderBy("trans_date_trans_time").limit(train_rows)
test_df = df.orderBy("trans_date_trans_time").exceptAll(train_df)

In [11]:
print(f"Training set row count: {train_df.count()}")
print(f"Testing set row count: {test_df.count()}")

Training set row count: 8
Testing set row count: 3


In [12]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import unix_timestamp

# Convert the timestamp to a Unix timestamp (if not already).
df = df.withColumn("unix_time", unix_timestamp("trans_date_trans_time").cast("int"))

# Create a data processing pipeline.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[assembler, scaler])

# Train the pipeline on the dataset.
transformer = pipeline.fit(df)

# Transform the data.
df = transformer.transform(df).select("trans_date_trans_time", "scaled_features", "is_fraud")
train_set = transformer.transform(train_df).select("trans_date_trans_time", "scaled_features", "is_fraud")
test_set = transformer.transform(test_df).select("trans_date_trans_time", "scaled_features", "is_fraud")

train_set.show(5)


+---------------------+--------------------+--------+
|trans_date_trans_time|     scaled_features|is_fraud|
+---------------------+--------------------+--------+
|  2019-05-02 10:59:01|[0.90136570561456...|       0|
|  2019-09-12 19:09:06|[0.76934749620637...|       0|
|  2020-02-14 05:31:05|[0.18816388467374...|       0|
|  2020-04-11 12:47:16|[0.09408194233687...|       0|
|  2020-05-08 00:33:15|[0.98330804248861...|       0|
+---------------------+--------------------+--------+
only showing top 5 rows



# <div style="text-align: center; background-color: white; font-family: Lobster;color: black; padding: 14px; line-height: 1;border-radius:12px"> Logistic Regression</div>

In [13]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Logistic Regression Model
logistic_regressor = LogisticRegression(
    featuresCol='scaled_features', 
    labelCol='is_fraud'
)

# Use `BinaryClassificationEvaluator` instead of `RegressionEvaluator`.
evaluator = BinaryClassificationEvaluator(
    labelCol='is_fraud', 
    rawPredictionCol='prediction', 
    metricName='areaUnderROC'  # Có thể đổi thành 'areaUnderPR' nếu cần
)

paramGrid = ParamGridBuilder() \
    .addGrid(logistic_regressor.regParam, [0.001, 0.01, 0.1]) \
    .addGrid(logistic_regressor.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(logistic_regressor.maxIter, [50, 100, 300]) \
    .addGrid(logistic_regressor.tol, [1e-6, 1e-4, 1e-2]) \
    .build()

In [14]:
# Cross Validation with numFolds=3
crossval = CrossValidator(
    estimator=logistic_regressor,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)

# Train the best model on the training set.
cv_model = crossval.fit(train_set)

# Retrieve the best model from Cross Validation.
best_model = cv_model.bestModel

print("Best Model Params:")
print("  Regularization Param (regParam):", best_model.getRegParam())
print("  ElasticNet Param (elasticNetParam):", best_model.getElasticNetParam())
print("  Maximum Iterations (maxIter):", best_model.getMaxIter())
print("  Tolerance (tol):", best_model.getTol())
print("  Threshold:", best_model.getThreshold())

Best Model Params:
  Regularization Param (regParam): 0.001
  ElasticNet Param (elasticNetParam): 0.0
  Maximum Iterations (maxIter): 50
  Tolerance (tol): 1e-06
  Threshold: 0.5


In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Make predictions on the test set.
prediction_test = best_model.transform(test_set)

# Evaluate the model based on AUC-ROC and AUC-PR.
evaluator_roc = BinaryClassificationEvaluator(
    labelCol="is_fraud",
    rawPredictionCol="prediction",
    metricName="areaUnderROC"
)

evaluator_pr = BinaryClassificationEvaluator(
    labelCol="is_fraud",
    rawPredictionCol="prediction",
    metricName="areaUnderPR"
)

# Calculate the evaluation metrics.
auc_roc = evaluator_roc.evaluate(prediction_test)
auc_pr = evaluator_pr.evaluate(prediction_test)

print("AUC-ROC:", auc_roc)
print("AUC-PR:", auc_pr)


AUC-ROC: 0.0
AUC-PR: 0.0


In [16]:
predictions = test_set.join(prediction_test.select("trans_date_trans_time","prediction"), on="trans_date_trans_time", how="left")
predictions.show(5)

+---------------------+--------------------+--------+----------+
|trans_date_trans_time|     scaled_features|is_fraud|prediction|
+---------------------+--------------------+--------+----------+
|  2020-08-23 10:10:29|[0.78755690440060...|       0|       0.0|
|  2020-11-29 09:34:32|[0.74203338391502...|       0|       0.0|
|  2020-09-05 02:31:11|[1.0,0.6666666666...|       0|       0.0|
+---------------------+--------------------+--------+----------+



In [17]:
predictions = predictions.toPandas()
predictions = predictions.drop("scaled_features", axis=1)
predictions.head(5)

Unnamed: 0,trans_date_trans_time,is_fraud,prediction
0,2020-08-23 10:10:29,0,0.0
1,2020-11-29 09:34:32,0,0.0
2,2020-09-05 02:31:11,0,0.0


In [18]:
# Import Library new for Random Forest Classifier, Gradient-Boosted Trees (GBT) Classifier, Decision Tree Classifier
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


# <div style="text-align: center; background-color: white; font-family: Lobster;color: black; padding: 14px; line-height: 1;border-radius:12px"> Random Forest Classifier</div>

In [19]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Random Forest Classifier
rf_classifier = RandomForestClassifier(
    featuresCol='scaled_features', 
    labelCol='is_fraud'
)

# Evaluator use AUC-ROC
evaluator = BinaryClassificationEvaluator(
    labelCol='is_fraud', 
    rawPredictionCol='prediction', 
    metricName='areaUnderROC'
)

# Create grid search for hyperparameters
paramGrid = ParamGridBuilder() \
    .addGrid(rf_classifier.numTrees, [50, 100, 200]) \
    .addGrid(rf_classifier.maxDepth, [5, 10, 20]) \
    .addGrid(rf_classifier.minInstancesPerNode, [1, 5, 10]) \
    .build()

# Cross-validation with numFolds=3
crossval = CrossValidator(
    estimator=rf_classifier,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)


cv_model = crossval.fit(train_set)


best_model = cv_model.bestModel

print("Best Model Params:")
print("  Num Trees:", best_model.getNumTrees)
print("  Max Depth:", best_model.getMaxDepth())


prediction_test = best_model.transform(test_set)


auc_roc = evaluator.evaluate(prediction_test)
print("AUC-ROC:", auc_roc)


Best Model Params:
  Num Trees: 50
  Max Depth: 5
AUC-ROC: 0.0


# <div style="text-align: center; background-color: white; font-family: Lobster;color: black; padding: 14px; line-height: 1;border-radius:12px"> LSTM</div>

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import numpy as np

# Convert PySpark data to NumPy for use with TensorFlow
train_features = np.array(train_set.select("scaled_features").toPandas()["scaled_features"].tolist())
train_labels = np.array(train_set.select("is_fraud").toPandas()["is_fraud"])

test_features = np.array(test_set.select("scaled_features").toPandas()["scaled_features"].tolist())
test_labels = np.array(test_set.select("is_fraud").toPandas()["is_fraud"])

# Check the size before reshaping
print("Train Features Shape:", train_features.shape)
print("Test Features Shape:", test_features.shape)

# Reshape data for LSTM
train_features = train_features.reshape((train_features.shape[0], train_features.shape[1], 1))
test_features = test_features.reshape((test_features.shape[0], test_features.shape[1], 1))

# Create LSTM
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(train_features.shape[1], 1)),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
model.fit(train_features, train_labels, epochs=10, batch_size=32, validation_data=(test_features, test_labels))

# Evaluate
loss, accuracy = model.evaluate(test_features, test_labels)
print("LSTM Model Accuracy:", accuracy)


Train Features Shape: (8, 15)
Test Features Shape: (3, 15)


  super().__init__(**kwargs)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - accuracy: 0.7500 - loss: 0.6903 - val_accuracy: 1.0000 - val_loss: 0.6675
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.8750 - loss: 0.6773 - val_accuracy: 1.0000 - val_loss: 0.6517
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step - accuracy: 0.8750 - loss: 0.6701 - val_accuracy: 1.0000 - val_loss: 0.6354
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 311ms/step - accuracy: 0.8750 - loss: 0.6538 - val_accuracy: 1.0000 - val_loss: 0.6181
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 0.8750 - loss: 0.6405 - val_accuracy: 1.0000 - val_loss: 0.5994
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - accuracy: 0.8750 - loss: 0.6335 - val_accuracy: 1.0000 - val_loss: 0.5788
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━

In [21]:
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator

# 1️⃣ Logistic Regression
lr = LogisticRegression(featuresCol="scaled_features", labelCol="is_fraud")
lr_model = lr.fit(train_set)
prediction_test = lr_model.transform(test_set)

# 2️⃣ Random Forest Classifier
rf = RandomForestClassifier(featuresCol="scaled_features", labelCol="is_fraud", numTrees=50)
rf_model = rf.fit(train_set)
rf_predictions = rf_model.transform(test_set)

# 3️⃣ lSTM


# Dictionary containing models and their corresponding predictions
models = {
    "Logistic Regression": prediction_test, 
    "Random Forest": rf_predictions, 
}

# Define the evaluators
evaluator_roc = BinaryClassificationEvaluator(
    labelCol="is_fraud",
    rawPredictionCol="rawPrediction",  # Sửa lại rawPrediction thay vì prediction
    metricName="areaUnderROC"
)

evaluator_pr = BinaryClassificationEvaluator(
    labelCol="is_fraud",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderPR"
)

evaluator_mse = RegressionEvaluator(
    labelCol="is_fraud",
    predictionCol="prediction",
    metricName="mse"
)

evaluator_rmse = RegressionEvaluator(
    labelCol="is_fraud",
    predictionCol="prediction",
    metricName="rmse"
)

evaluator_mae = RegressionEvaluator(
    labelCol="is_fraud",
    predictionCol="prediction",
    metricName="mae"
)

evaluator_r2 = RegressionEvaluator(
    labelCol="is_fraud",
    predictionCol="prediction",
    metricName="r2"
)

# Evaluate
for model_name, predictions in models.items():
    auc_roc = evaluator_roc.evaluate(predictions)
    auc_pr = evaluator_pr.evaluate(predictions)
    mse = evaluator_mse.evaluate(predictions)
    rmse = evaluator_rmse.evaluate(predictions)
    mae = evaluator_mae.evaluate(predictions)
    r2 = evaluator_r2.evaluate(predictions)
    
    print(f"\n🔹 {model_name} Evaluation Metrics:")
    print(f"   ✅ AUC-ROC: {auc_roc:.4f}")
    print(f"   ✅ AUC-PR: {auc_pr:.4f}")
    print(f"   ✅ MSE: {mse:.4f}")
    print(f"   ✅ RMSE: {rmse:.4f}")
    print(f"   ✅ MAE: {mae:.4f}")
    print(f"   ✅ R² Score: {r2:.4f}")



🔹 Logistic Regression Evaluation Metrics:
   ✅ AUC-ROC: 0.0000
   ✅ AUC-PR: 0.0000
   ✅ MSE: 0.0000
   ✅ RMSE: 0.0000
   ✅ MAE: 0.0000
   ✅ R² Score: nan

🔹 Random Forest Evaluation Metrics:
   ✅ AUC-ROC: 0.0000
   ✅ AUC-PR: 0.0000
   ✅ MSE: 0.0000
   ✅ RMSE: 0.0000
   ✅ MAE: 0.0000
   ✅ R² Score: nan
