In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import torch

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, mean,expr, avg, stddev
from pyspark.sql.functions import lag, coalesce, lit
from pyspark.sql.functions import to_date, date_format, to_timestamp
from pyspark.sql.window import Window
from pyspark.sql.functions import col, unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import warnings 
warnings.filterwarnings('ignore')

In [33]:
# Environment sanity check: report the PyTorch build and whether CUDA is usable.
print(torch.__version__)  # installed PyTorch version
cuda_available = torch.cuda.is_available()
print(cuda_available)  # True when a CUDA device can be used
if cuda_available:
    print(torch.cuda.get_device_name(0))  # name of the first CUDA device
else:
    # get_device_name(0) raises when no CUDA device is present, which would
    # abort a "Run All" on CPU-only machines; report the situation instead.
    print("No CUDA device available")

2.5.1+cu121
True
NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [34]:
# Obtain the notebook's SparkSession (reuses an already-running session if one exists).
spark = (
    SparkSession.builder
    .appName("Streaming Fraud Detection System")
    .getOrCreate()
)

In [35]:
# Locations of the raw train/test CSV files.
# NOTE: this is an absolute, machine-specific directory; adjust it when running elsewhere.
raw_data_dir = 'D:/Data Science/Big Data Technology/Project/Streaming-Fraud-Detection/Streaming-Fraud-Detection/data/raw'
train_path = f'{raw_data_dir}/fraudTrain.csv'
test_path = f'{raw_data_dir}/fraudTest.csv'

In [36]:
# Both files are read the same way: first row is the header, column types are inferred.
csv_read_options = {"header": True, "inferSchema": True}
train_data = spark.read.csv(train_path, **csv_read_options)
test_data = spark.read.csv(test_path, **csv_read_options)

In [37]:
# Stack the two files into a single frame.
# union() pairs columns purely by position; because each CSV had its schema
# inferred independently, match columns by name instead so that a differing
# column order or name fails loudly rather than silently misaligning data.
df = train_data.unionByName(test_data)
df.show(5)

+---+---------------------+----------------+--------------------+-------------+------+---------+-------+------+--------------------+--------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+----------+------------------+-----------+--------+
|_c0|trans_date_trans_time|          cc_num|            merchant|     category|   amt|    first|   last|gender|              street|          city|state|  zip|    lat|     long|city_pop|                 job|       dob|           trans_num| unix_time|         merch_lat| merch_long|is_fraud|
+---+---------------------+----------------+--------------------+-------------+------+---------+-------+------+--------------------+--------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+----------+------------------+-----------+--------+
|  0|  2019-01-01 00:00:18|2703186189652095|fraud_Rippin, Kub...|     misc_net|  4.97| Jennifer|  Banks|     F|      561 Perry 

In [38]:
# Show the inferred column types, then the overall shape of the combined frame.
df.printSchema()
n_rows = df.count()
n_cols = len(df.columns)
print(f"Rows: {n_rows}, Columns: {n_cols}")

root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)

Rows: 1852394, Columns: 23


In [39]:
# Summary statistics for the combined frame.
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+--------------------+-------------------+-------------+------------------+-------+-------+-------+--------------------+-------+-------+------------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-----------------+------------------+--------------------+
|summary|               _c0|              cc_num|           merchant|     category|               amt|  first|   last| gender|              street|   city|  state|               zip|              lat|              long|         city_pop|               job|           trans_num|           unix_time|        merch_lat|        merch_long|            is_fraud|
+-------+------------------+--------------------+-------------------+-------------+------------------+-------+-------+-------+--------------------+-------+-------+------------------+-----------------+------------------+-----------------+------------------+--------------------+---------

In [40]:
# Count NULLs per column: when() yields a non-null value only for NULL cells,
# and count() ignores nulls, so each aggregate is the number of missing entries.
null_counters = []
for column_name in df.columns:
    is_missing = col(column_name).isNull()
    null_counters.append(count(when(is_missing, column_name)).alias(column_name))

missing_values = df.select(null_counters)
missing_values.show()

# Discard every row that has a NULL in any column.
df = df.na.drop()

+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|_c0|trans_date_trans_time|cc_num|merchant|category|amt|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|trans_num|unix_time|merch_lat|merch_long|is_fraud|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|  0|                    0|     0|       0|       0|  0|    0|   0|     0|     0|   0|    0|  0|  0|   0|       0|  0|  0|        0|        0|        0|         0|       0|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+



In [None]:
from pyspark.sql.functions import col, year, month, dayofmonth, hour, to_date, to_timestamp

# Normalise the two temporal columns.
# The raw values look like "2019-01-01 00:00:18", so the pattern must be
# year-first with dashes; a day-first "dd/MM/yyyy" pattern does not match this
# data and would not parse it if the column were ever read as a string.
df = df.withColumn("trans_date_trans_time", to_timestamp(col("trans_date_trans_time"), "yyyy-MM-dd HH:mm:ss"))
df = df.withColumn("dob", to_date(col("dob"), "yyyy-MM-dd"))

# Derived features taken from the transaction timestamp.
# NOTE: "age" is the difference of calendar years (transaction year minus birth
# year); it does not check whether the birthday has already occurred that year.
df = df.withColumn("age", year(col("trans_date_trans_time")) - year(col("dob"))) \
       .withColumn("hour", hour(col("trans_date_trans_time"))) \
       .withColumn("day", dayofmonth(col("trans_date_trans_time"))) \
       .withColumn("month", month(col("trans_date_trans_time")))

In [42]:
# Arrange rows chronologically (ascending transaction time) and preview the result.
df = df.sort("trans_date_trans_time")
df.show(n=5)

+---+---------------------+----------------+--------------------+-------------+------+---------+-------+------+--------------------+--------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+----------+------------------+-----------+--------+---+----+---+-----+
|_c0|trans_date_trans_time|          cc_num|            merchant|     category|   amt|    first|   last|gender|              street|          city|state|  zip|    lat|     long|city_pop|                 job|       dob|           trans_num| unix_time|         merch_lat| merch_long|is_fraud|age|hour|day|month|
+---+---------------------+----------------+--------------------+-------------+------+---------+-------+------+--------------------+--------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+----------+------------------+-----------+--------+---+----+---+-----+
|  0|  2019-01-01 00:00:18|2703186189652095|fraud_Rippin, Kub...|     

In [43]:
# Name, address and transaction-id columns that are removed from the frame.
columns_to_drop = [
    'first', 'last',
    'street', 'city', 'state', 'zip',
    'trans_num',
]
kept_columns = [name for name in df.columns if name not in columns_to_drop]
df = df.select(kept_columns)
df.show(n=5)

+---+---------------------+----------------+--------------------+-------------+------+------+-------+---------+--------+--------------------+----------+----------+------------------+-----------+--------+---+----+---+-----+
|_c0|trans_date_trans_time|          cc_num|            merchant|     category|   amt|gender|    lat|     long|city_pop|                 job|       dob| unix_time|         merch_lat| merch_long|is_fraud|age|hour|day|month|
+---+---------------------+----------------+--------------------+-------------+------+------+-------+---------+--------+--------------------+----------+----------+------------------+-----------+--------+---+----+---+-----+
|  0|  2019-01-01 00:00:18|2703186189652095|fraud_Rippin, Kub...|     misc_net|  4.97|     F|36.0788| -81.1781|    3495|Psychologist, cou...|1988-03-09|1325376018|         36.011293| -82.048315|       0| 31|   0|  1|    1|
|  1|  2019-01-01 00:00:44|    630423337322|fraud_Heller, Gut...|  grocery_pos|107.23|     F|48.8878|-118.21

## Encode categorical features (StringIndexer)

In [44]:
from pyspark.ml.feature import StringIndexer

categorical_cols = ['merchant', 'category', 'gender', 'job']

# One StringIndexer per categorical column; each writes "<column>_Indexer".
# The indexers are fitted on the current frame and applied one after another.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_Indexer")
    for name in categorical_cols
]
for stage in indexers:
    fitted_stage = stage.fit(df)
    df = fitted_stage.transform(df)

# The original string columns are no longer needed once indexed.
df = df.drop(*categorical_cols)

df.show(5)

+---+---------------------+----------------+------+-------+---------+--------+----------+----------+------------------+-----------+--------+---+----+---+-----+----------------+----------------+--------------+-----------+
|_c0|trans_date_trans_time|          cc_num|   amt|    lat|     long|city_pop|       dob| unix_time|         merch_lat| merch_long|is_fraud|age|hour|day|month|merchant_Indexer|category_Indexer|gender_Indexer|job_Indexer|
+---+---------------------+----------------+------+-------+---------+--------+----------+----------+------------------+-----------+--------+---+----+---+-----+----------------+----------------+--------------+-----------+
|  0|  2019-01-01 00:00:18|2703186189652095|  4.97|36.0788| -81.1781|    3495|1988-03-09|1325376018|         36.011293| -82.048315|       0| 31|   0|  1|    1|           584.0|            11.0|           0.0|      129.0|
|  1|  2019-01-01 00:00:44|    630423337322|107.23|48.8878|-118.2105|     149|1978-06-21|1325376044|49.1590469999999

#### Splitting into Training and Testing Datasets

In [45]:
# Chronological split: roughly the earliest 80% of transactions train the model
# and the remainder is held out for testing.
#
# The split uses one timestamp cutoff rather than limit() + subtract():
#   * limit() on a sorted frame with tied timestamps is not guaranteed to pick
#     the same rows each time the lazy plan is re-evaluated;
#   * subtract() is a set difference (EXCEPT DISTINCT), so it removes duplicate
#     rows and needs a full-row comparison.
# A fixed cutoff makes the two sets disjoint and reproducible. Rows sharing the
# cutoff timestamp all go to the training set, so the ratio is approximately
# (not exactly) 80/20.
train_fraction = 0.8
event_seconds = col("trans_date_trans_time").cast("long")  # epoch seconds

# relativeError=0.0 requests the exact quantile.
cutoff_seconds = (
    df.select(event_seconds.alias("event_seconds"))
      .approxQuantile("event_seconds", [train_fraction], 0.0)[0]
)

train_df = df.filter(event_seconds <= cutoff_seconds)
test_df = df.filter(event_seconds > cutoff_seconds)

# Display row counts of the resulting DataFrames to verify the split
split_index = train_df.count()  # number of rows that ended up in the training set
print(f"Training set row count: {split_index}")
print(f"Testing set row count: {test_df.count()}")

Training set row count: 1481915
Testing set row count: 370479


In [46]:
# Model inputs, in the order they are assembled into the feature vector.
feature_columns = [
    # indexed categoricals and transaction amount
    "merchant_Indexer",
    "category_Indexer",
    "amt",
    "gender_Indexer",
    # card-holder location and profile
    "lat",
    "long",
    "city_pop",
    "job_Indexer",
    # transaction time and merchant location
    "unix_time",
    "merch_lat",
    "merch_long",
    # derived features
    "age",
    "hour",
    "day",
    "month",
]

In [None]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline

# Feature pipeline: pack the feature columns into one vector, then min-max scale it.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[assembler, scaler])

# Fit on the training split only. Fitting the scaler on the full frame would
# let the min/max of the held-out test rows leak into the training features.
# Test values outside the training range simply scale outside [0, 1].
transformer = pipeline.fit(train_df)

# Apply the fitted pipeline and keep only what the models need.
output_columns = ["trans_date_trans_time", "scaled_features", "is_fraud"]
train_set = transformer.transform(train_df).select(*output_columns)
test_set = transformer.transform(test_df).select(*output_columns)

# NOTE: `df` is rebound to its scaled form, so the raw feature columns are gone
# afterwards and this cell cannot be re-run without re-running the cells above.
df = transformer.transform(df).select(*output_columns)

train_set.show(5)


+---------------------+--------------------+--------+
|trans_date_trans_time|     scaled_features|is_fraud|
+---------------------+--------------------+--------+
|  2019-01-01 00:00:18|[0.84393063583815...|       0|
|  2019-01-01 00:00:44|[0.15028901734104...|       0|
|  2019-01-01 00:00:51|[0.52745664739884...|       0|
|  2019-01-01 00:01:16|[0.05491329479768...|       0|
|  2019-01-01 00:03:06|[0.75144508670520...|       0|
+---------------------+--------------------+--------+
only showing top 5 rows



In [48]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Logistic Regression Model
logistic_regressor = LogisticRegression(
    featuresCol='scaled_features',
    labelCol='is_fraud'
)

# Binary-classification evaluator scoring by area under the ROC curve.
# It must read the model's continuous scores ('rawPrediction'): feeding it the
# hard 0/1 'prediction' column collapses the ROC curve to a single operating
# point, so the reported AUC no longer reflects how well the model ranks rows.
evaluator = BinaryClassificationEvaluator(
    labelCol='is_fraud',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC'
)

# Hyper-parameter grid: 3 x 3 x 3 x 3 = 81 candidate settings.
paramGrid = ParamGridBuilder() \
    .addGrid(logistic_regressor.regParam, [0.001, 0.01, 0.1]) \
    .addGrid(logistic_regressor.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(logistic_regressor.maxIter, [50, 100, 300]) \
    .addGrid(logistic_regressor.tol, [1e-6, 1e-4, 1e-2]) \
    .build()

In [None]:
# 3-fold cross-validation over the parameter grid.
# A fixed seed makes the random fold assignment, and therefore the selected
# model, reproducible across runs.
crossval = CrossValidator(
    estimator=logistic_regressor,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42
)

# Run the search on the training set (every grid point is fitted once per fold).
cv_model = crossval.fit(train_set)

# Retrieve the best model from Cross Validation.
best_model = cv_model.bestModel

# Report the hyper-parameters of the selected model.
print("Best Model Params:")
print("  Regularization Param (regParam):", best_model.getRegParam())
print("  ElasticNet Param (elasticNetParam):", best_model.getElasticNetParam())
print("  Maximum Iterations (maxIter):", best_model.getMaxIter())
print("  Tolerance (tol):", best_model.getTol())
print("  Threshold:", best_model.getThreshold())