# I. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import torch
import os
from geopy.distance import geodesic

# Data Processing
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Modeling
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, mean,expr, avg, stddev
from pyspark.sql.functions import lag, coalesce, lit
from pyspark.sql.functions import to_date, date_format, to_timestamp
from pyspark.sql.window import Window
from pyspark.sql.functions import col, unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

ModuleNotFoundError: No module named 'torch'

In [2]:
print(torch.__version__)  # Check pytorch version
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

2.5.1+cu121
True
NVIDIA GeForce RTX 3050 Ti Laptop GPU


# II. Data Invetigating and Cleaning

In [5]:
# Data paths
train_path = r'D:/Data Science/Big Data Technology/Project/Streaming-Fraud-Detection/Streaming-Fraud-Detection/data/raw/fraudTrain.csv'
test_path = r'D:/Data Science/Big Data Technology/Project/Streaming-Fraud-Detection/Streaming-Fraud-Detection/data/raw/fraudTest.csv'   

In [6]:
# Load the data
df_train = pd.read_csv(train_path, low_memory=False, index_col=0)
df_test = pd.read_csv(test_path, low_memory=False, index_col=0)

# Merge data
df = pd.concat([df_train, df_test],ignore_index=True)

# IV. Feature Engineering

In [59]:
# Transforming the transaction date to datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df["dob"] = pd.to_datetime(df["dob"])

In [62]:
# Drop unnecessary columns for modeling
df = df.drop(['first', 'last', 'street', 'city', 'state', 'zip', 'trans_num'],axis=1)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,46.2306,-112.1138,1939,Patent attorney,1967-01-12,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,M,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,1325376186,38.674999,-78.632459,0


In [74]:
# Calculate the age of the cardholder
df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year

# Extract hour, day, and month
df['hour'] = df['trans_date_trans_time'].dt.hour
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
df['month'] = df['trans_date_trans_time'].dt.month

In [None]:
# Calculate the distance between home and merchant locations
#df['distance'] = df.apply(lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).miles, axis=1)

from pyspark.sql.functions import udf
from geopy.distance import geodesic

@udf("double")
def calculate_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

df = df.withColumn("distance", calculate_distance(df.lat, df.long, df.merch_lat, df.merch_long))

Exception ignored in: <function JavaWrapper.__del__ at 0x000001AF9B12A700>
Traceback (most recent call last):
  File "d:\Data Science\Big Data Technology\Project\Streaming-Fraud-Detection\Streaming-Fraud-Detection\venv\lib\site-packages\pyspark\ml\wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x000001AF9B12A700>
Traceback (most recent call last):
  File "d:\Data Science\Big Data Technology\Project\Streaming-Fraud-Detection\Streaming-Fraud-Detection\venv\lib\site-packages\pyspark\ml\wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'


In [76]:
# Amount vs average amount by category
df['amt_vs_category_avg'] = df['amt'] / df.groupby('category')['amt'].transform('mean')

In [77]:
df = df.drop(columns=['day', 'dob'])
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,age,hour,month,day_of_week,distance,amt_vs_category_avg
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0,31,0,1,1,48.947783,0.061984
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0,41,0,1,1,18.775736,0.919323
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,42.1808,-112.262,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0,57,0,1,1,67.172035,3.431553
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,46.2306,-112.1138,1939,Patent attorney,1325376076,47.034331,-112.561071,0,52,0,1,1,59.455974,0.708915
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,M,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,38.674999,-78.632459,0,33,0,1,1,48.28203,0.66947


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   merchant               object        
 3   category               object        
 4   amt                    float64       
 5   gender                 object        
 6   lat                    float64       
 7   long                   float64       
 8   city_pop               int64         
 9   job                    object        
 10  unix_time              int64         
 11  merch_lat              float64       
 12  merch_long             float64       
 13  is_fraud               int64         
 14  age                    int32         
 15  hour                   int32         
 16  month                  int32         
 17  day_of_week            int32         
 18  distance              

# V. Data Processing

### 1. Encoding object columns

In [79]:
encoder_dir = r"D:\Data Science\Big Data Technology\Project\Streaming-Fraud-Detection\Streaming-Fraud-Detection\Encoder"
encoder_path = os.path.join(encoder_dir, "LE_model_v1.pkl")

os.makedirs(encoder_dir, exist_ok=True)

cols = ['merchant', 'category', 'gender', 'job']

def encode(df):
    df_obj = df.select_dtypes(include=['object'])
    encoders = {}
    for col in cols:
        encoder = LabelEncoder()
        df[col + '_indexer'] = encoder.fit_transform(df[col])
        encoders[col] = encoder
    
    with open(encoder_path, 'wb') as f:
        pickle.dump(encoders, f)
    
    return df

df = encode(df)
df = df.drop(cols, axis=1)

print(f"Encoders saved at: {encoder_path}")

Encoders saved at: D:\Data Science\Big Data Technology\Project\Streaming-Fraud-Detection\Streaming-Fraud-Detection\Encoder\LE_model_v1.pkl


In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   amt                    float64       
 3   lat                    float64       
 4   long                   float64       
 5   city_pop               int64         
 6   unix_time              int64         
 7   merch_lat              float64       
 8   merch_long             float64       
 9   is_fraud               int64         
 10  age                    int32         
 11  hour                   int32         
 12  month                  int32         
 13  day_of_week            int32         
 14  distance               float64       
 15  amt_vs_category_avg    float64       
 16  merchant_indexer       int64         
 17  category_indexer       int64         
 18  gender_indexer        

### 2. Splitting Data intro Training and Testing sets

In [81]:
def train_test_split(df, test_size=0.2):
    # Shuffle the dataframe
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Calculate the number of test samples
    test_count = int(len(df) * test_size)
    
    # Split the dataframe
    df_train = df[:-test_count]
    df_test = df[-test_count:]
    
    return df_train, df_test

# Perform the split
df_train, df_test = train_test_split(df, test_size=0.2)

# Display the shapes of the resulting dataframes
print(f"Training set shape: {df_train.shape}")
print(f"Testing set shape: {df_test.shape}")

Training set shape: (1481916, 20)
Testing set shape: (370478, 20)


In [82]:
feature_cols = [col for col in df.columns if col not in ["trans_date_trans_time", "is_fraud"]]
target_col = "is_fraud"

In [83]:
X_train = df_train[feature_cols]
y_train = df_train[target_col]

X_test = df_test[feature_cols]
y_test = df_test[target_col]

### 3. OverSampling (Process Imbalanced Data)

The rule of thumb is: never mess up with your test set. Always split into test and train sets BEFORE trying oversampling/undersampling techniques!

Oversampling before splitting the data can allow the exact same observations to be present in both the test and train sets. This can allow model to simply memorize specific data points and cause overfitting and poor generalization to the test data. Data leakage can cause you to create overly optimistic if not completely invalid predictive models.

![](https://dataaspirant.com/wp-content/uploads/2020/08/10-oversampling.png)
Picture Credit: https://dataaspirant.com

In [84]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print('Feature/label dataset for training before applying SMOTE: ', X_train.shape, y_train.shape)
print('Feature/label dataset for training after applying SMOTE: ', X_train_smote.shape, y_train_smote.shape)
print('Distribution of label values after applying SMOTE:\n',pd.Series(y_train_smote).value_counts())



Feature/label dataset for training before applying SMOTE:  (1481916, 18) (1481916,)
Feature/label dataset for training after applying SMOTE:  (2948444, 18) (2948444,)
Distribution of label values after applying SMOTE:
 is_fraud
0    1474222
1    1474222
Name: count, dtype: int64


# VI. Building Model (Spark ML)

In [85]:
# Initialize Spark session
spark = SparkSession.builder.appName("Streaming Fraud Detection") \
        .config("spark.executor.memory", "16g").config("spark.executor.cores", "4") \
        .config("spark.task.cpus", "1").config("spark.driver.memory", "8g") \
        .config("spark.driver.cores", "4").config("spark.executor.resource.gpu.amount", "1") \
        .config("spark.executor.resource.gpu.discoveryScript", "/usr/bin/nvidia-smi") \
        .config("spark.rapids.sql.enabled", "true") \
        .config("spark.rapids.memory.pinnedPool.size", "2G") \
        .config("spark.sql.shuffle.partitions", "200") \
        .getOrCreate()

In [86]:
pandas_df = pd.DataFrame(X_train_smote, columns=X_train.columns)
pandas_df['is_fraud'] = y_train_smote

In [87]:
pandas_df.head()

Unnamed: 0,cc_num,amt,lat,long,city_pop,unix_time,merch_lat,merch_long,age,hour,month,day_of_week,distance,amt_vs_category_avg,merchant_indexer,category_indexer,gender_indexer,job_indexer,is_fraud
0,5359543825610251,59.91,45.7801,-111.1439,18182,1379488419,45.274075,-111.649432,45,7,9,4,42.701449,0.943802,285,2,1,163,0
1,5540636818935089,3.96,42.6911,-71.1605,76383,1386265705,43.356278,-71.008959,35,17,12,5,46.554334,0.050186,284,12,1,216,0
2,2720894374956739,51.17,42.5978,-82.8823,16305,1339759484,42.372483,-83.50802,92,11,6,5,35.549179,0.806115,24,2,0,374,0
3,6011438889172900,2.06,34.2853,-91.3336,5161,1377816625,33.833389,-91.158293,27,22,8,5,32.731,0.026107,531,12,0,147,0
4,60495593109,6.58,32.7699,-96.743,1263321,1343231435,32.458643,-96.577001,77,15,7,3,23.53195,0.05887,318,13,1,460,0


In [88]:
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2948444 entries, 0 to 2948443
Data columns (total 19 columns):
 #   Column               Dtype  
---  ------               -----  
 0   cc_num               int64  
 1   amt                  float64
 2   lat                  float64
 3   long                 float64
 4   city_pop             int64  
 5   unix_time            int64  
 6   merch_lat            float64
 7   merch_long           float64
 8   age                  int32  
 9   hour                 int32  
 10  month                int32  
 11  day_of_week          int32  
 12  distance             float64
 13  amt_vs_category_avg  float64
 14  merchant_indexer     int64  
 15  category_indexer     int64  
 16  gender_indexer       int64  
 17  job_indexer          int64  
 18  is_fraud             int64  
dtypes: float64(7), int32(4), int64(8)
memory usage: 382.4 MB


In [89]:
spark_df = spark.createDataFrame(pandas_df)

In [None]:
feature_cols = [col for col in spark_df.columns if col != "is_fraud"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

spark_df = assembler.transform(spark_df).select("features", "label")

IllegalArgumentException: Output column features already exists.

In [93]:
train_data, val_data = spark_df.randomSplit([0.8, 0.2], seed=42)

In [96]:
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="is_fraud",
    numTrees=100,        
    maxDepth=5,        
    impurity="gini",        
    seed=42,
    subsamplingRate=0.8   
)

In [95]:
model = rf.fit(train_data)

IllegalArgumentException: label does not exist. Available: cc_num, amt, lat, long, city_pop, unix_time, merch_lat, merch_long, age, hour, month, day_of_week, distance, amt_vs_category_avg, merchant_indexer, category_indexer, gender_indexer, job_indexer, is_fraud, features