# Credit Card Fraud Processing using PySpark
This notebook loads and processes raw JSON data related to credit card transactions and prepares it for fraud analysis.

In [None]:
# Import required PySpark modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Build the Spark session for this notebook (reuses an active session if one exists)
spark = (
    SparkSession.builder
    .appName("CreditCardFraudProcessing")
    .getOrCreate()
)

## Load Raw JSON Data
We load the raw JSON data and inspect its schema and a few sample records.

In [None]:
# Read the raw transaction records from JSON into a DataFrame
raw_json_path = "path/to/data.json"  # Replace with actual path
df_raw = spark.read.json(raw_json_path)

# Inspect the inferred schema, then preview the first three rows without truncation
df_raw.printSchema()
df_raw.show(n=3, truncate=False)

## Parse `personal_detail` JSON String
The `personal_detail` field holds a JSON-encoded string, so we parse it into a struct column using an explicitly defined schema.

In [None]:
# Schema for personal_detail: every attribute is declared as a nullable string
personal_field_names = [
    "person_name",
    "gender",
    "address",
    "lat",
    "long",
    "city_pop",
    "job",
    "dob",
]
personal_schema = StructType(
    [StructField(field_name, StringType()) for field_name in personal_field_names]
)

# Decode the personal_detail JSON string into a struct column and drop the raw string
personal_detail_struct = from_json(col("personal_detail"), personal_schema)
df = df_raw.withColumn("personal_detail_json", personal_detail_struct).drop("personal_detail")

## Parse Nested Address Field
We further parse the `address` field within `personal_detail`.

In [None]:
# Schema for the nested address: four nullable string attributes
address_field_names = ["street", "city", "state", "zip"]
address_schema = StructType(
    [StructField(field_name, StringType()) for field_name in address_field_names]
)

# Decode the address string held inside the parsed personal_detail struct
address_struct = from_json(col("personal_detail_json.address"), address_schema)
df = df.withColumn("address_json", address_struct)

## Flatten Nested Fields
We extract and flatten the relevant fields to prepare for analysis.

In [None]:
# Split person_name once on any of the delimiters ',', '@' or '/';
# element 0 becomes the first name and element 1 the last name.
name_parts = split(col("personal_detail_json.person_name"), "[,@/]")

# Promote the nested attributes to top-level columns (lat/long/city_pop are cast
# from string to numeric types), then discard the intermediate struct columns.
df = (
    df
    .withColumn("first", name_parts.getItem(0))
    .withColumn("last", name_parts.getItem(1))
    .withColumn("gender", col("personal_detail_json.gender"))
    .withColumn("dob", col("personal_detail_json.dob"))
    .withColumn("street", col("address_json.street"))
    .withColumn("city", col("address_json.city"))
    .withColumn("state", col("address_json.state"))
    .withColumn("zip", col("address_json.zip"))
    .withColumn("lat", col("personal_detail_json.lat").cast("double"))
    .withColumn("long", col("personal_detail_json.long").cast("double"))
    .withColumn("city_pop", col("personal_detail_json.city_pop").cast("int"))
    .withColumn("job", col("personal_detail_json.job"))
    .drop("personal_detail_json", "address_json")
)

## Convert Epoch Timestamps
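Convert the merchant-related epoch-microsecond columns (`merch_last_update_time`, `merch_eff_time`) to timestamps shifted to the Asia/Kuala_Lumpur time zone, and parse `trans_date_trans_time` into a timestamp.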

In [None]:
# Helper: epoch microseconds -> timestamp shifted to Asia/Kuala_Lumpur
def convert_epoch_microseconds(colname):
    """Build a column expression that turns an epoch-microsecond column into a timestamp.

    Args:
        colname: Name of the column holding the epoch value in microseconds.

    Returns:
        A Column: the value divided by 1,000,000, cast to a timestamp, and then
        passed through ``from_utc_timestamp`` with the "Asia/Kuala_Lumpur" zone.
    """
    epoch_seconds = col(colname) / 1000000
    utc_timestamp = epoch_seconds.cast("timestamp")
    # NOTE(review): from_utc_timestamp treats its input as UTC wall-clock time;
    # presumably the Spark session time zone is UTC here — confirm.
    return from_utc_timestamp(utc_timestamp, "Asia/Kuala_Lumpur")

# Parse the transaction time into a timestamp and convert both merchant
# epoch-microsecond columns in place (column names are unchanged).
df = (
    df
    .withColumn("trans_date_trans_time", to_timestamp(col("trans_date_trans_time")))
    .withColumn("merch_last_update_time", convert_epoch_microseconds("merch_last_update_time"))
    .withColumn("merch_eff_time", convert_epoch_microseconds("merch_eff_time"))
)

## Mask Credit Card Numbers
Add a `cc_num_masked` column holding the SHA-256 hash of `cc_num`. Note that the original `cc_num` column is left in the DataFrame by this step.

In [None]:
# Mask sensitive credit card number by adding its SHA-256 hex digest as cc_num_masked.
# sha2 only accepts string/binary input, so cc_num is cast to string first: this is a
# no-op when the column is already a string and avoids a data-type-mismatch analysis
# error when JSON schema inference typed cc_num as a number.
# NOTE(review): the raw cc_num column is still present after this step, and an
# unsalted hash of a card number can be brute-forced — confirm whether cc_num
# should be dropped and/or a keyed hash used before this data is shared.
df = df.withColumn("cc_num_masked", sha2(col("cc_num").cast("string"), 256))

## Show Sample Processed Records

In [None]:
# Preview five processed rows: name parts, gender, amount, category and fraud label
preview_columns = ["first", "last", "gender", "amt", "category", "is_fraud"]
df.select(*preview_columns).show(n=5)

## Visualize Fraud Distribution
We use Seaborn to plot the number of transactions in each merchant category, split by the `is_fraud` flag.

In [None]:
# Collect the plotting columns to the driver as a pandas DataFrame
pdf = df.select("category", "amt", "is_fraud").toPandas()

import seaborn as sns
import matplotlib.pyplot as plt

# One bar per (category, is_fraud) pair showing how many transactions fall in it
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=pdf, x="category", hue="is_fraud", ax=ax)
ax.set_title("Fraud Cases by Category")
ax.tick_params(axis="x", labelrotation=45)  # tilt category labels so they do not overlap
fig.tight_layout()
plt.show()