In [3]:
%run common.ipynb

In [4]:
from pyspark.sql import SparkSession


# Build (or reuse) the notebook's Spark session under the app name
# "FraudDetection", requesting the MySQL Connector/J package via
# spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("FraudDetection")
    .config("spark.jars.packages", "mysql:mysql-connector-java:8.0.33")
    .getOrCreate()
)



In [5]:


# Login history: read the "LoginHistory" table over JDBC and expose it to
# Spark SQL as the "user_logins" view.
# NOTE(review): USER_MYSQL_URL, ORDER_MYSQL_URL and MYSQL_PROPERTIES are not
# defined in this notebook's visible cells; presumably they come from
# common.ipynb, which is %run at the top. Confirm.
user_logins_df = spark.read.jdbc(
    url=USER_MYSQL_URL,
    table="LoginHistory",
    properties=MYSQL_PROPERTIES,
)
user_logins_df.createOrReplaceTempView("user_logins")

# Orders: read the "orders" table over JDBC and expose it as the "orders" view.
orders_df = spark.read.jdbc(
    url=ORDER_MYSQL_URL,
    table="orders",
    properties=MYSQL_PROPERTIES,
)
orders_df.createOrReplaceTempView("orders")


In [6]:


# Detection window and thresholds, named here so they can be tuned without
# editing the SQL text. The values are the ones previously inlined in the query,
# so the rendered SQL is unchanged.
LOOKBACK_DAYS = 30        # only rows dated within the last N days are considered
MAX_UNIQUE_IPS = 3        # more distinct ip_address values than this flags the logins
MAX_LOGIN_ATTEMPTS = 10   # more login rows than this flags the logins
MAX_TOTAL_SPENT = 5000    # a larger SUM(total_amount) than this flags the orders
MAX_ORDER_COUNT = 5       # more orders than this flags the orders

# A customer is returned only when BOTH CTEs flag them (inner join on
# customer_id). Only integer constants defined above are interpolated into the
# SQL; no external input reaches the query text.
query = f"""
WITH suspicious_logins AS (
    SELECT
        customer_id,
        COUNT(DISTINCT ip_address) AS unique_ips,
        COUNT(*) AS total_attempts
    FROM user_logins
    WHERE login_date >= date_add(current_date(), -{LOOKBACK_DAYS})
    GROUP BY customer_id
    HAVING unique_ips > {MAX_UNIQUE_IPS} OR total_attempts > {MAX_LOGIN_ATTEMPTS}
),
high_risk_orders AS (
    SELECT
        customer_id,
        COUNT(order_id) AS order_count,
        SUM(total_amount) AS total_spent
    FROM orders
    WHERE order_date >= date_add(current_date(), -{LOOKBACK_DAYS})
    GROUP BY customer_id
    HAVING total_spent > {MAX_TOTAL_SPENT} OR order_count > {MAX_ORDER_COUNT}
)
SELECT DISTINCT s.customer_id
FROM suspicious_logins s
JOIN high_risk_orders h ON s.customer_id = h.customer_id
"""

# Run the query against the "user_logins" and "orders" temp views.
high_risk_customers = spark.sql(query)
# Print the flagged customer ids.
high_risk_customers.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-----------+
|customer_id|
+-----------+
+-----------+



                                                                                

In [7]:
from pyspark.sql import functions as F

# Lower bound of the look-back window: 30 days before the current date.
date_threshold = F.date_add(F.current_date(), -30)

# Per-customer login statistics inside the window; keep customers with more
# than 3 distinct IP addresses or more than 10 login rows.
suspicious_logins = (
    user_logins_df
    .where(F.col("login_date") >= date_threshold)
    .groupBy("customer_id")
    .agg(
        F.countDistinct("ip_address").alias("unique_ips"),
        F.count("*").alias("total_attempts"),
    )
    .where((F.col("unique_ips") > 3) | (F.col("total_attempts") > 10))
)

# Per-customer order statistics inside the window; keep customers whose summed
# total_amount exceeds 5000 or who placed more than 5 orders.
high_risk_orders = (
    orders_df
    .where(F.col("order_date") >= date_threshold)
    .groupBy("customer_id")
    .agg(
        F.count("order_id").alias("order_count"),
        F.sum("total_amount").alias("total_spent"),
    )
    .where((F.col("total_spent") > 5000) | (F.col("order_count") > 5))
)

# Print both intermediate results before joining them.
suspicious_logins.show()
high_risk_orders.show()

# Customers that appear in both intermediate results.
suspicious_customers_df = (
    suspicious_logins
    .join(high_risk_orders, on="customer_id", how="inner")
    .select("customer_id")
    .distinct()
)

# Print the final list of customer ids.
suspicious_customers_df.show()

+-----------+----------+--------------+
|customer_id|unique_ips|total_attempts|
+-----------+----------+--------------+
+-----------+----------+--------------+

+-----------+-----------+-----------+
|customer_id|order_count|total_spent|
+-----------+-----------+-----------+
|       7173|          6|    1581.62|
|       8449|          7|    1694.48|
|       9998|          6|    1158.96|
|       9360|          6|    1217.78|
|       6611|          6|    1524.32|
|       2526|          6|    1523.10|
|       9927|          6|    1826.37|
|        395|          6|    2052.64|
|       1038|          6|    1828.55|
|       6736|          6|    2087.65|
|       5755|          6|    1759.52|
|       7347|          7|    2028.40|
|       9469|          6|    1406.55|
|       5996|          6|    1563.48|
|       9975|          6|    1656.82|
|        550|          6|    2251.79|
|       6355|          8|    1421.65|
|       5441|          6|    1634.53|
|       4600|          6|    1675.09|
|  

In [6]:
# Target catalog table (database.table) for the flagged customers.
# NOTE(review): whether saveAsTable registers this in the AWS Glue Data Catalog
# depends on the session's metastore configuration, which is not visible in
# this cell. Confirm.
glue_database = "customer_analytics"
glue_table = "suspicious_customers"

# S3 location where the table's Parquet files are written.
s3_output_path = "s3://feb2025-training-bucket/analytics/suspicious_customers/"

# Persist suspicious_customers_df (built in the DataFrame-API cell, which must
# have been run first) as Parquet at the S3 path and register it as a table.
# mode("overwrite") replaces any existing contents of the table.
suspicious_customers_df.write \
    .format("parquet") \
    .mode("overwrite") \
    .option("path", s3_output_path) \
    .saveAsTable(f"{glue_database}.{glue_table}")

# The status message names the dataset that was actually written; it previously
# said "Aggregated sales data", which this cell does not produce.
print(f"Suspicious customers data written to S3: {s3_output_path}")
print(f"Glue table '{glue_database}.{glue_table}' created successfully.")


25/03/03 13:04:32 INFO HiveConf: Found configuration file file:/home/glue_user/spark/conf/hive-site.xml
25/03/03 13:04:33 WARN InstanceMetadataServiceResourceFetcher: Fail to retrieve token 
com.amazonaws.SdkClientException: Failed to connect to service endpoint: 
	at com.amazonaws.internal.EC2ResourceFetcher.doReadResource(EC2ResourceFetcher.java:100)
	at com.amazonaws.internal.InstanceMetadataServiceResourceFetcher.getToken(InstanceMetadataServiceResourceFetcher.java:91)
	at com.amazonaws.internal.InstanceMetadataServiceResourceFetcher.readResource(InstanceMetadataServiceResourceFetcher.java:69)
	at com.amazonaws.internal.EC2ResourceFetcher.readResource(EC2ResourceFetcher.java:66)
	at com.amazonaws.util.EC2MetadataUtils.getItems(EC2MetadataUtils.java:407)
	at com.amazonaws.util.EC2MetadataUtils.getData(EC2MetadataUtils.java:376)
	at com.amazonaws.util.EC2MetadataUtils.getData(EC2MetadataUtils.java:372)
	at com.amazonaws.util.EC2MetadataUtils.getEC2InstanceRegion(EC2MetadataUtils.java

Aggregated sales data written to S3: s3://feb2025-training-bucket/analytics/suspicious_customers/
Glue table 'customer_analytics.suspicious_customers' created successfully.
