In [1]:
#  ЧАСТИНА 2: Обробка в Apache Spark (Structured Streaming)

# ✅ 1. Зчитування з Kafka

# * Використовуйте Spark Structured Streaming для підключення до локального Kafka сluster:
#   * Читання з двох Kafka-топіків: transactions та user_activity.

# ✅ 2. Обробка транзакцій
# * Перетворіть дані з Kafka в DataFrame з колонками (transaction_id, user_id, amount, merchant, timestamp, is_fraud).
# * Встановіть правильний тип поля timestamp з cast або to_timestamp.

# ✅ 3. Аналіз user_activity

# * Зчитуйте топік user_activity аналогічно.
# * Проведіть базову класифікацію по event_type (click, add_to_cart, purchase) і побудуйте кількість подій по user_id у певний часовий період (10 хвилин).
# * Використовуйте sliding window aggregation.

# ✅ 4. Fraud Detection

# * Визначіть просту бізнес-логіку шахрайства:
# * транзакції понад $1000
# * merchant = “Amazon”
# * is_fraud == true
# * Запишіть результати локально.

# ✅ 5. Join потоків

# * Реалізуйте join між transactions_df та user_activity_df по user_id (використовуйте результат 2 та 3 кроків)
# * Застосуйте time-range join (joinExpr має враховувати timestamp ±5 хвилин).
# * Не забувайте використати watermark-и на обох потоках.
# * Запишіть результат локально.

# ✅ 6. (Опційно) Дедуплікація подій

# Для потоку user_activity використайте dropDuplicates(["event_id"]), щоб уникнути повторів.

In [2]:
from pyspark.sql import SparkSession


KAFKA_BOOTSTRAP_SERVERS = "127.0.0.1:9092"
TRANSACTION_TOPIC_NAME = "transactions"
ACTIVITY_TOPIC_NAME = "user-activity"



In [3]:
# * Використовуйте Spark Structured Streaming для підключення до локального Kafka сluster:
#   * Читання з двох Kafka-топіків: transactions та user_activity.

spark = SparkSession.builder \
    .appName("lavreniuk-hw5") \
    .config("spark.jars.packages", ",".join((
        "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0",
        "org.apache.spark:spark-avro_2.13:4.0.0",
        "io.confluent:kafka-schema-registry-client:7.5.0",
        "io.confluent:kafka-avro-serializer:7.4.1",
    ))) \
    .config("spark.jars.repositories", "https://packages.confluent.io/maven/") \
    .getOrCreate()

# Because of Mac Silicon use additional config for package below

spark.sparkContext.setLogLevel("WARN")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/06 18:23:33 WARN Utils: Your hostname, Air-M4.local, resolves to a loopback address: 127.0.0.1; using 192.168.0.46 instead (on interface en0)
25/09/06 18:23:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
https://packages.confluent.io/maven/ added as a remote repository with the name: repo-1
:: loading settings :: url = jar:file:/Users/tenpenny/Projects/rd/lavreniuk-hw5/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/tenpenny/.ivy2.5.2/cache
The jars for the packages stored in: /Users/tenpenny/.ivy2.5.2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
org.apache.spark#spark-avro_2.13 added as a dependency
io.confluent#kafka-schema-registry-client added as a dependency
io.confluent#kafka-avro-serializer added as a dependency
:: resolving dependencies :: org.apache.

In [4]:
# * Перетворіть дані з Kafka в DataFrame з колонками (transaction_id, user_id, amount, merchant, timestamp, is_fraud).
# * Встановіть правильний тип поля timestamp з cast або to_timestamp.


from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, TimestampType
from pyspark.sql.avro.functions import from_avro

# Виникає помилка з Avro через сумісність (пробував різні методи, додав скріншоти. Перейшов на json щоб уникнути проблеми. В такому випадку переведення у timestamp вручну не потрібне, оскільки є схема)
transactions_schema = StructType([
    StructField("transaction_id", StringType()),
    StructField("user_id", StringType()),
    StructField("amount", FloatType()),
    StructField("merchant", StringType()),
    StructField("currency", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("is_fraud", BooleanType())
])

df_transaction = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS) \
    .option("subscribe", TRANSACTION_TOPIC_NAME) \
    .load()

string_transaction_df = df_transaction.selectExpr("CAST(value AS STRING) as message")

parsed_transaction_df = string_transaction_df.select(from_json(col("message"), transactions_schema).alias("data")).select("data.*")

parquet_transaction = parsed_transaction_df.writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "data/transaction") \
    .option("checkpointLocation", "checkpoints/transaction") \
    .start()


25/09/06 18:23:51 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [5]:
parsed_transaction_df.printSchema() # Приводити типи не потрібно, вони вже існують

root
 |-- transaction_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- merchant: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- is_fraud: boolean (nullable = true)



                                                                                

In [6]:
# * Зчитуйте топік user_activity аналогічно.
# * Проведіть базову класифікацію по event_type (click, add_to_cart, purchase) і побудуйте кількість подій по user_id у певний часовий період (10 хвилин).
# * Використовуйте sliding window aggregation.

activity_schema = StructType([
    StructField("event_id", StringType()),
    StructField("user_id", StringType()),
    StructField("event_type", StringType()),
    StructField("device", StringType()),
    StructField("browser", StringType()),
    StructField("timestamp", TimestampType())
])

df_activity = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS) \
    .option("subscribe", ACTIVITY_TOPIC_NAME) \
    .load()

string_activity_df = df_activity.selectExpr("CAST(value AS STRING) as message")

parsed_activity_df = string_activity_df.select(from_json(col("message"), activity_schema).alias("data")).select("data.*")

query_activity = parsed_activity_df.writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "data/activity") \
    .option("checkpointLocation", "checkpoints/activity") \
    .start()

25/09/06 18:24:18 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [None]:
from pyspark.sql.functions import window, count as s_count

event_type_classification_df = (
    parsed_activity_df.withWatermark("timestamp", "10 minutes")
    .groupBy(
        col("user_id"),
        col("event_type"),
        window(col("timestamp"), "10 minutes", "5 minutes")
    ).agg(
        s_count("*").alias("event_count")
    )
)

# Debug
console_event_query = event_type_classification_df.writeStream \
    .outputMode("update") \
    .format("console") \
    .option("truncate", "false") \
    .start()

# Save data
parquet_event_query = event_type_classification_df.writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "data/event_type") \
    .option("checkpointLocation", "checkpoints/event_type") \
    .start()

25/09/06 18:25:05 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/9p/qrvh0mgd1536kkkrlb27q41m0000gn/T/temporary-7e05d3e4-a9cb-409b-b4fc-040b6833d17e. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/09/06 18:25:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/09/06 18:25:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+----------+------+-----------+
|user_id|event_type|window|event_count|
+-------+----------+------+-----------+
+-------+----------+------+-----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|7cb4ab7d-f3a9-4300-9926-4957005dcb50|add_to_cart|{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
|7cb4ab7d-f3a9-4300-9926-4957005dcb50|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|713efed3-59e9-4a39-a60d-88085554f63b|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|713efed3-59e9-4a39-a60d-88085554f63b|purchase  |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|867bad30-a5ed-4601-b0fd-5341fae46d94|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|867bad30-a5ed-4601-b0fd-5341fae46d94|click     |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|4121b838-f3f7-4e94-ba99-2bb46b0ee4db|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|4121b838-f3f7-4e94-ba99-2bb46b0ee4db|click     |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|923356e0-69b6-4ea1-9c6b-4fec957b5e3e|purchase  |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|923356e0-69b6-4ea1-9c6b-4fec957b5e3e|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|0e01e2e4-4f8f-479c-9644-a411dbd52f57|purchase  |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|0e01e2e4-4f8f-479c-9644-a411dbd52f57|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|7153448e-9e34-463c-a647-fd83a605c626|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|7153448e-9e34-463c-a647-fd83a605c626|purchase  |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|167f422b-d397-4ac9-90a8-3227eee87789|view      |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|167f422b-d397-4ac9-90a8-3227eee87789|view      |{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+-------+----------+------+-----------+
|user_id|event_type|window|event_count|
+-------+----------+------+-----------+
+-------+----------+------+-----------+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|d0e5b1ec-04cc-4ace-9609-a34e4e97a657|add_to_cart|{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|d0e5b1ec-04cc-4ace-9609-a34e4e97a657|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|cc1e9350-32fe-46ea-aaad-b483f64b0c23|add_to_cart|{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
|cc1e9350-32fe-46ea-aaad-b483f64b0c23|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+-------+----------+------+-----------+
|user_id|event_type|window|event_count|
+-------+----------+------+-----------+
+-------+----------+------+-----------+



                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|b92a7cf1-f355-4c5a-ac94-e539bb1c3baa|view      |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|b92a7cf1-f355-4c5a-ac94-e539bb1c3baa|view      |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|a7967808-b580-49c6-a183-0cbaafa67b03|add_to_cart|{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
|a7967808-b580-49c6-a183-0cbaafa67b03|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 15
-------------------------------------------
+-------+----------+------+-----------+
|user_id|event_type|window|event_count|
+-------+----------+------+-----------+
+-------+----------+------+-----------+



                                                                                

-------------------------------------------
Batch: 16
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|487cd075-e0e9-4c27-ba7b-9e065cf1503b|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|487cd075-e0e9-4c27-ba7b-9e065cf1503b|add_to_cart|{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 17
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|758c8432-4e19-4c58-b004-e2d1e149fdca|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|758c8432-4e19-4c58-b004-e2d1e149fdca|add_to_cart|{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 18
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|22e0961f-abd7-466b-91cb-6bcf121eaa69|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|22e0961f-abd7-466b-91cb-6bcf121eaa69|add_to_cart|{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+





-------------------------------------------
Batch: 19
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|522cfcb5-8c92-4bb0-9b66-0918b7766aa7|view      |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|522cfcb5-8c92-4bb0-9b66-0918b7766aa7|view      |{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 20
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|02142079-22ba-41b3-8ae7-ccb03feefc87|purchase  |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|02142079-22ba-41b3-8ae7-ccb03feefc87|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 21
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|a9e5319d-b2cb-4da8-89c9-20b89d3e8ccc|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|a9e5319d-b2cb-4da8-89c9-20b89d3e8ccc|add_to_cart|{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 22
-------------------------------------------
+-------+----------+------+-----------+
|user_id|event_type|window|event_count|
+-------+----------+------+-----------+
+-------+----------+------+-----------+



                                                                                

-------------------------------------------
Batch: 23
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|f0fd1050-7983-4474-98ab-7829f251eba0|add_to_cart|{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
|f0fd1050-7983-4474-98ab-7829f251eba0|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 24
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|670f8d4c-52f8-44ee-853f-b19b1fea2dcc|view      |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|670f8d4c-52f8-44ee-853f-b19b1fea2dcc|view      |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 25
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|3287a460-9df8-46a6-a4f4-70553d393bdf|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|a1cf47cd-1218-430c-b502-d58e16a12a11|purchase  |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|3287a460-9df8-46a6-a4f4-70553d393bdf|click     |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|a1cf47cd-1218-430c-b502-d58e16a12a11|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 26
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|419ce698-7696-4f8d-9145-f4cd5b91577a|add_to_cart|{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|419ce698-7696-4f8d-9145-f4cd5b91577a|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 27
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|a82d1648-abec-4f34-8cb3-d413697f207f|click     |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|a82d1648-abec-4f34-8cb3-d413697f207f|click     |{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 28
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|3429ecf7-9390-403d-97d6-458dc750eea3|view      |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|f7dfc031-1848-454f-9b0a-17efd0590298|click     |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|3429ecf7-9390-403d-97d6-458dc750eea3|view      |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|f7dfc031-1848-454f-9b0a-17efd0590298|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+





-------------------------------------------
Batch: 29
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|6bb7e09a-ac7a-4485-9ae4-a2250bdd1376|purchase  |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|6bb7e09a-ac7a-4485-9ae4-a2250bdd1376|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 30
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|f8df17a5-d3b6-498e-badb-d0ac691f407e|click     |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|f8df17a5-d3b6-498e-badb-d0ac691f407e|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 31
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|74b3c02b-6529-454c-8527-0ac604a175c0|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|63e9a75a-75c9-4d47-b1ed-ea26da723635|view      |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|74b3c02b-6529-454c-8527-0ac604a175c0|click     |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|63e9a75a-75c9-4d47-b1ed-ea26da723635|view      |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 32
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|c723a9db-89cd-4ca9-9c38-2b23c1f36800|purchase  |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|c723a9db-89cd-4ca9-9c38-2b23c1f36800|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 33
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|201079fb-2b16-48b9-9a4a-c960f132466d|purchase  |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|201079fb-2b16-48b9-9a4a-c960f132466d|purchase  |{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 34
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|b8bbdb6c-11a8-46ea-b45f-ca63a1a363f3|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|32d46f04-e1aa-4160-9891-83207beeccab|purchase   |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|b8bbdb6c-11a8-46ea-b45f-ca63a1a363f3|add_to_cart|{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|32d46f04-e1aa-4160-9891-83207beeccab|purchase   |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 35
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|d11dc546-e6cf-4609-8076-a2edabc49eeb|purchase   |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|decfb8f7-aec9-41a9-9d23-65e9e37f1054|add_to_cart|{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|d11dc546-e6cf-4609-8076-a2edabc49eeb|purchase   |{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
|decfb8f7-aec9-41a9-9d23-65e9e37f1054|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 36
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|8daffd38-bf0f-4d26-8b66-c9fb260059fb|view      |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
|5e1d1683-e827-4242-997e-3c970c9b349d|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|8daffd38-bf0f-4d26-8b66-c9fb260059fb|view      |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|5e1d1683-e827-4242-997e-3c970c9b349d|click     |{2025-09-06 18:15:00, 2025-09-06 18:25:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 37
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|e5a377fb-eecf-40d5-b8b0-f66540022b9e|click     |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|e5a377fb-eecf-40d5-b8b0-f66540022b9e|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 38
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|965b88b5-d8a5-4d78-b1e4-bcca04fa2471|click     |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|965b88b5-d8a5-4d78-b1e4-bcca04fa2471|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 39
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|cf271e8a-1ad8-44bf-82d5-1825861fcf41|view      |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|591b6a42-5136-4f56-9874-adc821dfc534|click     |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|591b6a42-5136-4f56-9874-adc821dfc534|click     |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|cf271e8a-1ad8-44bf-82d5-1825861fcf41|view      |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 40
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|df5c5523-b2d4-4158-b8a4-0ef8f15729bf|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|df5c5523-b2d4-4158-b8a4-0ef8f15729bf|add_to_cart|{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 41
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|fa3ee956-bd06-49fb-8976-f72cffd86027|purchase  |{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
|fa3ee956-bd06-49fb-8976-f72cffd86027|purchase  |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 42
-------------------------------------------
+------------------------------------+----------+------------------------------------------+-----------+
|user_id                             |event_type|window                                    |event_count|
+------------------------------------+----------+------------------------------------------+-----------+
|ae4600ea-7155-4116-b316-e28c79299ac1|view      |{2025-09-06 18:30:00, 2025-09-06 18:40:00}|1          |
|1d52e4df-0673-4ce0-b610-6ec472b08277|purchase  |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|ae4600ea-7155-4116-b316-e28c79299ac1|view      |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|1d52e4df-0673-4ce0-b610-6ec472b08277|purchase  |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+----------+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 43
-------------------------------------------
+------------------------------------+-----------+------------------------------------------+-----------+
|user_id                             |event_type |window                                    |event_count|
+------------------------------------+-----------+------------------------------------------+-----------+
|b0e1428b-e8d2-4014-b913-925e6a0f4ebe|view       |{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|70e9dcb3-3dad-4e27-a959-b3dd5f333d57|add_to_cart|{2025-09-06 18:20:00, 2025-09-06 18:30:00}|1          |
|70e9dcb3-3dad-4e27-a959-b3dd5f333d57|add_to_cart|{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
|b0e1428b-e8d2-4014-b913-925e6a0f4ebe|view       |{2025-09-06 18:25:00, 2025-09-06 18:35:00}|1          |
+------------------------------------+-----------+------------------------------------------+-----------+





In [8]:
console_event_query.stop() # end debug, just only parquet left

25/09/06 18:28:06 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 48, writer: ConsoleWriter[numRows=20, truncate=false]] is aborting.
25/09/06 18:28:06 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 48, writer: ConsoleWriter[numRows=20, truncate=false]] aborted.
25/09/06 18:28:08 WARN HDFSBackedStateStoreProvider: Error doing snapshots for HDFSStateStoreProvider[id = (op=0,part=124),dir = file:/private/var/folders/9p/qrvh0mgd1536kkkrlb27q41m0000gn/T/temporary-7e05d3e4-a9cb-409b-b4fc-040b6833d17e/state/0/124]
java.lang.IllegalStateException: Unexpected list of delta files for version 47 for HDFSStateStoreProvider[id = (op=0,part=124),dir = file:/private/var/folders/9p/qrvh0mgd1536kkkrlb27q41m0000gn/T/temporary-7e05d3e4-a9cb-409b-b4fc-040b6833d17e/state/0/124]: List(StoreFile(34,file:/private/var/folders/9p/qrvh0mgd1536kkkrlb27q41m0000gn/T/temporary-7e05d3e4-a9cb-409b-b4fc-040b6833d17e/state/0/124/34.delta,false), StoreFile

                                                                                

In [11]:
# * Визначіть просту бізнес-логіку шахрайства:
# * транзакції понад $1000
# * merchant = “Amazon”
# * is_fraud == true
# * Запишіть результати локально.

fraud_df = parsed_transaction_df.filter(
    (col("amount") > 1000) &
    (col("merchant").isin("Amazon", "Paypal")) & # Amazon will be never happen, because our datagen don`t have it, so I added Paypal additional
    # (col("currency") == "USD") & # Comment this line, because it happens once a year, lets see on all currency
    (col("is_fraud") == True)
)

fraud_console_query = fraud_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

fraud_parquet_query = fraud_df.writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "data/suspicious_trans") \
    .option("checkpointLocation", "checkpoints/suspicious_trans") \
    .start()

25/09/06 18:43:10 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/9p/qrvh0mgd1536kkkrlb27q41m0000gn/T/temporary-68164981-6566-499d-9523-276974daea25. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/09/06 18:43:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/09/06 18:43:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------+-------+------+--------+--------+---------+--------+
|transaction_id|user_id|amount|merchant|currency|timestamp|is_fraud|
+--------------+-------+------+--------+--------+---------+--------+
+--------------+-------+------+--------+--------+---------+--------+



                                                                                ]

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------+-------+------+--------+--------+---------+--------+
|transaction_id|user_id|amount|merchant|currency|timestamp|is_fraud|
+--------------+-------+------+--------+--------+---------+--------+
+--------------+-------+------+--------+--------+---------+--------+



                                                                                ]

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------------+--------------------+------+--------+--------+--------------------+--------+
|      transaction_id|             user_id|amount|merchant|currency|           timestamp|is_fraud|
+--------------------+--------------------+------+--------+--------+--------------------+--------+
|02be4e06-784d-495...|5620134c-ffec-435...|1161.1|  Paypal|     LBO|2025-09-06 18:44:...|    true|
+--------------------+--------------------+------+--------+--------+--------------------+--------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+--------------+-------+------+--------+--------+---------+--------+
|transaction_id|user_id|amount|merchant|currency|timestamp|is_fraud|
+--------------+-------+------+--------+--------+---------+--------+
+--------------+-------+------+--------+--------+---------+--------+



                                                                                

In [12]:
fraud_console_query.stop()

25/09/06 18:43:26 WARN DAGScheduler: Failed to cancel job group e80b2461-e86a-41c1-8902-623e1d5675af. Cannot find active jobs for it.
25/09/06 18:43:26 WARN DAGScheduler: Failed to cancel job group e80b2461-e86a-41c1-8902-623e1d5675af. Cannot find active jobs for it.


                                                                                ]

In [None]:
# * Реалізуйте join між transactions_df та user_activity_df по user_id (використовуйте результат 2 та 3 кроків)
# * Застосуйте time-range join (joinExpr має враховувати timestamp ±5 хвилин).
# * Не забувайте використати watermark-и на обох потоках.
# * Запишіть результат локально.
from pyspark.sql.functions import expr

transactions_wm_df = parsed_transaction_df.withWatermark("timestamp", "10 minutes")
user_activity_wm_df = parsed_activity_df.withWatermark("timestamp", "10 minutes")

unioned_df = transactions_wm_df.join(
    user_activity_wm_df,
    on=(
        (transactions_wm_df.user_id == user_activity_wm_df.user_id) &
        (user_activity_wm_df.timestamp.between(
            transactions_wm_df.timestamp - expr("INTERVAL 5 MINUTES"),
            transactions_wm_df.timestamp + expr("INTERVAL 5 MINUTES")
        ))
    ),
    how="inner"
)

unioned_console_query = unioned_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

unioned_parquet_query = unioned_df.writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "data/denormalized") \
    .option("checkpointLocation", "checkpoints/denormalized") \
    .start()

25/09/06 18:51:14 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/9p/qrvh0mgd1536kkkrlb27q41m0000gn/T/temporary-644d63a1-d8ee-47bf-a7a2-015562d71026. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/09/06 18:51:14 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/09/06 18:51:15 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                ]

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+
|transaction_id|user_id|amount|merchant|currency|timestamp|is_fraud|event_id|user_id|event_type|device|browser|timestamp|
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+



                                                                                1]

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+
|transaction_id|user_id|amount|merchant|currency|timestamp|is_fraud|event_id|user_id|event_type|device|browser|timestamp|
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+



                                                                                ]

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+
|transaction_id|user_id|amount|merchant|currency|timestamp|is_fraud|event_id|user_id|event_type|device|browser|timestamp|
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+
+--------------+-------+------+--------+--------+---------+--------+--------+-------+----------+------+-------+---------+



[Stage 2205:(188 + 10) / 200][Stage 2210:(0 + 0) / 200][Stage 2211:> (0 + 0) / 1]

In [14]:
unioned_console_query.stop()

25/09/06 18:52:38 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: ConsoleWriter[numRows=20, truncate=true]] is aborting.
25/09/06 18:52:38 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: ConsoleWriter[numRows=20, truncate=true]] aborted.
25/09/06 18:52:40 WARN DAGScheduler: Failed to cancel job group 5a31e0ce-ddbc-4ca5-bda2-d992ceffeeb7. Cannot find active jobs for it.


                                                                                ]]

In [15]:
# Для потоку user_activity використайте dropDuplicates(["event_id"]), щоб уникнути повторів.

deduplicated_user_activity_df = parsed_activity_df.withWatermark("timestamp", "1 hour").dropDuplicates(["event_id"]) # Use bigger watermark cause of deduplication strategy


deduplicated_user_activity_parquet_query = deduplicated_user_activity_df.writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "data/deduplicated") \
    .option("checkpointLocation", "checkpoints/deduplicated") \
    .start()

25/09/06 19:01:00 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
[Stage 2601:(135 + 10) / 200][Stage 2605:(0 + 0) / 200][Stage 2607:> (0 + 0) / 1]

[Stage 2646:(114 + 10) / 200][Stage 2649:(0 + 0) / 200][Stage 2650:> (0 + 0) / 1]]