# Load

- Import libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, explode, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType
# from pymongo import MongoClient

- Constants

In [None]:
# --- Spark ---
# Name given to the Spark application.
APP_NAME = "KafkaToMongoDB"
# Maven coordinates handed to spark.jars.packages for the Kafka source.
SPARK_KAFKA_PACKAGE = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2"

# --- Kafka (source) ---
KAFKA_BOOTSTRAP_SERVERS = "localhost:9092"
KAFKA_TOPIC = "btc-price-zscore"  # also used as the MongoDB collection-name prefix

# --- MongoDB (sink) ---
MONGO_URI = "mongodb://localhost:27017/"
MONGO_DB_NAME = "btc_data"

- Spark Session

In [None]:
# Build (or reuse) the session; the Kafka connector is pulled in through
# spark.jars.packages and the driver is pinned to localhost.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .config("spark.jars.packages", SPARK_KAFKA_PACKAGE)
    .config("spark.driver.host", "localhost")
    .getOrCreate()
)

# Keep Spark's own logging at WARN and above.
spark.sparkContext.setLogLevel("WARN")

- Schema

In [None]:
# Schema of the JSON payload carried in each Kafka message value:
# two nullable string fields plus a nullable array of {window, zscore_price}
# structs.
schema = (
    StructType()
    .add("timestamp", StringType(), True)
    .add("symbol", StringType(), True)
    .add(
        "zscore_data",
        ArrayType(
            StructType()
            .add("window", StringType(), True)
            .add("zscore_price", FloatType(), True)
        ),
        True,
    )
)

- Read from Kafka

In [None]:
# Streaming source: subscribe to the topic and start from the newest offsets.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
        "subscribe": KAFKA_TOPIC,
        "startingOffsets": "latest",
    })
    .load()
)

- Data Processing

In [None]:
# Decode the Kafka value as a string, parse it against `schema`, and lift the
# parsed struct's fields to top-level columns.
json_df = (
    kafka_df
    .selectExpr("CAST(value AS STRING) as value")
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
)

# One output row per element of zscore_data; event_time is the string
# timestamp converted with to_timestamp (no explicit format is supplied).
expanded_df = (
    json_df
    .withColumn("zscore", explode("zscore_data"))
    .withColumn("event_time", to_timestamp("timestamp"))
    .select(
        "timestamp",
        "symbol",
        col("zscore.window").alias("window"),
        col("zscore.zscore_price").alias("zscore_price"),
        "event_time",
    )
)

- Handling late data (10-second watermark)

In [None]:
# Declare a 10-second watermark on event_time.
# NOTE(review): no aggregation, deduplication or join is visible downstream in
# this notebook; confirm the watermark has the intended effect on late rows.
watermarked_df = expanded_df.withWatermark(
    eventTime="event_time",
    delayThreshold="10 seconds",
)

- Write to MongoDB

In [None]:
def write_to_mongo(df, epoch_id):
    """Write one micro-batch of exploded z-score rows to MongoDB.

    Used as the ``foreachBatch`` callback of the streaming query. Every row is
    inserted as its own document into the collection named
    ``f"{KAFKA_TOPIC}-{window}"`` of the ``MONGO_DB_NAME`` database, and one
    log line is printed per inserted document.

    Args:
        df: Micro-batch DataFrame providing the columns ``timestamp``,
            ``symbol``, ``window`` and ``zscore_price``.
        epoch_id: Batch identifier passed in by Spark; only used in the log
            output.
    """
    # Imported locally so the callback does not rely on a module-level
    # pymongo import being present (without it, MongoClient is a NameError).
    from pymongo import MongoClient

    # collect() brings the whole micro-batch to the driver.
    rows = df.collect()
    if not rows:
        # Nothing to persist; avoid creating a client for an empty batch.
        return

    client = MongoClient(MONGO_URI)
    try:
        db = client[MONGO_DB_NAME]
        for row in rows:
            collection_name = f"{KAFKA_TOPIC}-{row['window']}"
            document = {
                "timestamp": row["timestamp"],
                "symbol": row["symbol"],
                "window": row["window"],
                "zscore_price": row["zscore_price"]
            }
            db[collection_name].insert_one(document)
            print(f"[Epoch {epoch_id}] Ghi vào {collection_name}: {document}")
    finally:
        # A new client is created per batch, so release its connections even
        # when an insert fails; otherwise they pile up across batches.
        client.close()

- Run

In [None]:
# Start the streaming query: each micro-batch is handed to write_to_mongo,
# in append mode.
query = (
    watermarked_df.writeStream
    .outputMode("append")
    .foreachBatch(write_to_mongo)
    .start()
)

# Block this cell until the query stops or fails.
query.awaitTermination()
