In [0]:
import random
import string
from pyspark.sql.functions import col, expr, udf
from pyspark.sql.types import StringType

# Fixed pools of sample values that the random generators below draw from
USER_IDS = [
    "user_001",
    "user_002",
    "user_003",
    "user_004",
    "user_005",
]
TRANSACTION_IDS = [
    "123",
    "456",
    "789",
    "987",
    "654",
    "321",
]
TRANSACTION_STATUSES = [
    "Approved",
    "Rejected",
    "Pending",
]

# Random user ID generator (wrapped as a Spark UDF further down)
def generate_user_id():
    """Return one entry of USER_IDS, selected at random.

    Returns:
        str: a user ID taken from the module-level USER_IDS list.
    """
    picked_user = random.choice(USER_IDS)
    return picked_user

# Random transaction ID generator (wrapped as a Spark UDF further down)
def generate_transaction_id():
    """Return one entry of TRANSACTION_IDS, selected at random.

    Returns:
        str: a transaction ID taken from the module-level TRANSACTION_IDS list.
    """
    picked_transaction = random.choice(TRANSACTION_IDS)
    return picked_transaction

# Random transaction status generator (wrapped as a Spark UDF further down)
def generate_transaction_status():
    """Return one entry of TRANSACTION_STATUSES, selected at random.

    Returns:
        str: a status taken from the module-level TRANSACTION_STATUSES list.
    """
    picked_status = random.choice(TRANSACTION_STATUSES)
    return picked_status

# Register the UDFs.
# Every generator draws from `random`, so each UDF is flagged with
# asNondeterministic(). Spark treats UDFs as deterministic by default, and its
# optimizer may then eliminate duplicate invocations or invoke the function
# more times than it appears in the query, producing inconsistent values.
generate_user_id_udf = udf(generate_user_id, StringType()).asNondeterministic()
generate_transaction_id_udf = udf(generate_transaction_id, StringType()).asNondeterministic()
generate_transaction_status_udf = udf(generate_transaction_status, StringType()).asNondeterministic()

# Stream 1: simulated user transactions.
# Reads Spark's "rate" source at 5 rows per second, keeps its timestamp column
# as event_time, and attaches a random user_id and transaction_id to every row.
transactions_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 5)
    .load()
    .selectExpr("timestamp AS event_time")
    .withColumn("user_id", generate_user_id_udf())
    .withColumn("transaction_id", generate_transaction_id_udf())
)

# Stream 2: simulated transaction status updates.
# Reads Spark's "rate" source at 5 rows per second, keeps its timestamp column
# as event_time, and attaches a random transaction_id and status to every row.
status_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 5)
    .load()
    .selectExpr("timestamp AS event_time")
    .withColumn("transaction_id", generate_transaction_id_udf())
    .withColumn("status", generate_transaction_status_udf())
)

# Apply a 10-second watermark on event_time to both streams
# (transactions first, then statuses), rebinding each name to the watermarked stream
transactions_stream, status_stream = (
    stream.withWatermark("event_time", "10 seconds")
    for stream in (transactions_stream, status_stream)
)

# INNER JOIN the two streams on transaction_id, keeping only pairs whose
# transaction event_time falls within 5 seconds (inclusive) of the status
# event_time, then project the columns of interest.
joined_stream = (
    transactions_stream.alias("t")
    .join(
        status_stream.alias("s"),
        on=(
            (col("t.transaction_id") == col("s.transaction_id"))
            & (
                # Explicit form of between(lower, upper): lower <= t.event_time <= upper
                (col("t.event_time") >= col("s.event_time") - expr("INTERVAL 5 SECONDS"))
                & (col("t.event_time") <= col("s.event_time") + expr("INTERVAL 5 SECONDS"))
            )
        ),
        how="inner",
    )
    .select(
        col("t.user_id"),
        col("t.transaction_id"),
        col("s.status"),
        col("t.event_time").alias("transaction_time"),
        col("s.event_time").alias("status_update_time"),
    )
)

# Show the joined stream in the notebook output
display(joined_stream)


user_id,transaction_id,status,transaction_time,status_update_time
user_002,987,Rejected,2025-03-24T19:09:58.909Z,2025-03-24T19:10:03.708Z
user_001,987,Rejected,2025-03-24T19:10:03.709Z,2025-03-24T19:10:03.708Z
user_001,987,Rejected,2025-03-24T19:09:59.309Z,2025-03-24T19:10:03.708Z
user_005,987,Rejected,2025-03-24T19:10:05.709Z,2025-03-24T19:10:03.708Z
user_005,987,Rejected,2025-03-24T19:10:01.109Z,2025-03-24T19:10:03.708Z
user_003,987,Rejected,2025-03-24T19:10:07.509Z,2025-03-24T19:10:03.708Z
user_001,987,Rejected,2025-03-24T19:10:04.709Z,2025-03-24T19:10:03.708Z
user_003,987,Rejected,2025-03-24T19:10:04.909Z,2025-03-24T19:10:03.708Z
user_005,987,Rejected,2025-03-24T19:10:08.109Z,2025-03-24T19:10:03.708Z
user_001,987,Approved,2025-03-24T19:10:19.509Z,2025-03-24T19:10:19.708Z
