This notebook creates simulated data and pushes it to the Kafka topic named "rtm_read_demo" in real-time mode.

In [0]:
%pip install kafka-python 

In [0]:
# Read the Kafka credentials and connection details from the "rtm_demo"
# secret scope. The keys are fetched in the same order as the names on the
# left-hand side: user, password, topic, bootstrap servers.
kafka_user, kafka_password, kafka_topic, kafka_bootstrap_servers = (
    dbutils.secrets.get(scope="rtm_demo", key=secret_key)
    for secret_key in ("kafka_user", "kafka_password", "topic", "kafka.bootstrap.servers")
)

# JAAS login string for SCRAM authentication; it embeds the username and
# password read above and is later passed as `kafka.sasl.jaas.config`.
jaas_config = "".join([
    'kafkashaded.org.apache.kafka.common.security.scram.ScramLoginModule required ',
    f'username="{kafka_user}" ',
    f'password="{kafka_password}";',
])



In [0]:
# ============================================================
# 0. IMPORTS
# ============================================================
from pyspark.sql.functions import col, rand, round as spark_round, expr, current_timestamp, to_json, struct
import uuid as python_uuid
import random


# Make sure kafka-python is installed on the cluster (see the %pip install cell above).

# ============================================================
# 1. NOTEBOOK PARAMETERS (DATABRICKS WIDGETS)
# ============================================================
# Bootstrap servers and topic are read from the "rtm_demo" secret scope
# rather than from widgets.
kafka_bootstrap_servers = dbutils.secrets.get(scope="rtm_demo", key="kafka.bootstrap.servers")
kafka_topic = dbutils.secrets.get(scope="rtm_demo", key="topic")

# Register the tunable widgets (name, default value, label).
dbutils.widgets.text("kafka_partitions", "1", "Kafka partitions")
dbutils.widgets.text("rows_per_second", "10", "Rows per second")
dbutils.widgets.dropdown("duplication_mode", "none", ["none", "percentage"], "Duplication mode")
dbutils.widgets.text("duplication_value", "0.0", "Duplication value")

# Read the widget values back; the two numeric settings are parsed as integers,
# the duplication settings are kept as the raw widget strings.
kafka_partitions = int(dbutils.widgets.get("kafka_partitions"))
rows_per_second = int(dbutils.widgets.get("rows_per_second"))
duplication_mode = dbutils.widgets.get("duplication_mode")  # "none" | "percentage"
duplication_value = dbutils.widgets.get("duplication_value")

# The duplication value is only interpreted (as a float fraction, e.g. 0.30)
# when the mode is "percentage"; any other mode yields 0.0.
duplication_percentage = float(duplication_value) if duplication_mode == "percentage" else 0.0




In [0]:
from kafka.admin import KafkaAdminClient, NewTopic
from kafka.errors import UnknownTopicOrPartitionError, NoBrokersAvailable
import time

# Connect an admin client to the cluster using SASL_SSL / SCRAM-SHA-256 with
# the credentials loaded from the secret scope.
try:
    admin_client = KafkaAdminClient(
        bootstrap_servers=kafka_bootstrap_servers,
        client_id="databricks-admin-client",
        security_protocol="SASL_SSL",
        sasl_mechanism="SCRAM-SHA-256",
        sasl_plain_username=kafka_user,
        sasl_plain_password=kafka_password
    )
except NoBrokersAvailable as e:
    # Chain the original exception so the underlying connection failure stays
    # visible in the traceback.
    raise RuntimeError(f"Cannot connect to Redpanda at {kafka_bootstrap_servers}: {e}") from e

# Everything below runs under one try/finally so the admin client is closed
# even if an unexpected error escapes the delete or create step.
try:
    # Delete topic if it exists
    try:
        admin_client.delete_topics(
            topics=[kafka_topic],
            timeout_ms=30000
        )
        print(f"Requested deletion of topic: {kafka_topic}")

        # delete_topics only requests the deletion; the topic can remain listed
        # for a short while afterwards. Wait (bounded to 30 seconds) until it
        # disappears so the create call below does not race with the delete.
        deletion_deadline = time.monotonic() + 30
        while kafka_topic in admin_client.list_topics():
            if time.monotonic() >= deletion_deadline:
                print(f"Topic {kafka_topic} is still listed after 30s; creation may fail.")
                break
            time.sleep(1)
    except UnknownTopicOrPartitionError:
        print(f"Topic {kafka_topic} did not exist, skipping delete.")
    except Exception as e:
        # Best-effort: report the problem and still attempt the creation.
        print(f"Delete topics error for {kafka_topic}: {e}")

    # Create topic with the partition count chosen in the widgets.
    new_topic = NewTopic(
        name=kafka_topic,
        num_partitions=kafka_partitions,
        replication_factor=3
    )

    try:
        admin_client.create_topics(
            new_topics=[new_topic],
            validate_only=False
        )
        print(f"Topic {kafka_topic} created with {kafka_partitions} partitions.")
    except Exception as e:
        # Best-effort: a failed creation is reported but does not stop the notebook.
        print(f"Create topic error for {kafka_topic}: {e}")
finally:
    admin_client.close()

In [0]:
from pyspark.sql.functions import (
    col, expr, current_timestamp, to_json, struct, lit
)
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
import uuid as python_uuid
import random

# ============================================================
# CONFIG
# ============================================================
# Echo the widget-driven settings so each run records its configuration.
# duplication_mode is one of the dropdown choices: "none" or "percentage".
print(
    "Config values:",
    f"  duplication_mode: {duplication_mode}",
    f"  duplication_value: {duplication_value}",
    f"  rows_per_second: {rows_per_second}",
    f"  kafka_partitions: {kafka_partitions}",
    sep="\n",
)
print()

# ============================================================
# 1. PRECOMPUTE STATIC EVENT POOL (BATCH)
# ============================================================

# Size of the static pool of events that the stream cycles through.
num_events = 1000

# Seed the module-level generator that drives amount / daily_average.
random.seed(42)

# uuid4() is not driven by the `random` module, so random.seed() alone does not
# make the customer IDs repeatable. Derive the IDs from a dedicated seeded
# generator instead; keeping it separate leaves the amount / daily_average
# sequence produced by the module-level generator unchanged.
id_rng = random.Random(42)

events = []
for i in range(num_events):
    # Version-4 UUID built from 128 seeded random bits (same format as uuid4()).
    cid = str(python_uuid.UUID(int=id_rng.getrandbits(128), version=4))
    amount = round(random.random() * 180 + 20, 2)  # 20–200
    base_std = random.random() * 25 + 5            # offset magnitude, 5–30
    # daily_average is the amount shifted up or down by base_std (coin flip).
    daily_avg = round(amount + (base_std if random.random() > 0.5 else -base_std), 2)
    events.append((i, cid, amount, daily_avg))

# Explicit, non-nullable schema for the precomputed pool.
events_schema = StructType([
    StructField("event_index",   IntegerType(), False),
    StructField("customerID",    StringType(), False),
    StructField("amount",        DoubleType(), False),
    StructField("daily_average", DoubleType(), False),
])

# Cached because the pool is joined against the stream later in this cell.
events_dim = spark.createDataFrame(events, schema=events_schema).cache()

print(f"Precomputed event pool size: {events_dim.count()}")

# ============================================================
# 2. RATE STREAM
# ============================================================

# Synthetic driver stream from Spark's built-in "rate" source, configured with
# the rows-per-second and partition count chosen in the widgets.
rate_df = (
    spark.readStream.format("rate")
    .option("rowsPerSecond", rows_per_second)
    .option("numPartitions", kafka_partitions)
    .load()
)

# ============================================================
# 3. MAP RATE VALUES TO EVENT INDICES
# ============================================================

# Map the rate source's `value` column onto an index into the event pool by
# taking it modulo the pool size.
base_idx_df = (
    rate_df
    .withColumn("event_index", (col("value") % num_events).cast("int"))
)

# Decide which fraction of rows gets a duplicate copy.
#   "percentage": the fraction parsed from the duplication_value widget.
#   "duplicate":  legacy value, duplicates every row (kept for compatibility;
#                 the dropdown itself only offers "none" and "percentage").
#   otherwise:    no duplicates.
if duplication_mode == "percentage":
    dup_fraction = duplication_percentage
elif duplication_mode == "duplicate":
    dup_fraction = 1.0
else:
    dup_fraction = 0.0

if dup_fraction > 0:
    # Randomly pick the rows to duplicate; rand() yields values in [0, 1), so a
    # fraction of 1.0 selects every row. `rand` comes from the
    # pyspark.sql.functions import in the notebook-parameters cell.
    dup_idx_df = (
        base_idx_df
        .where(rand() < dup_fraction)
        .select(
            col("timestamp").alias("timestamp_dup"),
            col("event_index")
        )
        .withColumn("dup_flag", lit(True))
    )

    base_idx_df = base_idx_df.withColumn("dup_flag", lit(False))

    # Original rows (dup_flag=False) plus the sampled copies (dup_flag=True).
    idx_union_df = (
        base_idx_df
        .select(col("timestamp"), col("event_index"), col("dup_flag"))
        .unionByName(
            dup_idx_df.select(
                col("timestamp_dup").alias("timestamp"),
                col("event_index"),
                col("dup_flag")
            )
        )
    )
else:
    idx_union_df = base_idx_df.withColumn("dup_flag", lit(False))

# ============================================================
# 4. JOIN WITH STATIC EVENTS TO GET FIELDS
# ============================================================

# Look up each streamed event_index in the static event pool, then overwrite
# `timestamp` with the current processing timestamp and keep only the
# transaction fields plus the duplicate marker.
txn_df = (
    idx_union_df.join(events_dim, "event_index", "inner")
    .withColumn("timestamp", current_timestamp())
    .select(*["customerID", "timestamp", "amount", "daily_average", "dup_flag"])
)

# ============================================================
# 5. WRITE TO KAFKA
# ============================================================

# Shape the rows for the Kafka sink: the customer ID becomes the message key
# and the transaction fields are serialized into a JSON string as the value.
kafka_df = txn_df.select(
    col("customerID").cast("string").alias("key"),
    to_json(
        struct(
            *[
                col(field_name)
                for field_name in ("customerID", "timestamp", "amount", "daily_average", "dup_flag")
            ]
        )
    ).alias("value"),
)

# Start the streaming write to Kafka in append mode. Connection settings use
# SASL_SSL with SCRAM-SHA-256 and the JAAS string built from the secrets; the
# checkpoint is kept at a fixed path under /tmp/checkpoints.
query = (
    kafka_df.writeStream
    .queryName("customer_txn_kafka_dim_events")
    .format("kafka")
    .outputMode("append")
    # Destination
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("topic", kafka_topic)
    # Authentication / TLS
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.mechanism", "SCRAM-SHA-256")
    .option("kafka.sasl.jaas.config", jaas_config)
    .option("kafka.ssl.endpoint.identification.algorithm", "https")
    # Streaming progress tracking
    .option("checkpointLocation", "/tmp/checkpoints/customer_txn_kafka_dim_events_a")
    .start()
)

print(f"Streaming query started to topic: {kafka_topic}")
print(f"Query ID: {query.id}")


In [0]:
#display(duplicates_df)