In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType

# Explicit schema for the order records; also passed to createDataFrame below.
schema = (
    StructType()
    .add("order_id", StringType())
    .add("customer_id", StringType())
    .add("product", StringType())
    .add("quantity", IntegerType())
    .add("region", StringType())
)

# Three sample orders used to seed the source directory.
initial_data = [
    ("1", "C101", "Laptop", 2, "South"),
    ("2", "C102", "Chair", 6, "North"),
    ("3", "C103", "Mobile", 1, "East"),
]

df = spark.createDataFrame(initial_data, schema)

# Replace whatever is already in the target directory with headered CSV output.
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv("dbfs:/tmp/stream/orders")
)

In [0]:
# Streaming reader over the CSV directory: the schema is supplied explicitly
# and the header option is enabled to match how the files were written.
order_stream = (
    spark.readStream
    .schema(schema)
    .option("header", True)
    .csv("dbfs:/tmp/stream/orders")
)

In [0]:
from pyspark.sql.functions import when
# bulk_order is True when quantity exceeds 5; every other row gets False.
is_bulk = when(order_stream["quantity"] > 5, True).otherwise(False)
transformed_orders = order_stream.withColumn("bulk_order", is_bulk)
display(transformed_orders)

order_id,customer_id,product,quantity,region,bulk_order
1,C101,Laptop,2,South,False
2,C102,Chair,6,North,True
3,C103,Mobile,1,East,False


In [0]:
# Rate stream with an is_even flag, written to an in-memory table
from pyspark.sql.functions import col

# Built-in "rate" source configured with rowsPerSecond = 1.
rate_reader = spark.readStream.format("rate").option("rowsPerSecond", 1)
rate_df = rate_reader.load()

# Flag rows whose `value` is divisible by two.
is_even = col("value") % 2 == 0
transformed_df = rate_df.withColumn("is_even", is_even)

# Memory sink in append mode; the query name doubles as the table name
# that is queried later with spark.sql.
rate_writer = (
    transformed_df.writeStream
    .format("memory")
    .queryName("rate_table")
    .outputMode("append")
)
query = rate_writer.start()

In [0]:
# Snapshot of the rows the streaming query has written to rate_table so far.
rate_snapshot = spark.sql("SELECT * FROM rate_table")
rate_snapshot.show()

+--------------------+-----+-------+
|           timestamp|value|is_even|
+--------------------+-----+-------+
|2025-08-08 11:08:...|    0|   true|
|2025-08-08 11:08:...|    1|  false|
|2025-08-08 11:08:...|    2|   true|
|2025-08-08 11:08:...|    3|  false|
|2025-08-08 11:08:...|    4|   true|
|2025-08-08 11:08:...|    5|  false|
|2025-08-08 11:08:...|    6|   true|
|2025-08-08 11:08:...|    7|  false|
|2025-08-08 11:08:...|    8|   true|
|2025-08-08 11:08:...|    9|  false|
|2025-08-08 11:08:...|   10|   true|
|2025-08-08 11:08:...|   11|  false|
|2025-08-08 11:08:...|   12|   true|
|2025-08-08 11:08:...|   13|  false|
|2025-08-08 11:08:...|   14|   true|
|2025-08-08 11:08:...|   15|  false|
|2025-08-08 11:08:...|   16|   true|
|2025-08-08 11:08:...|   17|  false|
|2025-08-08 11:08:...|   18|   true|
|2025-08-08 11:08:...|   19|  false|
+--------------------+-----+-------+
only showing top 20 rows


In [0]:
from pyspark.sql.functions import col, current_timestamp

rate_df = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 1)
    .load()
)

# Variation: add a processing_time column populated by current_timestamp().
transformed_df = rate_df.withColumn("processing_time", current_timestamp())

# Query names must be unique among active streaming queries, and an earlier
# cell starts one called "rate_table" without stopping it. Stop any active
# query with that name first so this cell succeeds on Restart & Run All and
# can be re-run without a name-conflict error.
for active_query in spark.streams.active:
    if active_query.name == "rate_table":
        active_query.stop()

# Write to memory (temp table); the query name is the table name queried later.
query = (
    transformed_df.writeStream
    .format("memory")
    .queryName("rate_table")
    .outputMode("append")
    .start()
)




In [0]:
# Snapshot of the rows the streaming query has written to rate_table so far.
rate_snapshot = spark.sql("SELECT * FROM rate_table")
rate_snapshot.show()

+--------------------+-----+--------------------+
|           timestamp|value|     processing_time|
+--------------------+-----+--------------------+
|2025-08-08 11:23:...|    0|2025-08-08 11:23:...|
|2025-08-08 11:23:...|    1|2025-08-08 11:23:...|
|2025-08-08 11:23:...|    2|2025-08-08 11:23:...|
|2025-08-08 11:23:...|    3|2025-08-08 11:23:...|
|2025-08-08 11:23:...|    4|2025-08-08 11:23:...|
|2025-08-08 11:23:...|    5|2025-08-08 11:23:...|
|2025-08-08 11:23:...|    6|2025-08-08 11:23:...|
+--------------------+-----+--------------------+

