In [45]:
from pyspark.sql import SparkSession
from IPython.display import display, clear_output
import time
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [62]:
spark = SparkSession.builder \
        .appName('kafka') \
        .getOrCreate()

In [78]:
stream_df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "broker:29092") \
  .option("startingOffsets", "earliest") \
  .option("subscribe", "stock-trades") \
  .load()

In [79]:
stream_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [80]:
raw_stream = stream_df \
    .writeStream \
    .format("memory") \
    .queryName("raw_stocktrade_view") \
    .start()

In [81]:
clear_output(wait=True)

In [82]:
time.sleep(10)

In [83]:
clear_output(wait=True)
display(spark.sql('SELECT * FROM raw_stocktrade_view').show(20))
time.sleep(1)

+----------------+--------------------+------------+---------+------+--------------------+-------------+
|             key|               value|       topic|partition|offset|           timestamp|timestampType|
+----------------+--------------------+------------+---------+------+--------------------+-------------+
|[5A 57 5A 5A 54]|[7B 22 73 63 68 6...|stock-trades|        0|     0|2021-06-05 07:40:...|            0|
|[5A 58 5A 5A 54]|[7B 22 73 63 68 6...|stock-trades|        0|     1|2021-06-05 07:40:...|            0|
|[5A 56 5A 5A 54]|[7B 22 73 63 68 6...|stock-trades|        0|     2|2021-06-05 07:40:...|            0|
|[5A 56 5A 5A 54]|[7B 22 73 63 68 6...|stock-trades|        0|     3|2021-06-05 07:40:...|            0|
|[5A 58 5A 5A 54]|[7B 22 73 63 68 6...|stock-trades|        0|     4|2021-06-05 07:40:...|            0|
|[5A 54 45 53 54]|[7B 22 73 63 68 6...|stock-trades|        0|     5|2021-06-05 07:40:...|            0|
|   [5A 42 5A 58]|[7B 22 73 63 68 6...|stock-trades|   

None

In [77]:
raw_stream.stop()

In [85]:
string_stream_df = stream_df \
    .withColumn("key", stream_df["key"].cast(StringType())) \
    .withColumn("value", stream_df["value"].cast(StringType()))

In [86]:
string_stream = string_stream_df \
    .writeStream \
    .format("memory") \
    .queryName("string_stocktrade_view") \
    .start()

In [107]:
clear_output(wait=True)
display(spark.sql('SELECT * FROM string_stocktrade_view').show(20, False))
time.sleep(1)

+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------

None

In [108]:
string_stream.stop()

In [157]:
schema_stocktrade = StructType([
    StructField('payload', StructType([
        StructField("side", StringType(),  True),
        StructField("quantity", IntegerType(), True),
        StructField("symbol", StringType(),  True),
        StructField("price", IntegerType(),  True),
        StructField("account", StringType(),  True),
        StructField("userid", StringType(),  True)      
    ]))
])

In [158]:
json_stream_df = string_stream_df\
    .withColumn("value", F.from_json("value", schema_stocktrade))

In [159]:
json_stream_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- payload: struct (nullable = true)
 |    |    |-- side: string (nullable = true)
 |    |    |-- quantity: integer (nullable = true)
 |    |    |-- symbol: string (nullable = true)
 |    |    |-- price: integer (nullable = true)
 |    |    |-- account: string (nullable = true)
 |    |    |-- userid: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [186]:
json_stream = json_stream_df \
    .writeStream \
    .format("memory") \
    .queryName("extract_stock_view") \
    .start()

IllegalArgumentException: Cannot start query with name extract_stock_view as a query with that name is already active in this SparkSession

In [187]:
clear_output(wait=True)
display(spark.sql('SELECT * FROM extract_stock_view').show(20))
time.sleep(1)

+-----+--------------------+------------+---------+------+--------------------+-------------+
|  key|               value|       topic|partition|offset|           timestamp|timestampType|
+-----+--------------------+------------+---------+------+--------------------+-------------+
|ZWZZT|{{BUY, 1769, ZWZZ...|stock-trades|        0|     0|2021-06-05 07:40:...|            0|
|ZXZZT|{{SELL, 998, ZXZZ...|stock-trades|        0|     1|2021-06-05 07:40:...|            0|
|ZVZZT|{{SELL, 4974, ZVZ...|stock-trades|        0|     2|2021-06-05 07:40:...|            0|
|ZVZZT|{{BUY, 1143, ZVZZ...|stock-trades|        0|     3|2021-06-05 07:40:...|            0|
|ZXZZT|{{SELL, 6, ZXZZT,...|stock-trades|        0|     4|2021-06-05 07:40:...|            0|
|ZTEST|{{BUY, 1014, ZTES...|stock-trades|        0|     5|2021-06-05 07:40:...|            0|
| ZBZX|{{SELL, 2883, ZBZ...|stock-trades|        0|     6|2021-06-05 07:40:...|            0|
|  ZVV|{{BUY, 104, ZVV, ...|stock-trades|        0|     7|20

None

In [194]:
json_stream.stop()

In [200]:
stocktrade_stream_df = json_stream_df \
    .select( \
        F.col("key").alias("event_key"), \
        F.col("topic").alias("event_topic"), \
        F.col("timestamp").alias("event_timestamp"), \
        "value.payload.account", \
        "value.payload.symbol", \
        "value.payload.side", \
        "value.payload.price", \
        "value.payload.quantity", \
        "value.payload.userid"
    )

In [201]:
stocktrade_stream_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- account: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- side: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- userid: string (nullable = true)



In [205]:
stocktrade_stream = stocktrade_stream_df \
    .writeStream \
    .format("memory") \
    .queryName("stocktrade_view") \
    .start()

In [204]:
clear_output(wait=True)
display(spark.sql('SELECT * FROM stocktrade_view').show(20))
time.sleep(1)

+---------+------------+--------------------+-------+------+----+-----+--------+------+
|event_key| event_topic|     event_timestamp|account|symbol|side|price|quantity|userid|
+---------+------------+--------------------+-------+------+----+-----+--------+------+
|    ZWZZT|stock-trades|2021-06-05 07:40:...| XYZ789| ZWZZT| BUY|  161|    1769|User_4|
|    ZXZZT|stock-trades|2021-06-05 07:40:...| LMN456| ZXZZT|SELL|  327|     998|User_6|
|    ZVZZT|stock-trades|2021-06-05 07:40:...| ABC123| ZVZZT|SELL|  414|    4974|User_1|
|    ZVZZT|stock-trades|2021-06-05 07:40:...| XYZ789| ZVZZT| BUY|  572|    1143|User_3|
|    ZXZZT|stock-trades|2021-06-05 07:40:...| LMN456| ZXZZT|SELL|  785|       6|User_4|
|    ZTEST|stock-trades|2021-06-05 07:40:...| ABC123| ZTEST| BUY|  389|    1014|User_2|
|     ZBZX|stock-trades|2021-06-05 07:40:...| ABC123|  ZBZX|SELL|  128|    2883|User_7|
|      ZVV|stock-trades|2021-06-05 07:40:...| ABC123|   ZVV| BUY|  976|     104|User_4|
|    ZVZZT|stock-trades|2021-06-

None

In [207]:
clear_output(wait=True)
display(spark.sql('SELECT event_key, COUNT(1) AS count, round(mean(price),0) as price, round(mean(quantity),0) as qty FROM stocktrade_view GROUP BY 1').show(20))
time.sleep(1)

+---------+-----+-----+------+
|event_key|count|price|   qty|
+---------+-----+-----+------+
|    ZXZZT|22195|503.0|2506.0|
|    ZVZZT|22120|503.0|2506.0|
|    ZTEST|22172|500.0|2488.0|
|    ZWZZT|22217|503.0|2494.0|
|    ZJZZT|22123|504.0|2501.0|
|      ZVV|22354|502.0|2487.0|
|     ZBZX|22194|504.0|2496.0|
+---------+-----+-----+------+



None

In [203]:
stocktrade_stream.stop()

In [180]:
window_duration = '60 seconds'
slide_duration = '10 seconds'

In [183]:
windowed_count_df = stocktrade_stream_df \
    .withWatermark("event_timestamp", "1 minutes") \
    .groupBy(F.window(stocktrade_stream_df.event_timestamp, window_duration, slide_duration), stocktrade_stream_df.symbol) \
    .count()

In [184]:
count_stream = windowed_count_df \
    .writeStream \
    .format("memory") \
    .outputMode("Complete") \
    .queryName("count_view") \
    .start()

In [185]:
while True:
    clear_output(wait=True)
    display(spark.sql('SELECT * FROM count_view LIMIT 20').show())
    time.sleep(1)

KeyboardInterrupt: 

In [41]:
count_stream.stop()