# Lab 8: Exercises for Spark Streaming

In [None]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext 
import pyspark.sql.functions as F
from pyspark import SparkConf

import os 
# Interpreter for the PySpark Python workers. The same path is exported through
# the PYSPARK_PYTHON environment variable and through spark.executorEnv below,
# so the two settings cannot drift apart.
PYSPARK_PYTHON_BIN = "/root/anaconda3/bin/python"
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON_BIN

# If a session already exists, run spark.stop() first so the settings below
# are applied to a fresh session.
# Optional dependency noted by the original author: pip install kafka-python

# Jars needed by the Structured Streaming Kafka source. They are joined with
# bare commas so that no indentation whitespace ends up inside the "spark.jars"
# value: the previous backslash-continued string literal carried the next
# line's leading spaces in front of two of the paths.
KAFKA_JARS = ",".join([
    "/shareddata/lab08/kafka-clients-3.5.0.jar",
    "/shareddata/lab08/spark-sql-kafka-0-10_2.12-3.5.0.jar",
    "/shareddata/lab08/spark-token-provider-kafka-0-10_2.12-3.5.0.jar",
    "/shareddata/lab08/commons-pool2-2.12.0.jar",
])

spark = (
    SparkSession.builder
    .appName("lab8_exercise")
    .config("spark.jars", KAFKA_JARS)
    .config("spark.executorEnv.PYSPARK_PYTHON", PYSPARK_PYTHON_BIN)
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .config("spark.log.level", "WARN")
    .getOrCreate()
)


## Part A: Basic Spark Streaming Operations

### (A-1): define data source

Load streaming data from json files. 

In [None]:
# A one-off batch read of the JSON directory is used only to obtain the
# schema, which the streaming reader below is given explicitly.
static = spark.read.json("/shareddata/data/activity-data/")
dataSchema = static.schema

# Streaming reader over the same directory; the "maxFilesPerTrigger" option is
# set to 1 so the data arrives incrementally.
streaming = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json("/shareddata/data/activity-data")
)
streaming.printSchema()

### (A-2): Process data
You can also try other operations on the data like filtering, etc.

In [None]:
# Running number of records for each value of the "gt" column.
activityCounts = (
    streaming
    .groupBy("gt")
    .count()
)

### (A-3): Define data sink and trigger, start the streaming query

In [None]:
# Write the running counts to an in-memory table named after the query
# ("activity_counts") so they can be inspected with spark.sql.
#
# The aggregation is emitted in "complete" mode: the memory sink documents
# only the Append and Complete output modes, and with "update" the table
# would keep every intermediate count instead of one current row per "gt".
#
# For debugging, swap format("memory") for format("console"); the console
# sink also accepts outputMode("update"), which prints only changed rows.
activityQuery = (
    activityCounts.writeStream
    .queryName("activity_counts")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="2 seconds")
    .start()
)

### (A-4) Display data from the in-memory table `activity_counts` (named after the query)

In [None]:
from time import sleep
# Show the in-memory result table three times, pausing two seconds after each
# display so that later triggers have a chance to update it.
for _poll in range(3):
    activity_table = spark.sql("SELECT * FROM activity_counts")
    activity_table.show()
    sleep(2)

### (A-5) Stop the streaming query

In [None]:
# Give the query up to 10 more seconds to make progress, then stop it.
# Previously the stop() call was commented out, so the query kept running
# in the background while the later examples started their own queries.
activityQuery.awaitTermination(10)
activityQuery.stop()

## Part B: More examples of Spark Streaming

### Example 1: Simple Transform

In [None]:

# Keep only the records whose "gt" label contains "stairs" (and is not null),
# and project the four columns of interest.
stairsOnly = (
    streaming
    .withColumn("stairs", F.expr("gt like '%stairs%'"))
    .where("stairs")
    .where("gt is not null")
    .select("gt", "model", "arrival_time", "creation_time")
)

# Append the filtered rows to the in-memory table "simple_transform",
# triggering every two seconds. `simpleTransform` is the running query handle.
simpleTransform = (
    stairsOnly.writeStream
    .queryName("simple_transform")
    .format("memory")
    .outputMode("append")
    .trigger(processingTime="2 seconds")
    .start()
)

In [None]:
# Display the rows currently held in the "simple_transform" in-memory table.
stairsRows = spark.sql("SELECT * FROM simple_transform")
stairsRows.show()

In [None]:
# Stop the "simple_transform" streaming query; `simpleTransform` holds the
# handle returned by start().
simpleTransform.stop()

### Example 2: Aggregate data 

In [None]:

# Average every numeric column over the cube of ("gt", "model"), discard the
# averages of the time and index columns, and publish the complete result to
# the in-memory table "device_counts". `deviceModelStats` is the query handle.
deviceModelStats = (
    streaming
    .cube("gt", "model")
    .avg()
    .drop("avg(Arrival_time)", "avg(Creation_Time)", "avg(Index)")
    .writeStream
    .queryName("device_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)


In [None]:
# Display the current contents of the "device_counts" in-memory table.
deviceRows = spark.sql("SELECT * FROM device_counts")
deviceRows.show()

In [None]:
# Stop the "device_counts" streaming query; `deviceModelStats` holds the
# handle returned by start().
deviceModelStats.stop()

### Example 3: Use socket source 

Open a tmux window, then run the following commands:

* First, install the netcat package with `apt install netcat`.
* Check the installation with `nc -h`.
* Start a listening socket with `nc -lk 9999`.

Note that most ports are disabled. 

In [None]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext 
import pyspark.sql.functions as F

# Obtain the Spark session under the application name used for this example.
spark = (
    SparkSession.builder
    .appName("StructuredNetworkWordCount")
    .getOrCreate()
)

# Create DataFrame representing the stream of input lines from connection to
# localhost:9999. The port must be the one netcat is listening on
# (`nc -lk 9999`); the code previously connected to 8888, which did not match
# the documented listener.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Split each line on single spaces and emit one row per word.
words = lines.select(
    F.explode(F.split(lines.value, " ")).alias("word")
)

# Generate running word count
wordCounts = words.groupBy("word").count()


In [None]:
# Start the word-count query: the complete counts table is written to the
# console sink. `query` is the handle of the running streaming query.
# Optional variations left by the original author:
#   - call query.awaitTermination() here to block until the query ends
#   - add .queryName("word_count2") to the writer to name the query
query = (
    wordCounts.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

In [None]:
# Left disabled: the word-count query is started with format("console") and
# without a query name, so there is presumably no "word_count" table to
# select from unless the sink is changed to memory and the query is named.
# spark.sql("SELECT * FROM word_count").show(5)

In [None]:
# How long the word-count query is left running before it is stopped.
# Increase this value to have more time for typing input into netcat.
WORD_COUNT_RUN_SECONDS = 60

# Wait for a bounded time instead of blocking the kernel indefinitely, and
# always stop the query afterwards (also when the cell is interrupted) so it
# does not keep running in the background.
try:
    query.awaitTermination(WORD_COUNT_RUN_SECONDS)
finally:
    query.stop()

## Part C: Example of using Kafka



### How to use Kafka in Spark

[How to install python packages in spark driver and executor](https://spark.apache.org/docs/latest/api/python/user_guide/python_packaging.html)

For example, if we use the `archives` method,

* First, install conda-pack with `conda install conda-pack`.
* Then generate the archived conda environment with `conda pack -f -o pyspark_conda_env.tar.gz --ignore-missing-files`.
* Run the PySpark script with `$sparkloc/bin/spark-submit --jars /data/jupyter-data/lab08/spark-sql-kafka-0-10_2.12-3.3.1.jar,/data/jupyter-data/lab08/spark-token-provider-kafka-0-10_2.12-3.3.1.jar,/data/jupyter-data/lab08/commons-pool2-2.11.1.jar --archives /data/jupyter-data/lab08/pyspark_conda_env.tar.gz /data/jupyter-data/lab08_spark_kafka.py`.

### Run kafka producer

See `lab08_kafka.sh`. 

### Define streaming source and sink

In [1]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext 
import pyspark.sql.functions as F
from pyspark import SparkConf

import os 
# Interpreter for the PySpark Python workers. The same path is exported through
# the PYSPARK_PYTHON environment variable and through spark.executorEnv below.
# Previously the executor setting pointed at "/root/anaconda3/bin/python"
# while the environment variable pointed at the miniconda "sp" environment.
# NOTE(review): confirm this interpreter exists wherever executors run.
PYSPARK_PYTHON_BIN = "/root/miniconda3/envs/sp/bin/python"
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON_BIN

# If a session already exists, run spark.stop() first so the settings below
# are applied to a fresh session.

# Jars needed by the Structured Streaming Kafka source. They are joined with
# bare commas so that no indentation whitespace ends up inside the "spark.jars"
# value: the previous backslash-continued string literal carried the next
# line's leading spaces in front of two of the paths.
KAFKA_JARS = ",".join([
    "/shareddata/lab08/kafka-clients-3.5.0.jar",
    "/shareddata/lab08/spark-sql-kafka-0-10_2.12-3.5.0.jar",
    "/shareddata/lab08/spark-token-provider-kafka-0-10_2.12-3.5.0.jar",
    "/shareddata/lab08/commons-pool2-2.12.0.jar",
])

spark = (
    SparkSession.builder
    .appName("lab8_exercise")
    .config("spark.jars", KAFKA_JARS)
    .config("spark.executorEnv.PYSPARK_PYTHON", PYSPARK_PYTHON_BIN)
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .config("spark.log.level", "WARN")
    .getOrCreate()
)

24/04/07 03:11:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Setting Spark log level to "WARN".


In [3]:
# Streaming reader subscribed to the single Kafka topic "lab08" on the broker
# at localhost:9092, with "startingOffsets" set to "earliest".
df1 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "lab08")
    .option("startingOffsets", "earliest")
    .load()
)


In [4]:

# Cast the Kafka key and value to strings and collect the records in the
# in-memory table "kafka_transform", triggering every five seconds.
# The StreamingQuery handle is bound to `kafkaQuery` (it was previously
# discarded) so that the query can later be stopped with kafkaQuery.stop().
kafkaQuery = (
    df1.selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .queryName("kafka_transform")
    .format("memory")
    .outputMode('update')
    .trigger(processingTime="5 seconds")
    .start()
)
# Echo the handle so the cell still displays the started query.
kafkaQuery

24/04/07 03:12:10 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-732a0e7d-25d3-4704-a0f2-a73082d7e6fa. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/04/07 03:12:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7f34c12ba5d0>

                                                                                

In [5]:
# Extract the "Index" and "Device" fields from the JSON payload of every
# record collected so far and report how many rows there are.
deviceIndexSql = "SELECT get_json_object(f.value, '$.Index') as idx,get_json_object(f.value, '$.Device') as Device FROM kafka_transform f "
spark.sql(deviceIndexSql).count()

47659

In [6]:
# Same projection of the JSON payload ("Index" and "Device"), returning the
# last five rows collected so far.
deviceIndexSql = "SELECT get_json_object(f.value, '$.Index') as idx,get_json_object(f.value, '$.Device') as Device FROM kafka_transform f "
spark.sql(deviceIndexSql).tail(5)

[Row(idx='3971', Device='nexus4_1'),
 Row(idx='4011', Device='nexus4_1'),
 Row(idx='4051', Device='nexus4_1'),
 Row(idx='4091', Device='nexus4_1'),
 Row(idx='3954', Device='nexus4_2')]

24/04/07 03:18:21 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (4aea190b2e55/172.17.0.3:9092) could not be established. Broker may not be available.
24/04/07 03:18:21 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (4aea190b2e55/172.17.0.3:9092) could not be established. Broker may not be available.
24/04/07 03:18:21 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (4aea190b2e55/172.17.0.3:9092) could not be established. Broker may not be available.
24/04/07 03:18:22 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (4aea190b2e55/172.17.0.3:9092) could not be established. Broker may not be available.
24/04/07 03:18:22 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (4aea190b2e55/172.17.0.3:9092) could not be established. Broker may not be available.
24/04/07 03:18:23 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to n

# END


Thank you 