In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Normalizer, StandardScaler
import random
import logging
import time
import json
from datetime import datetime

from pymongo import MongoClient
# MongoDB handles: database "ICS5114", collection "glacial_collection" on the
# local server. data_process() inserts documents into `collection`.
client = MongoClient(host='localhost', port=27017)
db = client["ICS5114"]
collection = db["glacial_collection"]

# Kafka connection settings used by the streaming reader.
kafka_bootstrap_servers = 'localhost:9092'
kafka_topic_name = "ICS5114"

# Running aggregates, updated across micro-batches by data_process():
# number of records seen, sum of their "dh" values, and the resulting mean.
total_docs, current_total_dh, average_dh = 0, 0, 0

# Local Spark session (master "local[*]"); reuses an existing session if one
# is already active in this process.
spark = (
    SparkSession.builder
    .appName("Structured Streaming ")
    .master("local[*]")
    .getOrCreate()
)

# Only show ERROR-level Spark log output.
spark.sparkContext.setLogLevel("ERROR")

# Streaming DataFrame subscribed to the Kafka topic named by `kafka_topic_name`,
# starting from the latest offsets and not failing the query on data loss.
# NOTE(review): "spark.streaming.kafka.maxRatePerPartition" looks like a
# DStream-era SparkConf key rather than a Structured Streaming source option;
# the Kafka source documents "maxOffsetsPerTrigger" for rate limiting --
# confirm whether this option has any effect here.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic_name)
    .option("startingOffsets", "latest")
    .option("failOnDataLoss", "false")
    .option("spark.streaming.kafka.maxRatePerPartition", "10")
    .load()
)

def data_process(df, batch_id):
    """Store one micro-batch of Kafka messages and update the running ``dh`` mean.

    Used as the ``foreachBatch`` callback of the streaming query. Every message
    value is decoded as UTF-8 JSON, inserted into the module-level MongoDB
    ``collection``, and its ``"dh"`` field is added to the module-level running
    totals (``total_docs``, ``current_total_dh``, ``average_dh``).

    Unusable records (null value, invalid UTF-8 or JSON, payload that is not a
    JSON object, missing or non-numeric ``"dh"``) are logged and skipped
    *before* anything is written, so one bad message neither aborts the batch
    nor leaves a document in MongoDB that the counters do not account for.

    Args:
        df: Micro-batch DataFrame; only its ``value`` column is read.
        batch_id: Identifier of the micro-batch; used only in log messages.
    """
    global average_dh, total_docs, current_total_dh
    print("processing....")

    # Bring only the payload column to the driver, not the whole Kafka row.
    payloads = [row.value for row in df.select("value").collect()]

    for payload in payloads:
        if payload is None:
            logging.warning("batch %s: skipping message with null value", batch_id)
            continue

        try:
            record = json.loads(payload.decode("utf-8"))
            dh = record["dh"]
        except (ValueError, KeyError, TypeError) as exc:
            # ValueError covers both invalid UTF-8 and invalid JSON; TypeError
            # covers payloads that parse to something other than an object.
            logging.warning("batch %s: skipping malformed record: %r", batch_id, exc)
            continue

        if not isinstance(dh, (int, float)):
            logging.warning("batch %s: skipping record with non-numeric dh: %r", batch_id, dh)
            continue

        collection.insert_one(record)
        total_docs += 1
        current_total_dh += dh

    # Nothing processed yet means total_docs is 0: keep the initial average.
    if total_docs:
        average_dh = current_total_dh / total_docs
    print("current elevation change average : " + str(average_dh))


# Start the streaming query: every 5 seconds the new Kafka messages are handed
# to data_process() as one micro-batch. The StreamingQuery handle is kept in
# k_df (rather than the None returned by awaitTermination) so the query can be
# inspected or stopped.
k_df = (
    kafka_df.writeStream
    .foreachBatch(data_process)
    .trigger(processingTime="5 seconds")
    .start()
)

try:
    # Blocks until the query stops or fails.
    k_df.awaitTermination()
finally:
    # If the wait is interrupted (e.g. the notebook cell is stopped), make sure
    # the query does not keep running in the background.
    k_df.stop()

  
# The foreach method is not supported in Python, so the foreachBatch method was used instead --
# def row_process(row):
#     print(row)
    
# k_w = (kafka_df.writeStream \
#     .foreach(row_process) \
#     .trigger(processingTime="1 second") \
#     .start() \
#     .awaitTermination())
# ---------------------------------------------------------------------------------------

processing....
current elevation change average : 0
processing....
current elevation change average : -4.0470805691978144
processing....
current elevation change average : -4.641864895863084
