In [None]:
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime
import json

# Kafka topic the flood records are read from.
KAFKA_TOPIC = "uk-flood"
# Kafka broker address, passed to the consumer as "metadata.broker.list".
BOOTSTRAP_SERVER = "localhost:9092"

# `sc` is not defined in this part of the file -- presumably the SparkContext
# provided by the notebook session (TODO confirm).
ssc = StreamingContext(sc,1)  # batch interval of 1 second
# Directory where Spark Streaming writes its checkpoint data (relative path).
ssc.checkpoint("./checkpoint")
# Direct (receiver-less) Kafka stream; the function below reads the message
# payload from index 1 of every record.
lines = KafkaUtils.createDirectStream(ssc, [KAFKA_TOPIC],
                                      {"metadata.broker.list": BOOTSTRAP_SERVER})

# Count, for every river or sea, how many flood items of each severity level
# arrived within each window of the stream.
def river_highest_frequency_severe_flood(lines, window_length=10, sliding_interval=10):
    """Build a DStream of per-river severity-level frequencies.

    The payload of every record (``line[1]``) is parsed as JSON and must be an
    iterable of flood items.  Each item has to provide
    ``item['floodArea']['riverOrSea']`` (one or more names separated by
    ``', '``) and ``item['severityLevel']``.

    Args:
        lines: DStream of records whose element at index 1 is a JSON string.
        window_length: Window duration handed to ``DStream.window``.
        sliding_interval: Slide duration handed to ``DStream.window``.

    Returns:
        A DStream whose RDDs hold one dict per (river/sea name, severity
        level) pair with the keys ``flood_warning_timestamp``,
        ``riverOrSeaName``, ``severity_level`` and ``frequency``.  Every name
        seen in the window yields four rows (levels 1-4), including levels
        with a frequency of 0.  An empty window yields an empty RDD.
    """
    severity_levels = (1, 2, 3, 4)

    def count_river_frequencies(rdd):
        # NOTE: collect() brings the whole window to the driver, so the
        # counting below runs on the driver and the window must fit in its
        # memory.
        items = rdd.collect()

        frequencies = {}
        for item in items:
            level = item['severityLevel']
            for name in item['floodArea']['riverOrSea'].split(', '):
                if name not in frequencies:
                    frequencies[name] = dict.fromkeys(severity_levels, 0)
                counts = frequencies[name]
                # Count every occurrence, the first one included.  Levels
                # outside 1-4 are ignored instead of wrapping around to
                # another bucket or raising an IndexError.
                if level in counts:
                    counts[level] += 1

        # One timestamp per window so that all rows of a batch agree.
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        result = [
            {
                "flood_warning_timestamp": timestamp,
                "riverOrSeaName": name,
                "severity_level": level,
                "frequency": counts[level],
            }
            for name, counts in frequencies.items()
            for level in severity_levels
        ]
        return rdd.context.parallelize(result)

    # Every Kafka message carries a JSON array; flatten it into single items.
    data = lines.map(lambda line: json.loads(line[1])).flatMap(lambda items: items)
    windowed_data = data.window(window_length, sliding_interval)
    return windowed_data.transform(count_river_frequencies)

# Build the frequency stream. window_length equals sliding_interval, so
# consecutive windows do not overlap.
result = river_highest_frequency_severe_flood(lines, window_length=10, sliding_interval=10)
# Register a print of each generated RDD's leading elements as the stream output.
# NOTE(review): ssc.start() is not called in the visible code -- presumably
# the streaming context is started in a later cell; confirm.
result.pprint()