# Notebook Running Kafka

## Random Weather Generation

In [1]:
import datetime, time, random, string

def one_station(name):
    """Endless generator of simulated daily weather for a single station.

    Yields ``(date, name, temp, raining)`` tuples, one per calendar day,
    starting on 2000-01-01. ``date`` is a ``YYYY-MM-DD`` string, ``temp`` is a
    float and ``raining`` is a bool. All randomness comes from the global
    ``random`` module, so seeding it makes the sequence reproducible.
    """
    def jitter(width):
        # uniform noise centred on zero, spanning `width` in total
        return (random.random()-0.5) * width

    # per-station temperature profile: base monthly averages, moved by one
    # station-wide offset plus a little independent noise per month
    base_temps = [27,31,44,58,70,79,83,81,74,61,46,32]
    temp_offset = jitter(30)
    monthly_temp = [m + temp_offset + jitter(5) for m in base_temps]

    # per-station rain profile: chance that rain starts on a dry day (by
    # month) and chance that it stops on a wet day (same all year)
    base_start = [0.1,0.1,0.3,0.5,0.4,0.2,0.2,0.1,0.2,0.2,0.2,0.1]
    rain_offset = jitter(0.1)
    p_start = [r + rain_offset + jitter(0.2) for r in base_start]
    p_stop = 0.2 + random.random() * 0.2

    # state carried from one day to the next
    one_day = datetime.timedelta(days=1)
    today = datetime.date(2000, 1, 1)
    temp = monthly_temp[0]
    raining = False

    while True:
        month = today.month - 1

        # drift 20% of the way toward the monthly average, plus daily noise
        temp = temp * 0.8 + monthly_temp[month] * 0.2 + jitter(20)

        if temp < 32:
            # below 32 degrees the station never reports rain
            raining = False
        elif raining:
            if random.random() < p_stop:
                raining = False
        elif random.random() < p_start[month]:
            raining = True

        yield (today.strftime("%Y-%m-%d"), name, temp, raining)

        today = today + one_day
        
def all_stations(count=10, sleep_sec=1):
    """Endlessly interleave the daily reports of ``count`` simulated stations.

    Stations are named with the first ``count`` uppercase ASCII letters, which
    is why ``count`` may not exceed 26. For each simulated day one report per
    station is yielded (in alphabetical station order); the generator then
    sleeps ``sleep_sec`` seconds before moving on to the next day.
    """
    assert count <= 26
    generators = [one_station(letter) for letter in string.ascii_uppercase[:count]]
    while True:
        # one simulated day: exactly one report from every station
        for generator in generators:
            yield next(generator)
        time.sleep(sleep_sec)

In [2]:
# loops forever because the weather never ends...
#for row in all_stations(3):
    #print(row) # date, station, temp, raining

## Kafka Client

In [3]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer, TopicPartition
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError, UnknownTopicOrPartitionError

# Reset both topics so that every run of the notebook starts from empty logs.
admin = KafkaAdminClient(bootstrap_servers=["kafka:9092"])
try:
    admin.delete_topics(["stations", "stations-json"])
    print("deleted")
except UnknownTopicOrPartitionError:
    print("cannot delete (may not exist yet)")

time.sleep(1)

# The broker may still report a topic as existing right after the delete
# request (presumably because the deletion has not finished yet), so wait and
# retry instead of letting TopicAlreadyExistsError abort the cell. The error
# is re-raised once the attempts are used up.
CREATE_ATTEMPTS = 10
for topic_name in ("stations", "stations-json"):
    for attempt in range(CREATE_ATTEMPTS):
        try:
            admin.create_topics(
                [NewTopic(name=topic_name, num_partitions=6, replication_factor=1)]
            )
            break
        except TopicAlreadyExistsError:
            if attempt == CREATE_ATTEMPTS - 1:
                raise
            time.sleep(1)
admin.list_topics()

deleted


['stations-json', 'stations', '__consumer_offsets']

## Building the proto file

In [4]:
! python3 -m grpc_tools.protoc -I=. --python_out=. stations.proto

## Kafka Producer

In [5]:
# thread-safe print
from threading import Thread, Lock
import json

# One module-wide lock shared by every caller of Print.
lock = Lock()

def Print(*args):
    """Forward ``args`` to ``print`` while holding the shared lock.

    Only one thread at a time can be inside the ``print`` call, so output
    written through this function by different threads is not interleaved.
    """
    lock.acquire()
    try:
        print(*args)
    finally:
        # always hand the lock back, even if print() raises
        lock.release()

In [6]:
from stations_pb2 import *

In [7]:
def produce():
    """Stream generated weather reports to the two Kafka topics.

    Every report coming out of ``all_stations(15)`` is sent twice, keyed by
    the station name encoded as UTF-8:

    * to ``stations`` as a serialized ``Report`` protobuf message;
    * to ``stations-json`` as a UTF-8 encoded JSON object in which
      ``raining`` is stored as 0 or 1.

    ``all_stations`` never runs out of data, so this function does not return
    on its own.
    """
    Print("sending station data to topics...")
    # Aim for at-least-once delivery:
    # - acks="all": a send is only acknowledged once all in-sync replicas
    #   have received the record;
    # - retries=10: a failed send is retried up to 10 times.
    # Duplicates that this can cause are filtered out on the consumer side,
    # which is what gets us to exactly-once semantics overall.
    producer = KafkaProducer(bootstrap_servers=["kafka:9092"], acks="all", retries=10)

    for date, station, degrees, raining in all_stations(15):
        key = station.encode("utf-8")

        # protobuf payload for the "stations" topic
        report = Report(date=date, station=station, degrees=degrees, raining=raining)
        producer.send("stations", value=report.SerializeToString(), key=key)

        # JSON payload for the "stations-json" topic (raining as 0/1)
        record = {
            "date": date,
            "station": station,
            "degrees": degrees,
            "raining": 1 if raining else 0,
        }
        producer.send("stations-json", value=json.dumps(record).encode("utf-8"), key=key)
        
# launch produce() on its own thread
producer_thread = Thread(target=produce)
producer_thread.start()

sending station data to topics...


## Kafka Consumer

In [8]:
import os, json

# Remove any partition-<n>.json snapshot files (n = 0..5) that already exist
snapshot_paths = [f"partition-{partition}.json" for partition in range(6)]
for path in filter(os.path.exists, snapshot_paths):
    os.remove(path)

In [9]:
def load_partition(partition_num):
    """Return the saved snapshot dict for ``partition_num``.

    The snapshot is read from ``partition-<partition_num>.json`` in the
    current working directory. When that file does not exist, a fresh
    snapshot ``{"partition": partition_num, "offset": 0}`` is returned.
    """
    path = f"partition-{partition_num}.json"
    if not os.path.exists(path):
        # nothing saved yet: start from the beginning of the partition
        return {"partition": partition_num, "offset": 0}
    with open(path, "r") as file:
        return json.load(file)

def save_partition(partition):
    """Write a snapshot dict to its JSON file, replacing any previous content.

    The target file is ``partition-<n>.json`` in the current working
    directory, where ``n`` is the snapshot's own ``"partition"`` entry.
    """
    number = partition['partition']
    path = f"partition-{number}.json"
    with open(path, "w") as out:
        json.dump(partition, out)

In [10]:
# give the producer time to publish some reports before any consumer starts
PRODUCER_WARMUP_SEC = 90
time.sleep(PRODUCER_WARMUP_SEC)

In [11]:
def _record_report(snapshot, report):
    """Fold one station report into a partition snapshot, in place.

    ``snapshot`` maps station names to per-station statistics (next to the
    snapshot's own bookkeeping entries). ``report`` must expose ``station``,
    ``date`` and ``degrees``.
    """
    if report.station not in snapshot:
        # First report for this station. Keys are written in alphabetical
        # order so that the saved JSON has a stable layout.
        snapshot[report.station] = {
            "avg": report.degrees,
            "count": 1,
            "end": report.date,
            "start": report.date,
            "sum": report.degrees,
        }
        return

    stats = snapshot[report.station]
    # Dates are compared as strings. A report that is not newer than the last
    # counted date is treated as already processed and ignored, so re-reading
    # the same messages never counts them twice.
    if report.date > stats["end"]:
        stats["count"] += 1
        stats["sum"] += report.degrees
        stats["end"] = report.date
        stats["avg"] = stats["sum"] / stats["count"]


def consume(part_nums=(), iterations=10):
    """Aggregate reports from some ``stations`` partitions into JSON snapshots.

    Args:
        part_nums: partition numbers of the ``stations`` topic to read.
        iterations: how many times to poll the broker (1 s timeout each).

    For every partition the previous snapshot is loaded with
    ``load_partition`` and reading resumes at the offset stored in it (0 when
    there is none). Each polled batch updates the per-station statistics and
    the stored offset. After the last poll every snapshot is written back with
    ``save_partition`` and ``"exiting"`` is printed. The consumer is closed on
    the way out, including when an exception is raised.
    """
    consumer = KafkaConsumer(bootstrap_servers=["kafka:9092"])
    try:
        topic_parts = [TopicPartition("stations", num) for num in part_nums]
        consumer.assign(topic_parts)

        # PART 1: initialization
        partitions = {}  # key=partition num, value=snapshot dict
        for num, topic_part in zip(part_nums, topic_parts):
            snapshot = load_partition(num)
            # save_partition derives the file name from this entry
            snapshot.setdefault("partition", num)
            partitions[num] = snapshot
            consumer.seek(topic_part, snapshot.get("offset", 0))

        # PART 2: process batches
        for _ in range(iterations):
            batch = consumer.poll(timeout_ms=1000)
            for topic_part, messages in batch.items():
                pnum = topic_part.partition
                snapshot = partitions[pnum]
                for msg in messages:
                    _record_report(snapshot, Report.FromString(msg.value))

                # remember where to resume, next to the data it covers
                snapshot["offset"] = consumer.position(topic_part)
                partitions[pnum] = dict(sorted(snapshot.items()))  # ordering

        # PART 3: persist the snapshots
        for snapshot in partitions.values():
            save_partition(snapshot)
        print("exiting")
    finally:
        # release the consumer's network resources no matter how we leave
        consumer.close()

#starting consumer threads
# Two rounds; in each one, three consumer threads split the six partitions
# between them and the round ends once all three have finished.
PARTITION_GROUPS = ([0, 1], [2, 3], [4, 5])
for round_num in range(2):
    print("ROUND", round_num)
    workers = [Thread(target=consume, args=(group, 30)) for group in PARTITION_GROUPS]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

ROUND 0
exiting
exiting
exiting
ROUND 1
exiting
exiting
exiting


In [12]:
!cat partition*.json

{"N": {"avg": 45.73553359018883, "count": 139, "end": "2000-05-18", "start": "2000-01-01", "sum": 6357.239169036247}, "offset": 139, "partition": 0}{"E": {"avg": 46.032538477510876, "count": 139, "end": "2000-05-18", "start": "2000-01-01", "sum": 6398.522848374012}, "O": {"avg": 29.683402734456237, "count": 139, "end": "2000-05-18", "start": "2000-01-01", "sum": 4125.992980089417}, "offset": 278, "partition": 1}{"F": {"avg": 38.37035950738978, "count": 136, "end": "2000-05-15", "start": "2000-01-01", "sum": 5218.36889300501}, "I": {"avg": 55.505637464421795, "count": 136, "end": "2000-05-15", "start": "2000-01-01", "sum": 7548.766695161364}, "J": {"avg": 36.096625784804075, "count": 136, "end": "2000-05-15", "start": "2000-01-01", "sum": 4909.141106733354}, "offset": 408, "partition": 2}{"D": {"avg": 47.63517607643261, "count": 136, "end": "2000-05-15", "start": "2000-01-01", "sum": 6478.383946394834}, "G": {"avg": 43.2117556306084, "count": 136, "end": "2000-05-15", "start": "2000-01-