# Week 5: Data Ingestion (Kafka)


![](https://camo.githubusercontent.com/56166d361c3975dee750ecce16d605bbbf66516b/68747470733a2f2f75706c6f61642e77696b696d656469612e6f72672f77696b6970656469612f636f6d6d6f6e732f352f35332f4170616368655f6b61666b615f776f7264747970652e737667)

### Student ID: [#####]
### Subtasks Done: [#,#,..]

# Working with sensor data

We want to monitor the status of three smart buildings.
Each building has 8 floors and each floor has 20 rooms, that have a max capacity of 10 people each.

Rooms are equipped with sensors that counts how many people are currently inside the rooms. 

Due to COVID-19, we want monitor how many people are in the various rooms, floors, and buildings.

![](./buildings.png)

In [12]:
!pip list

Package                            Version
---------------------------------- ----------------------
absl-py                            0.10.0
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.12
anaconda-project                   0.8.3
aniso8601                          7.0.0
AnyQt                              0.0.10
applaunchservices                  0.2.1
appnope                            0.1.0
appscript                          1.1.1
argh                               0.26.2
argon2-cffi                        20.1.0
arviz                              0.10.0
ase                                3.20.1
asn1crypto                         1.4.0
astroid                            2.4.2
astropy                            4.0.1.post1
asttokens                          2.0.3
astunparse                         1.6.3
atomicwrites                       1.4.0
attrs                              20.1.0


torch                              1.6.0
torch-cluster                      1.5.7
torch-geometric                    1.6.1
torch-scatter                      2.0.5
torch-sparse                       0.6.7
torch-spline-conv                  1.2.0
torchvision                        0.7.0
tornado                            6.0.4
tqdm                               4.48.2
traitlets                          5.0.4
tsmoothie                          0.1.7
typing-extensions                  3.7.4.3
ujson                              1.35
umap-learn                         0.4.6
unicodecsv                         0.14.1
uritemplate                        3.0.1
urllib3                            1.25.10
vyper                              0.2.7
watchdog                           0.10.3
wcwidth                            0.2.5
webencodings                       0.5.1
websocket-client                   0.57.0
Werkzeug                           1.0.1
wheel                      

# Notes before starting!!!
 
- you can create as many topics as you want
- each topic in the exercise should have **at least** 2 partitions. 
    - (HINT: to decide how many partition look at task 3)
- we assume a replication factor of 1 for all the topics is sufficient.

## Task 1: Counting People

Write a Kafka Producer that generates the observations every 5 seconds (system time)
for each building, floor, and room, and pushes them to a topic

In [1]:
from confluent_kafka import SerializingProducer, DeserializingConsumer
from confluent_kafka.serialization import StringSerializer, StringDeserializer
from confluent_kafka.serialization import DoubleSerializer, DoubleDeserializer 
from confluent_kafka.serialization import IntegerSerializer, IntegerDeserializer
from confluent_kafka.admin import AdminClient, NewTopic
from uuid import uuid4
import sys, lorem, random, time, json

brokers = "kafka1:9092,kafka2:9093"
topics = ["rooms", "floors", "buildings"] ## Add here your topics

ImportError: dlopen(/Users/hyrundo/opt/anaconda3/lib/python3.8/site-packages/confluent_kafka/cimpl.cpython-38-darwin.so, 2): Symbol not found: _syslog$DARWIN_EXTSN
  Referenced from: /Users/hyrundo/opt/anaconda3/lib/python3.8/site-packages/confluent_kafka/.dylibs/librdkafka.1.dylib (which was built for Mac OS X 10.13)
  Expected in: /usr/lib/libSystem.B.dylib
 in /Users/hyrundo/opt/anaconda3/lib/python3.8/site-packages/confluent_kafka/.dylibs/librdkafka.1.dylib

In [2]:
new_topics = [NewTopic(topic, num_partitions=3, replication_factor=1) for topic in topics]

####  Create new topics

In [3]:
a = AdminClient({'bootstrap.servers': brokers})

In [4]:
a.create_topics(new_topics)

{'rooms': <Future at 0x7f7dcc3a2820 state=running>,
 'floors': <Future at 0x7f7dcc3a2910 state=running>,
 'buildings': <Future at 0x7f7dcc3a29a0 state=running>}

#### Populate the topics with 1000 observations

In [5]:
pconf = {
    'bootstrap.servers': brokers,
    'partitioner': 'murmur2_random',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer':  StringSerializer()
}

In [6]:
p = SerializingProducer(pconf)

In [9]:
for n in range(1,1000):
    try:
        
        m = { 'count' : random.randint(0,9),
              'building' : random.randint(0,2),
              'floor': random.randint(0,7),
              'room': random.randint(0,19) }
       
        p.produce(topics[0], key=str(uuid4()), value=str(m))
        p.poll(0)
        p.flush()
        time.sleep(5)
    except BufferError:
        sys.stderr.write('%% Local producer queue is full (%d messages awaiting delivery): try again\n' % len(p))

## Task 2: Reading observations

Write a Kafka Consumer that reads the previous topic and prints the result out.

In [None]:
consumer_conf = {
    'bootstrap.servers': brokers,
    'key.deserializer': StringDeserializer('utf_8'),
    'value.deserializer': StringDeserializer(),
    'group.id': str(uuid4()),
    'session.timeout.ms': 6000,
    'auto.offset.reset': 'earliest'
}

In [None]:
consumer = DeserializingConsumer(consumer_conf)
consumer.subscribe([topics[0]])

In [None]:
try:
    while True:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            else:
                print(msg.key() +"->"+ str(msg.value()))
except KeyboardInterrupt:
    sys.stderr.write('%% Aborted by user\n')
finally:
    # Close down consumer to commit final offsets.
    consumer.close()

## Task 3: Total number of people in the last minute

Write a Kafka Consumer that reads the previous topics and count
the number of people per floor and per building every minute,

For the one minute window you can either implement a windowing mechanism like in the wordcount practice part 2 or 
use sleep(50) and restart the computation accordingly.

##### HINT: How did you organize the data in partitions?

#### Change the message key to simplify counting by floor.

In [None]:
## Your Consumer Code Here

In [67]:
consumer_conf = {
    'bootstrap.servers': brokers,
    'key.deserializer': StringDeserializer('utf_8'),
    'value.deserializer': StringDeserializer(),
    'group.id': str(uuid4()),
    'session.timeout.ms': 6000,
    'auto.offset.reset': 'earliest'
}

In [68]:
consumer = DeserializingConsumer(consumer_conf)
consumer.subscribe([topics[0]])

In [69]:
p = SerializingProducer(pconf)


In [None]:
try:
    while True:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            j = json.loads(msg.value().replace("\'", "\""))
            key = {}
            key['floor']=j['floor']
            key['building']=j['building']
            p.produce("floors1", key=str(key), value=msg.value())
            p.poll(0)
            p.flush()
except KeyboardInterrupt:
    sys.stderr.write('%% Aborted by user\n')
finally:
    # Close down consumer to commit final offsets.
    consumer.close()

### Total number of people Per Floor

In [None]:
## Your Consumer Code Here

In [None]:
consumer_conf = {
    'bootstrap.servers': brokers,
    'key.deserializer': StringDeserializer('utf_8'),
    'value.deserializer': IntegerDeserializer(),
    'group.id': str(uuid4()),
    'session.timeout.ms': 6000,
    'auto.offset.reset': 'earliest'
}

In [None]:
consumer = DeserializingConsumer(consumer_conf)
consumer.subscribe([topics[1]])
floors = {}

In [None]:
last = int(time.time()*1000)
time.sleep(4)
try:
    while True:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            if msg.key() not in floors:
                floors[msg.key()] = [msg.value()]
            else:
                floors[msg.key()].append(msg.value())
            print(int(time.time()*1000)-last)
            if(int(time.time()*1000)-last>5000):
                print("sleep")
                last = int(time.time()*1000)
                time.sleep(5)
except KeyboardInterrupt:
    sys.stderr.write('%% Aborted by user\n')
finally:
    # Close down consumer to commit final offsets.
    consumer.close()

In [None]:
floors_avg = {}
for (f) in floors:
    floors_avg[f]=sum(floors[f])

In [None]:
sorted(floors_avg.items())

In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.bar(floors_avg.keys(), floors_avg.values(), color='g')

### Average number of people Per Building

In [None]:
try:
    while True:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            key = msg.key().split(";")
            floor = key[0]+";"+key[1]
            building = key[0]
            print(building)
            print(floor)
            p.produce(topics[2], key=building, value=msg.value()) # building
            p.produce(topics[1], key=floor, value=msg.value()) #floor
            print(floor + " " + building + " " + str(msg.value()))
            p.poll(0)
            p.flush()
except KeyboardInterrupt:
    sys.stderr.write('%% Aborted by user\n')
finally:
    # Close down consumer to commit final offsets.
    consumer.close()

In [None]:
## Your Consumer Code Here

In [None]:
consumer_conf = {
    'bootstrap.servers': brokers,
    'key.deserializer': StringDeserializer('utf_8'),
    'value.deserializer': IntegerDeserializer(),
    'group.id': str(uuid4()),
    'session.timeout.ms': 6000,
    'auto.offset.reset': 'earliest'
}

In [None]:
consumer = DeserializingConsumer(consumer_conf)
consumer.subscribe([topics[2]])
buildings = {}
output = []

In [None]:
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt

In [None]:
last = int(time.time()*1000)
time.sleep(4)
try:
    while True:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            if msg.key() not in floors:
                buildings[msg.key()] = [msg.value()]
            else:
                buildings[msg.key()].append(msg.value())
            new = int(time.time()*1000)
            if(new-last>3000):
                plot = {}
                for b in buildings:
                    plot[b]=sum(buildings[b])
                output.append(plot)
                buildings={}
                last=new
                print("sleep")
                time.sleep(5)     
except KeyboardInterrupt:
    sys.stderr.write('%% Aborted by user\n')
finally:
    # Close down consumer to commit final offsets.
    consumer.close()

In [None]:
output

### BONUS

Plot the measurement using python

## Task 4: Redo Task 2-3 modelling observations using AVRO.

Redo Task 2 Using AVRO for Serializing/Deserializing