## Event Producer 2 
### Simulating real-time data using Apache Kafka Producer

Required library: geohash
Installation command: pip3 install geohash


First, import all required libraries. 


In [1]:
import pymongo
from pymongo import MongoClient
from datetime import datetime, date
from pprint import pprint
import geohash

Then declare the data type of each column in the input files.

In [2]:
# Column names grouped by the Python type their CSV text is converted to.
integer_attr = [
    'air_temperature_celcius',
    'surface_temperature_celcius',
    'confidence',
]
float_attr = [
    'relative_humidity',
    'windspeed_knots',
    'max_wind_speed',
    'latitude',
    'longitude',
]
# NOTE: the spelling "arrt" is kept on purpose; csv2dict looks this name up.
date_arrt = ['date']
datetime_attr = ['datetime']

The following function reads the CSV data and converts each record into a Python dict, using the column names from the first line as the keys. The resulting dicts are collected in a list, one per record.


In [3]:
def csv2dict(file_path, delim):
    """Read a delimited text file into a list of dicts, one per data row.

    The first non-blank line is taken as the header; its stripped column
    names become the keys of every row dict. Each field is stripped and then
    converted according to the module-level column lists: names in
    ``integer_attr`` go through ``int``, names in ``float_attr`` through
    ``float``, names in ``date_arrt`` are parsed with ``%d/%m/%Y`` and names
    in ``datetime_attr`` with ``%Y-%m-%dT%H:%M:%S``. Every other column is
    kept as a stripped string. Blank lines are skipped, and fields beyond the
    header's length are ignored.

    Args:
        file_path: Path of the file to read.
        delim: Separator handed to ``str.split`` for every line.

    Returns:
        A list of dicts in file order (empty when the file has no data rows).

    Raises:
        IndexError: If a data row has fewer fields than the header.
        ValueError: If a field cannot be converted to its declared type.
    """
    # Imported locally so the function keeps working even if the module-level
    # name `datetime` is later rebound to the `datetime` module.
    from datetime import datetime

    header_names = None
    result = []

    # The context manager closes the file even when a conversion fails.
    with open(file_path) as csv_file:
        for line in csv_file:
            line = line.strip()
            if not line:
                continue

            if header_names is None:
                # Strip the column names once rather than on every access.
                header_names = [name.strip() for name in line.split(delim)]
                continue

            tmp_data = line.split(delim)
            tmp_row = {}
            for i, name in enumerate(header_names):
                raw = tmp_data[i].strip()
                # Cast the text to the type declared for this column.
                if name in integer_attr:
                    tmp_row[name] = int(raw)
                elif name in float_attr:
                    tmp_row[name] = float(raw)
                elif name in date_arrt:
                    tmp_row[name] = datetime.strptime(raw, '%d/%m/%Y')
                elif name in datetime_attr:
                    tmp_row[name] = datetime.strptime(raw, '%Y-%m-%dT%H:%M:%S')
                else:
                    tmp_row[name] = raw

            result.append(tmp_row)

    return result

In [4]:
# Load the AQUA hotspot streaming CSV; despite its name, the result is a list
# of per-record dicts as returned by csv2dict.
hotspot_aqua_streaming = "./hotspot_AQUA_streaming.csv"
hotspot_aqua_streaming_dict = csv2dict(file_path=hotspot_aqua_streaming, delim=',')

In [5]:
# Tag every record with this producer's id and a 5-character geohash of its
# latitude/longitude.
for item in hotspot_aqua_streaming_dict:
    item["sender_id"] = "2"
    item["geo_hash"] = geohash.encode(item["latitude"], item["longitude"], precision=5)

#### Main Function

In [None]:
# import statements
from time import sleep
import json
from kafka import KafkaProducer
import random
import datetime

# Records the publish loop samples from; this is the same list object as
# hotspot_aqua_streaming_dict, not a copy.
lines = hotspot_aqua_streaming_dict

def publish_message(producer_instance, topic_name, key, value):
    """Send one UTF-8 encoded key/value message to a topic and flush it.

    Failures are printed rather than raised, so a caller's publish loop keeps
    running after a failed send.

    Args:
        producer_instance: Object exposing ``send(topic, key=..., value=...)``
            and ``flush()``, such as a ``KafkaProducer``.
        topic_name: Name of the topic to publish to.
        key: Message key as ``str``.
        value: Message payload as ``str``.
    """
    try:
        key_bytes = bytes(key, encoding='utf-8')
        value_bytes = bytes(value, encoding='utf-8')
        producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
        producer_instance.flush()
        # Report the payload that was passed in, not module-level state, so
        # the function works for any caller.
        print('Message published successfully. Data: ' + value)
    except Exception as ex:
        print('Exception in publishing message.')
        print(str(ex))
        
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at 127.0.0.1:9092.

    Returns:
        The new ``KafkaProducer``, or ``None`` when construction raised an
        ``Exception`` (the error is printed instead of being re-raised).
    """
    try:
        return KafkaProducer(bootstrap_servers=['127.0.0.1:9092'],
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        # Returned here rather than from a `finally` clause: a `return` in
        # `finally` would also discard KeyboardInterrupt and SystemExit.
        return None
    
if __name__ == '__main__':
    # Publish a randomly chosen record to the 'fire' topic every 10-30
    # seconds until interrupted.
    topic = 'fire'

    print('Publishing records..')
    producer = connect_kafka_producer()
    if producer is None:
        # The connection error was already printed; looping without a
        # producer would only emit a failure message on every iteration.
        raise SystemExit('Kafka producer unavailable; nothing was published.')
    if not lines:
        raise SystemExit('No records loaded; nothing to publish.')

    try:
        while True:
            # `line` is deliberately a module-level name here.
            line = random.choice(lines)
            # Stamp the record (in place) with the time it is sent.
            line.update({"created_time": str(datetime.datetime.now())})
            publish_message(producer, topic, 'parsed', json.dumps(line))
            # Wait a whole number of seconds in [10, 30] before the next one.
            sleep(random.randrange(10, 31))
    except KeyboardInterrupt:
        print('Stopped publishing.')
    finally:
        # Release the producer's network resources on any exit path.
        producer.close()

Publishing records..
Message published successfully. Data: {"confidence": 50, "latitude": -37.3556, "created_time": "2019-05-24 15:17:48.746432", "geo_hash": "r1rgk", "sender_id": "2", "surface_temperature_celcius": 47, "longitude": 146.1085}
Message published successfully. Data: {"confidence": 79, "latitude": -36.2987, "created_time": "2019-05-24 15:18:16.764223", "geo_hash": "r1s3e", "sender_id": "2", "surface_temperature_celcius": 52, "longitude": 141.1294}
