In [22]:
import json
import uuid
import heapq
import time
from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin.new_topic import NewTopic
from kafka.errors import TopicAlreadyExistsError

### Configuration Parameters 

> **TODO:** Change the configuration parameters to the appropriate values for your setup.

In [3]:
# Broker address plus the name parts used to namespace this user's Kafka resources.
config = {
    'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'],
    'first_name': 'Gabriel',
    'last_name': 'Avinaz',
}

# The client id and the topic prefix share the same "<last_name><first_name>" value.
for derived_key in ('client_id', 'topic_prefix'):
    config[derived_key] = config['last_name'] + config['first_name']

config

{'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'],
 'first_name': 'Gabriel',
 'last_name': 'Avinaz',
 'client_id': 'AvinazGabriel',
 'topic_prefix': 'AvinazGabriel'}

### Create Topic Utility Function

The `create_kafka_topic` function helps create a Kafka topic based on your configuration settings. For instance, if your first name is *John* and your last name is *Doe*, `create_kafka_topic('locations')` will create a topic with the name `DoeJohn-locations`. The function will not create the topic if it already exists.

In [4]:
def create_kafka_topic(topic_name, config=config, num_partitions=1, replication_factor=1):
    """Create the Kafka topic ``<topic_prefix>-<topic_name>`` if it does not exist.

    Args:
        topic_name: Unprefixed topic name, e.g. ``'locations'``.
        config: Mapping providing ``bootstrap_servers``, ``client_id`` and
            ``topic_prefix``.
        num_partitions: Number of partitions requested for the new topic.
        replication_factor: Replication factor requested for the new topic.

    Prints whether the topic was created or already existed. Any error other
    than ``TopicAlreadyExistsError`` propagates to the caller.
    """
    bootstrap_servers = config['bootstrap_servers']
    client_id = config['client_id']
    topic_prefix = config['topic_prefix']
    name = '{}-{}'.format(topic_prefix, topic_name)

    admin_client = KafkaAdminClient(
        bootstrap_servers=bootstrap_servers,
        client_id=client_id
    )

    try:
        topic = NewTopic(
            name=name,
            num_partitions=num_partitions,
            replication_factor=replication_factor
        )
        admin_client.create_topics(new_topics=[topic])
        print('Created topic "{}"'.format(name))
    except TopicAlreadyExistsError:
        print('Topic "{}" already exists'.format(name))
    finally:
        # Release the admin connection on every path; the original never closed it,
        # so each call left a client connected to the cluster.
        admin_client.close()
    
# Make sure both topics used by this notebook exist before producing to them.
for unprefixed_topic in ('locations', 'accelerations'):
    create_kafka_topic(unprefixed_topic)

Topic "AvinazGabriel-locations" already exists
Topic "AvinazGabriel-accelerations" already exists


### Kafka Producer

The following code creates a `KafkaProducer` object which you can use to send Python objects that are serialized as JSON.

**Note:** This producer serializes Python objects as JSON. This means that every object you send must be JSON serializable. As an example, Python `datetime` values are not JSON serializable and must be converted to a string (e.g. ISO 8601) or a numeric value (e.g. a Unix timestamp) before being sent.

In [5]:
def _serialize_json(value):
    """Return `value` encoded as UTF-8 JSON bytes."""
    return json.dumps(value).encode('utf-8')


# Producer shared by the cells below; every message value passes through the JSON serializer.
producer = KafkaProducer(
    value_serializer=_serialize_json,
    bootstrap_servers=config['bootstrap_servers'],
)

### Send Data Function

The `send_data` function sends a Python object to a Kafka topic. This function adds the `topic_prefix` to the topic so `send_data('locations', data)` sends a JSON serialized message to `DoeJohn-locations`. The function also registers callbacks to let you know if the message has been sent or if an error has occurred.

In [6]:
def on_send_success(record_metadata):
    """Print the topic, partition and offset of a successfully sent message.

    Args:
        record_metadata: Object exposing ``topic``, ``partition`` and
            ``offset`` attributes.
    """
    report_lines = [
        'Message sent:',
        '    Topic: "{}"'.format(record_metadata.topic),
        '    Partition: {}'.format(record_metadata.partition),
        '    Offset: {}'.format(record_metadata.offset),
    ]
    print('\n'.join(report_lines))
    
def on_send_error(excp):
    """Report a failed send by printing the exception.

    Args:
        excp: The exception describing why the send failed.

    The original passed ``exc_info=excp`` to ``print``. That keyword belongs to
    the ``logging`` API, so the errback itself raised ``TypeError`` and the real
    failure was never shown.
    """
    print('Error sending message: {!r}'.format(excp))

def send_data(topic, data, config=config, producer=producer, msg_key=None):
    """Send `data` to the prefixed topic and register the result callbacks.

    Args:
        topic: Unprefixed topic name; ``config['topic_prefix']`` and a dash
            are prepended to it.
        data: Message value handed to the producer.
        config: Mapping providing ``topic_prefix``.
        producer: Producer whose ``send`` method is used.
        msg_key: Optional string key; a random UUID hex string is used when
            it is ``None``.
    """
    full_topic = '{}-{}'.format(config['topic_prefix'], topic)

    # Fall back to a random key when the caller does not supply one.
    key = uuid.uuid4().hex if msg_key is None else msg_key

    pending = producer.send(full_topic, value=data, key=key.encode('utf-8'))
    pending.add_callback(on_send_success).add_errback(on_send_error)

In [7]:
# Base directory for the input data (a relative path); the Spark read cell
# below appends "locations/" to it.
data_dir = "../../../data/processed/bdd/"
# SparkSession is used by the read cell below.
# NOTE(review): to_json and the pyspark.sql.types names are not used in any
# visible cell — confirm before removing.
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType



In [15]:
# Start (or reuse) the Spark session used to read the parquet input.
spark = (
    SparkSession.builder
    .appName("ParquetToLocations")
    .getOrCreate()
)

# Load the location records and order them by the "t" column in one expression.
locations_path = data_dir + "locations/"
locations_df = (
    spark.read
    .format("parquet")
    .load(locations_path)
    .sort("t")
)
locations_df.show(10)

+--------------------+--------------------+--------------------+--------------------+------------------+-------------+------------------+------------------+------------+------------------+--------+---------+--------------------+----+
|                  id|             ride_id|                uuid|           timestamp|            offset|       course|          latitude|         longitude|     geohash|             speed|accuracy|timelapse|            filename|   t|
+--------------------+--------------------+--------------------+--------------------+------------------+-------------+------------------+------------------+------------+------------------+--------+---------+--------------------+----+
|85c61911b7fe2ced1...|6760ffa3f41908695...|dad7eae44e784b549...|1970-01-01 00:25:...|1.0779125295566454|   158.203125|   40.677641336844|-73.81793000742218|dr5x2jpkmtcy| 2.119999885559082|    10.0|    false|d745b92f-aefd-467...| 0.0|
|58682c5d48cad9d9e...|c9a2b46c9aa515b63...|19b9aa10588646b3b...|

In [20]:
# Collect the "t" column to the driver as a plain Python list (not a DataFrame,
# despite the name).
t_rows = locations_df.select("t").collect()
t_locations_df = [t_row.t for t_row in t_rows]
t_locations_df[:10]

[0.0, 0.0, 4.5, 4.5, 7.8, 7.8, 10.6, 10.6, 10.6, 14.9]

In [21]:
# Turn each row into a JSON string, then collect the strings to the driver.
json_rdd = locations_df.toJSON()
locations_json = json_rdd.collect()
locations_json[:10]

['{"id":"85c61911b7fe2ced1000c33c9e932706","ride_id":"6760ffa3f41908695d1405b776c3e8d5","uuid":"dad7eae44e784b549c8c5a3aa051a8c7","timestamp":"1970-01-01T00:25:07.320Z","offset":1.0779125295566454,"course":158.203125,"latitude":40.677641336844,"longitude":-73.81793000742218,"geohash":"dr5x2jpkmtcy","speed":2.119999885559082,"accuracy":10.0,"timelapse":false,"filename":"d745b92f-aefd-467d-9121-7a71308e8d6d.mov","t":0.0}',
 '{"id":"58682c5d48cad9d9e103431d773615bf","ride_id":"c9a2b46c9aa515b632eddc45c4868482","uuid":"19b9aa10588646b3bf22c9b4865a7995","timestamp":"1970-01-01T00:25:03.882Z","offset":1.525060886522843,"course":299.619140625,"latitude":40.76287002542555,"longitude":-73.96194855681718,"geohash":"dr5ruuwscttz","speed":0.0,"accuracy":10.0,"timelapse":false,"filename":"e2f795a7-6a7d-4500-b5d7-4569de996811.mov","t":0.0}',
 '{"id":"85c61911b7fe2ced1000c33c9e932706","ride_id":"6760ffa3f41908695d1405b776c3e8d5","uuid":"dad7eae44e784b549c8c5a3aa051a8c7","timestamp":"1970-01-01T00:25:

In [None]:
# Publish every location record right away, with no pacing between messages.
for record_json in locations_json:
    # Decode first so the producer's JSON serializer receives a dict, not a string.
    record = json.loads(record_json)
    send_data("locations", record)

In [43]:
# Schedule each record at (replay start + its "t" offset). The clock is read
# once so that every due time shares the same base; reading it per row lets
# the base drift while the heap is being built.
replay_start = time.time()
heap = [
    (replay_start + t_offset, record_json)
    for record_json, t_offset in zip(locations_json, t_locations_df)
]
# Entries are (due_time, json_string) tuples; ties on due_time fall back to
# comparing the strings.
heapq.heapify(heap)


In [45]:
# Replay the scheduled records in due-time order.
while heap:
    # Peek at the earliest entry without removing it.
    due_time, record_json = heap[0]

    # Sleep exactly until the entry is due. The original polled every 0.1 s and
    # also slept after each send, so records sharing a timestamp fell
    # progressively behind schedule.
    wait = due_time - time.time()
    if wait > 0:
        time.sleep(wait)

    heapq.heappop(heap)
    # Decode before sending: the producer's serializer applies json.dumps, so
    # passing the raw JSON string would publish a double-encoded string.
    send_data('locations', json.loads(record_json))

Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 479
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 480
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 481
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 482
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 483
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 484
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 485
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 486
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 487
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 488
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 489
Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offs

KeyboardInterrupt: 

In [41]:
# Drain the heap in due-time order, printing each record when it becomes due.
while heap:
    # Take the entry with the smallest due time.
    time_value, object_value = heapq.heappop(heap)

    # Compare against the live clock. The original used `current_time`, which
    # no visible cell defines, so the loop fails on a fresh kernel.
    time_diff = time_value - time.time()

    # Wait only when the entry is still in the future. Overdue entries are
    # printed immediately instead of being skipped; with a live clock, skipping
    # would drop every record that shares a due time with the one before it.
    if time_diff > 0:
        time.sleep(time_diff)
    print(object_value)

In [6]:
# Minimal JSON-serializable payload for a manual send.
example_data = {'key1': 'value1', 'key2': 'value2'}

send_data('locations', example_data)

Message sent:
    Topic: "AvinazGabriel-locations"
    Partition: 0
    Offset: 0
