# Spark streaming: Kafka example

This is a simple example of how to run Spark Streaming jobs in Jupyter reading from Kafka.

To run this example, you should first start the Kafka and Publisher provided for project 2.

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import socket
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext("local[2]", "KafkaExample")
print("SparkContext created!")
ssc = StreamingContext(sc, 5)
print("StreamingContext created!")


In [None]:
def to_dict(item):
    event = item[-1].split(',')

    keys = [
        'medallion',
        'hack_license',
        'pickup_datetime',
        'dropoff_datetime',
        'trip_time_in_secs',
        'trip_distance',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'payment_type',
        'fare_amount',
        'surcharge',
        'mta_tax',
        'tip_amount',
        'tolls_amount',
        'total_amount'
    ]

    return {keys[i]: event[i] for i in range(len(keys))}

In [6]:

lines = KafkaUtils.createDirectStream(ssc, ["debs"], \
            {"metadata.broker.list": "kafka:9092"})
# tut query

lines = lines.filter(lambda line : len(line) > 0 ) \
            .map(lambda line: to_dict(line)) \
            .map(lambda line: line['hack_license'] = 1)

lines.pprint()
ssc.start()

-------------------------------------------
Time: 2019-06-09 17:22:50
-------------------------------------------
('2683', '2683C4572D2006A6C542829617FB2D46,597357CE3A65EF16216BBE43043982F1,2013-01-01 00:02:17,2013-01-01 00:04:23,464,1.3,-73.972527,40.761963,-73.975861,40.750797,CRD,7.5,0.5,0.5,1.7,0.0,10.2')
('0FFE', '0FFE15BA6CC105C01B9DB68398518568,23745254EB40809002D44762C43AFB8D,2013-01-01 00:00:23,2013-01-01 00:02:30,574,1.8,-73.993858,40.720547,-73.992256,40.743668,CRD,9.0,0.5,0.5,3.0,0.0,13.0')
('0036', '0036961468659D0BFC7241D92E8ED865,AC46D4686265F12C404BE4A533EEDDDB,2013-01-01 00:00:20,2013-01-01 00:02:31,780,3.76,-74.00872,40.716019,-73.998558,40.764084,CSH,14.0,0.5,0.5,0.0,0.0,15.0')
('027A', '027A9A35F6EEA3047B7DDF386E17CE27,45A5CA2F8C1090B23F61DC0CC71328A5,2013-01-01 00:00:17,2013-01-01 00:02:31,960,4.63,-74.007347,40.73291,-73.97612,40.776356,CRD,17.0,0.5,0.5,1.5,0.0,19.5')
('0399', '0399AE27CC8809B4655FEEB775CB2D86,CC0F01DBEE1E959950FC1CA073F34EC8,2013-01-01 00:00:19,2

In [3]:
ssc.stop()
sc.stop()