In [None]:
from __future__ import print_function, division
import os
import sys 

import glob

# Locate the Spark installation; a KeyError here means SPARK_HOME is not set.
spark_home = os.environ['SPARK_HOME']
spark_python = os.path.join(spark_home, 'python')

# The py4j archive bundled with Spark carries a version number in its file name,
# so look it up instead of pinning a single release. If nothing matches, fall
# back to the previously hard-coded name so sys.path ends up as it did before.
py4j_archives = sorted(glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip')))
if not py4j_archives:
    py4j_archives = [os.path.join(spark_home, 'python/lib/py4j-0.10.4-src.zip')]

# Same precedence as before: py4j archive(s) first, then Spark's python directory.
sys.path.insert(0, spark_python)
for py4j_archive in py4j_archives:
    sys.path.insert(0, py4j_archive)

from pyspark import SparkContext
from pyspark.streaming import StreamingContext


## Create a DStream from a port

In [None]:


# Create a local StreamingContext with two worker threads and a batch interval of 5 seconds.
from pyspark import SparkConf

# Only one SparkContext and one *active* StreamingContext may exist per JVM, so
# re-running this cell must not try to build a second one from scratch: stop any
# streaming context that is still running and reuse the SparkContext if it is alive.
previous_ssc = StreamingContext.getActive()
if previous_ssc is not None:
    previous_ssc.stop(stopSparkContext=False)

conf = SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
sc = SparkContext.getOrCreate(conf)
ssc = StreamingContext(sc, batchDuration=5)

In [None]:
# Open a DStream of text lines read from the TCP endpoint localhost:9999
lines = ssc.socketTextStream(hostname="localhost", port=9999)

In [None]:
# Break every incoming line into its space-separated tokens
words = lines.flatMap(lambda text: text.split(" "))

In [None]:
# Tag every token with a count of one, then sum those counts per word within each batch
pairs = words.map(lambda token: (token, 1))
wordCounts = pairs.reduceByKey(lambda left, right: left + right)

# Write the first ten elements of each batch's result to the console
wordCounts.pprint(num=10)

In [None]:
# install netcat
# !yum install nc

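# In a separate terminal, open a netcat listener on the port and type lines into it
# (the socket stream defined above connects to this port as a client)
# nc -l 9999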

In [None]:
ssc.start()                 # Start the computation
try:
    ssc.awaitTermination()  # Block until the computation terminates or the kernel is interrupted
finally:
    # Always release the streaming context and its SparkContext; otherwise they
    # stay alive after an interrupt and the next section cannot create its own.
    ssc.stop(stopSparkContext=True, stopGraceFully=False)

## Listen to a folder

Create an input stream that monitors a Hadoop-compatible file system for new files and reads them as text files. 

Files must be written to the monitored directory by “moving” them from another location within the same file system. 

File names starting with . are ignored.

In [None]:
!hadoop fs -mkdir /input

In [None]:
# Build the contexts for this section: two local worker threads, 5-second batches.
from pyspark import SparkConf

# Only one SparkContext and one *active* StreamingContext may exist per JVM.
# If an earlier section (or a previous run of this cell) left them running,
# stop the old streaming context and reuse the live SparkContext instead of
# constructing a second one, which would raise a ValueError.
previous_ssc = StreamingContext.getActive()
if previous_ssc is not None:
    previous_ssc.stop(stopSparkContext=False)

conf = SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
sc = SparkContext.getOrCreate(conf)
ssc = StreamingContext(sc, batchDuration=5)

In [None]:
# DStream of text lines read from new files appearing under /input
lines = ssc.textFileStream(directory="/input")

In [None]:
# Tokenise each line on single spaces
words = lines.flatMap(lambda text: text.split(" "))

# Tag every token with a count of one, then add the counts up per word
pairs = words.map(lambda token: (token, 1))
wordCounts = pairs.reduceByKey(lambda left, right: left + right)

# Print the leading entries of each batch's result
wordCounts.pprint()

In [None]:
ssc.start()                 # Start the computation
try:
    ssc.awaitTermination()  # Block until the computation terminates or the kernel is interrupted
finally:
    # Always release the streaming context and its SparkContext; otherwise they
    # stay alive after an interrupt and the next section cannot create its own.
    ssc.stop(stopSparkContext=True, stopGraceFully=False)

## Listen to Kafka

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [None]:
# Build the contexts for this section: two local worker threads, 5-second batches.
from pyspark import SparkConf

# Only one SparkContext and one *active* StreamingContext may exist per JVM.
# If an earlier section (or a previous run of this cell) left them running,
# stop the old streaming context and reuse the live SparkContext instead of
# constructing a second one, which would raise a ValueError.
previous_ssc = StreamingContext.getActive()
if previous_ssc is not None:
    previous_ssc.stop(stopSparkContext=False)

conf = SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
sc = SparkContext.getOrCreate(conf)
ssc = StreamingContext(sc, batchDuration=5)

In [None]:
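# One-time Kafka setup, run from a terminal: create topic t1, then start a
# console producer and type test lines into it to publish them to the topic.
#!kafka-topics --create --topic t1 --zookeeper 0.0.0.0:2181 --partitions 1 --replication-factor 1
#!kafka-console-producer --topic t1 --broker-list 0.0.0.0:9092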

In [None]:
# Kafka broker address and the topic to read from
brokers = "0.0.0.0:9092"
topic = "t1"

# Direct stream over the topic, configured with the broker list
kvs = KafkaUtils.createDirectStream(
    ssc,
    topics=[topic],
    kafkaParams={"metadata.broker.list": brokers},
)

# Keep only the second element of each record (presumably the message value; the first being the key)
lines = kvs.map(lambda record: record[1])

# Per-batch word count: tokenise on spaces, pair each token with 1, sum per word
counts = (
    lines.flatMap(lambda text: text.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
counts.pprint()

 

In [None]:
ssc.start()                 # Start the computation
try:
    ssc.awaitTermination()  # Block until the computation terminates or the kernel is interrupted
finally:
    # Always release the streaming context and its SparkContext; otherwise they
    # stay alive after an interrupt and a re-run cannot create fresh ones.
    ssc.stop(stopSparkContext=True, stopGraceFully=False)