In [None]:
from __future__ import print_function, division
import os
import sys 

# Locate the Spark installation and expose its bundled Python packages.
# SPARK_HOME must be set; a missing variable raises KeyError here.
spark_home = os.environ['SPARK_HOME']
spark_python_dir = os.path.join(spark_home, 'python')
sys.path.insert(0, spark_python_dir)

# The py4j archive name embeds a version number that differs between Spark
# releases, so look it up instead of pinning a single version.
py4j_lib_dir = os.path.join(spark_python_dir, 'lib')
py4j_archives = []
if os.path.isdir(py4j_lib_dir):
    py4j_archives = sorted(
        entry for entry in os.listdir(py4j_lib_dir)
        if entry.startswith('py4j-') and entry.endswith('-src.zip')
    )
# Fall back to the previously hard-coded archive name when nothing is found,
# so the resulting sys.path entry matches the old behaviour in that case.
py4j_archive = py4j_archives[-1] if py4j_archives else 'py4j-0.10.4-src.zip'
sys.path.insert(0, os.path.join(py4j_lib_dir, py4j_archive))

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [None]:
from pyspark.sql import Row, SparkSession

In [None]:
# Spark context running against the "local[2]" master for this application.
sc = SparkContext(master="local[2]", appName="NetworkWordCount")
# Streaming context layered on that Spark context, with a batch duration of 5.
ssc = StreamingContext(sparkContext=sc, batchDuration=5)

In [None]:
# Text stream read from a TCP socket on localhost:9999.
lines = ssc.socketTextStream(hostname='localhost', port=9999)

# Split every incoming line on single spaces and flatten the pieces into one stream.
words = lines.flatMap(lambda text: text.split(" "))

# Register console printing for the contents of each batch of words.
words.pprint()

In [None]:
def process(time, rdd):
    """Count the words of one streaming batch with Spark SQL and print the result.

    Args:
        time: Batch time passed in by ``foreachRDD``; only used for the header line.
        rdd: RDD of word strings belonging to that batch.

    Returns:
        None. The header and the word-count table are written to standard output.

    Empty batches are skipped after the header is printed. Any exception raised
    while processing a non-empty batch is printed instead of propagated, so a
    single failing batch does not abort the streaming job.
    """
    print("========= %s =========" % str(time))

    # createDataFrame infers the schema from the first row, which fails on an
    # empty RDD; without this guard every idle interval would print an error.
    if rdd.isEmpty():
        return

    def getSparkSessionInstance(sparkConf):
        """Return the SparkSession cached in globals(), creating it on first use."""
        if 'sparkSessionSingletonInstance' not in globals():
            globals()['sparkSessionSingletonInstance'] = (
                SparkSession
                .builder
                .config(conf=sparkConf)
                .getOrCreate()
            )
        return globals()['sparkSessionSingletonInstance']

    try:
        # Reuse one SparkSession across batches, built from the RDD's own configuration.
        spark = getSparkSessionInstance(rdd.context.getConf())

        # Convert RDD[String] to RDD[Row] to DataFrame
        rowRdd = rdd.map(lambda w: Row(word=w))
        wordsDataFrame = spark.createDataFrame(rowRdd)

        # Expose the batch as a temporary view so it can be queried with SQL.
        wordsDataFrame.createOrReplaceTempView("words")

        # Do word count on table using SQL and print it
        wordCountsDataFrame = spark.sql("select word, count(*) as total from words group by word")
        wordCountsDataFrame.show()
    except Exception as e:
        # Deliberately best-effort: report the failure and keep the stream alive.
        print(e)

In [None]:
# Register `process` as the per-batch callback for the RDDs of the `words` stream.
words.foreachRDD(process)

In [None]:
# Start the streaming computation and block until it terminates.
ssc.start()
try:
    ssc.awaitTermination()
finally:
    # Interrupting the kernel (or a failure) would otherwise leave the streaming
    # job running in the background; stop it but keep the SparkContext alive.
    ssc.stop(stopSparkContext=False)