## Real-time Spark Streaming Predictions

In [1]:
# Import from the public `IPython.display` module; importing `display` from the
# internal `IPython.core.display` module is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Inject a CSS rule that sets the notebook's `.container` element to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
from threading import Thread

class StreamingThread(Thread):
    """Thread that starts a streaming context and waits for it to terminate.

    The context is supplied by the caller and stored on the instance, so the
    thread does not depend on any module-level variable being defined.
    """

    def __init__(self, ssc):
        """Store the streaming context to drive.

        Args:
            ssc: object exposing ``start()``, ``awaitTermination()`` and
                ``stop(stopSparkContext=..., stopGraceFully=...)``.
        """
        Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the stored context and block until it terminates."""
        # Use the context given to the constructor rather than a global named
        # `ssc`, which may be undefined or refer to a different context.
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        """Print a notice and ask the stored context to stop.

        The stop call is made with ``stopSparkContext=False`` and
        ``stopGraceFully=True`` (keyword spelling kept exactly as in the
        original call).
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [3]:
sc

In [4]:
spark

In [5]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [6]:
#From Stream, need to prepare the data for prediction
import sys
import os
import pandas as pd
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# Path of the saved pipeline model to load (alternative noted by the author:
# "nbmodel", the Naive Bayes model).
modelPath = "./models/lrmodel"
# Module-level flag read and set by process(): False until the model has been loaded.
models_loaded = False

def process(time, rdd):
    """Handle one streamed RDD: parse it, build features, predict, and print.

    Args:
        time: value printed in the batch banner.
        rdd: RDD read as JSON records; an empty RDD is skipped without
            printing anything.

    Side effects:
        Prints the parsed batch and a table of prediction columns. The first
        time a non-empty RDD is handled, the pipeline model is loaded from
        ``modelPath`` and kept in the module globals under ``'my_model'``,
        with ``'models_loaded'`` set to True.
    """
    # Skip intervals in which no data arrived.
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Parse the JSON records into a DataFrame and show the raw batch.
    batch_df = spark.read.json(rdd)
    batch_df.show()

    # Map the string label to an integer index.
    # NOTE(review): the indexer is fitted on this batch alone and its labels
    # are later used to decode the model's predictions; presumably this only
    # matches the label order used when the model was trained by coincidence
    # -- confirm against the training notebook.
    label_model = StringIndexer(inputCol="label", outputCol="label_idx").fit(batch_df)
    batch_df = label_model.transform(batch_df)

    # Tokenize both text versions on whitespace, then drop stop words.
    # (The original author noted '\\W' as an alternative pattern for the old text.)
    text_stages = [
        RegexTokenizer(inputCol='text_new', outputCol='words_new', pattern='\\s+'),
        RegexTokenizer(inputCol='text_old', outputCol='words_old', pattern='\\s+'),
        StopWordsRemover(inputCol='words_new', outputCol='terms_new'),
        StopWordsRemover(inputCol='words_old', outputCol='terms_old'),
    ]
    for stage in text_stages:
        batch_df = stage.transform(batch_df)

    def _terms_only_in_new(new_terms, old_terms):
        # Distinct terms present in the new text but not in the old one.
        return list(set(new_terms) - set(old_terms))

    diff_udf = udf(_terms_only_in_new, ArrayType(StringType()))
    batch_df = batch_df.withColumn('diff', diff_udf('terms_new', 'terms_old'))

    # Load the model once and reuse it for every later batch.
    state = globals()
    if not state['models_loaded']:
        state['my_model'] = PipelineModel.load(modelPath)
        state['models_loaded'] = True

    # Predict with the cached model.
    scored_df = state['my_model'].transform(batch_df)

    # Turn the numeric prediction back into a label string and print the results.
    to_label = IndexToString(
        inputCol="prediction",
        outputCol="predLabel",
        labels=label_model.labels,
    )
    scored_df = to_label.transform(scored_df)
    scored_df.select(
        'label', 'diff', 'features', 'rawPrediction', 'probability', 'predLabel'
    ).show()
    

In [7]:
# Streaming context on the existing SparkContext, with a batch duration of 10.
ssc = StreamingContext(sc, batchDuration=10)

In [8]:
# Read text lines from the remote socket and hand every RDD to process().
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(func=process)

In [10]:
# Wrap the streaming context in a StreamingThread and launch it; the thread's
# run() starts the context and waits for it to terminate.
ssc_t = StreamingThread(ssc)
ssc_t.start() # may take some time to start up 

In [11]:
# Ask the streaming context to stop (StreamingThread.stop passes
# stopSparkContext=False, so the SparkContext itself is not stopped).
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+-------+------+------------+--------------------+--------------------+-----------+--------------------+
|comment| label|   name_user|            text_new|            text_old| title_page|            url_page|
+-------+------+------------+--------------------+--------------------+-----------+--------------------+
|       |unsafe|46.7.158.182|{{Automatic taxob...|{{Automatic taxob...|Potter wasp|//en.wikipedia.or...|
+-------+------+------------+--------------------+--------------------+-----------+--------------------+

+------+--------------+--------------------+--------------------+--------------------+---------+
| label|          diff|            features|       rawPrediction|         probability|predLabel|
+------+--------------+--------------------+--------------------+--------------------+---------+
|unsafe|[ass, '''lick]|(1024,[172,834],[...|[2.04575195654470...|[0.83669357967945...|   unsafe|
+------+--------------+-----------