In [1]:
# Stretch the notebook's `.container` element to 95% of the browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
from threading import Thread

class StreamingThread(Thread):
    """Thread that starts a streaming context and blocks until it terminates.

    Args:
        ssc: the streaming context to drive. It must provide ``start()``,
            ``awaitTermination()`` and ``stop(stopSparkContext, stopGraceFully)``.
    """

    def __init__(self, ssc):
        super().__init__()
        self.ssc = ssc

    def run(self):
        """Start the wrapped context and wait for it to terminate."""
        # Use the context given to the constructor. Relying on a global named
        # `ssc` would ignore the argument and fail when no such global exists.
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        """Request a graceful stop of the streaming context, keeping the SparkContext alive."""
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [3]:
import os
# Set PYSPARK_PYTHON for this process; this cell runs before the SparkContext
# is created in the next cell.
# NOTE(review): presumably this selects the Python interpreter PySpark launches
# for its workers — confirm the path exists on every machine that runs this.
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"

In [4]:
from pyspark import SparkConf, SparkContext

# `getOrCreate` returns the already-running context when this cell is executed
# again; calling the constructor a second time fails because only one
# SparkContext may be active at a time.
sc = SparkContext.getOrCreate(SparkConf().setAppName("Streaming predictions"))
sc

In [5]:
from pyspark.sql import SparkSession
# Wrap the existing SparkContext `sc` in a SparkSession; `spark` is what
# `process` later uses to turn each batch into a DataFrame.
spark = SparkSession(sc)
# Bare expression so the notebook displays the session.
spark

In [6]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [7]:
from difflib import unified_diff

def make_diff(old, new):
    """Return the whitespace-separated tokens added to and removed from a text.

    Both texts are split on whitespace and compared with ``unified_diff``.

    Args:
        old: previous version of the text; ``None`` is treated as empty.
        new: current version of the text; ``None`` is treated as empty.

    Returns:
        A tuple ``(additions, deletions)`` of space-joined token strings. Both
        are empty strings when the texts contain the same tokens.

    NOTE(review): earlier versions leaked the diff header into both results
    ("++ \\n" / "-- \\n"). Models fitted on features derived from that output may
    need to be retrained — confirm against the training notebook.
    """
    old_tokens = (old or '').split()
    new_tokens = (new or '').split()

    additions = []
    deletions = []
    for index, line in enumerate(unified_diff(old_tokens, new_tokens)):
        # The first two lines are the "--- " / "+++ " file header. They start
        # with '-' / '+' but are not part of the diff, so skip them by position
        # (a prefix test would also drop real tokens such as "--flag").
        if index < 2:
            continue
        if line.startswith('+'):
            additions.append(line[1:])
        elif line.startswith('-'):
            deletions.append(line[1:])
    return (' '.join(additions), ' '.join(deletions))

In [8]:
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, lower

# Read one profanity per line. The `with` block closes the file handle, and
# blank lines are dropped: an empty entry would make `str.count("")` return
# len(text) + 1 and inflate every profanity count.
# NOTE(review): if the saved models were fitted with blank entries in this list,
# the `profanities` feature changes — confirm against the training notebook.
with open("/home/jimmy/Documents/courses/spark/profanities.txt", "r") as profanities_file:
    profanities = [word for word in (line.strip() for line in profanities_file) if word]
# Rebind the name to a broadcast variable; UDFs read the list via `profanities.value`.
profanities = sc.broadcast(profanities)

@udf("string")
def additions(old, new):
    """Spark UDF returning the first element of ``make_diff(old, new)`` (the added text)."""
    return make_diff(old, new)[0]

@udf("string")
def deletions(old, new):
    """Spark UDF returning the second element of ``make_diff(old, new)`` (the removed text)."""
    return make_diff(old, new)[1]

@udf("long")
def longest_same_character_sequence(additions):
    """Spark UDF: length of the longest run of one repeated character.

    Returns 0 when ``additions`` is empty.
    """
    best = 0
    run_length = 0
    last_char = None
    for char in additions:
        # Extend the current run on a repeat, otherwise start a new run of 1.
        run_length = run_length + 1 if char == last_char else 1
        best = max(best, run_length)
        last_char = char
    return best

@udf("long")
def count_profanities(additions):
    """Spark UDF: total number of occurrences of the broadcast profanities in ``additions``.

    Each entry of ``profanities.value`` is counted with ``str.count``, i.e. as a
    non-overlapping substring match.
    """
    return sum(additions.count(word) for word in profanities.value)

def process_dataframe(df):
    """Add the diff-based feature columns to ``df``.

    Reads ``text_old`` and ``text_new``; adds lower-cased ``additions`` and
    ``deletions``, drops ``text_old``, then adds ``profanities`` and
    ``longest_same_character_sequence``, both computed from ``additions``.
    """
    with_diffs = (
        df
        .withColumn("additions", lower(additions("text_old", "text_new")))
        .withColumn("deletions", lower(deletions("text_old", "text_new")))
        .drop("text_old")
    )
    return (
        with_diffs
        .withColumn("profanities", count_profanities("additions"))
        .withColumn("longest_same_character_sequence", longest_same_character_sequence("additions"))
    )

In [9]:
from pyspark.ml.feature import VectorAssembler

# Flag recording whether the saved models have been loaded yet.
models_loaded = False

# Text columns, followed by the full list of columns assembled into `features`.
text_columns = ['text_new', 'additions', 'deletions']
input_columns = [*text_columns, 'profanities', 'longest_same_character_sequence']
pipelines = {}

# Combines `input_columns` into a single "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=input_columns)

MODEL_DIR = '/home/jimmy/Documents/courses/spark/notebooks'


def _load_models():
    """Load every saved model from MODEL_DIR and return them keyed by directory name."""
    from pyspark.ml import PipelineModel
    from pyspark.ml.feature import StringIndexerModel

    loaded = {
        f'pipeline_{column}': PipelineModel.load(f'{MODEL_DIR}/pipeline_{column}')
        for column in text_columns
    }
    loaded['pipeline_label'] = StringIndexerModel.load(f'{MODEL_DIR}/pipeline_label')
    loaded['pipeline_classifier'] = PipelineModel.load(f'{MODEL_DIR}/pipeline_classifier')
    return loaded


def process(time, rdd):
    """Score one micro-batch and print the input and the predictions.

    Args:
        time: value identifying the batch; only used in the printed banner.
        rdd: RDD of JSON strings for this batch. Empty batches are skipped.

    The saved models are loaded on the first non-empty batch and cached in the
    module-level ``pipelines`` dict (keyed by their directory name) instead of
    in dynamically named ``globals()`` entries.
    """
    global models_loaded

    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Convert to data frame
    raw_df = spark.read.json(rdd)
    raw_df.show()

    features_df = process_dataframe(raw_df)

    # Load the models once. `_load_models` returns a complete dict, so a failed
    # load leaves `pipelines` untouched and is retried on the next batch.
    if not models_loaded:
        pipelines.update(_load_models())
        models_loaded = True

    # Transform each text column with its pipeline. The loop variable is named
    # `column` so it does not reuse the name of the imported pyspark `col`.
    # NOTE(review): temp1..temp4 are presumably intermediate columns produced by
    # the saved pipelines, with temp4 the final one — confirm.
    for column in text_columns:
        features_df = (
            pipelines[f'pipeline_{column}']
            .transform(features_df)
            .drop(column, 'temp1', 'temp2', 'temp3')
            .withColumnRenamed('temp4', column)
        )
    features_df = pipelines['pipeline_label'].transform(features_df)

    features_df = assembler.transform(features_df)

    # And then predict using the loaded model:
    predictions_df = pipelines['pipeline_classifier'].transform(features_df)
    predictions_df.show()

In [10]:
# Batch duration passed to StreamingContext, in seconds.
BATCH_INTERVAL_SECONDS = 10
ssc = StreamingContext(sc, BATCH_INTERVAL_SECONDS)

In [11]:
# Text stream read from a TCP socket at seppe.net:7778.
lines = ssc.socketTextStream("seppe.net", 7778)
# Call `process(time, rdd)` for every batch of the stream.
lines.foreachRDD(process)

In [12]:
# Run the streaming context on a separate thread (StreamingThread.run starts it
# and waits for termination), so this cell does not block on the stream.
ssc_t = StreamingThread(ssc)
ssc_t.start()

+---------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|
+---------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|(unwanted edit)|unsafe|2001:bb6:a20:e558...|{{more footnotes|...|{{more footnotes|...|Clontarf and Hill...|//en.wikipedia.or...|
|               |  safe|           Genarians|{{Infobox person
...|{{Infobox person
...|    Micheline Presle|//en.wikipedia.or...|
+---------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+

+---------------+------+--------------------+--------------------+--------------------+-----------+-------------------------------+--------------------+--------------------+--------------------+------+------------------

+--------------------+------+-------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|    name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+-------------+--------------------+--------------------+--------------------+--------------------+
|                    |vandal|97.124.210.66|{{Infobox televis...|{{Infobox televis...|The Stranger (202...|//en.wikipedia.or...|
|                    |vandal|97.124.210.66|{{multiple issues...|{{multiple issues...|List of assets ow...|//en.wikipedia.or...|
|(Re-arranged endo...|unsafe|    Smart1965|{{Use dmy dates|d...|{{Use dmy dates|d...|2020 Conservative...|//en.wikipedia.or...|
|                    |  safe|    Genarians|{{Infobox person
...|{{Infobox person
...|      Bert I. Gordon|//en.wikipedia.or...|
+--------------------+------+-------------+--------------------+--------------------+-------------------

In [None]:
# Stop the stream via StreamingThread.stop, which leaves the SparkContext running.
ssc_t.stop()