# Reddit Spark Streaming Consumer
This notebook receives Reddit posts/comments from a socket, stores them to a Spark table, and computes metrics such as reference counts, TF-IDF top words, and sentiment analysis.

In [None]:
import json
import re
from textblob import TextBlob
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

### Set-up of Spark Stream Consumer and Data Schema structure.
##### See the command to initialize the Spark server inside the code cell.

In [None]:
# Command to run spark server on docker to plug into kernel for running notebook
# docker run -it -p 4040:4040 -p 8080:8080 -p 8081:8081 -p 8888:8888 -p 5432:5432 --cpus=2 --memory=2048m -h spark -w /mnt/host_home/ pyspark_container jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser --allow-root

# Address of the socket the stream is read from.
HOST = 'host.docker.internal'
PORT = 9998

spark = (
    SparkSession.builder
    .appName('RedditConsumer')
    .getOrCreate()
)

# Layout of one incoming JSON record: five string fields plus a numeric
# `created_utc`, listed in the order they appear in the resulting DataFrame.
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in (
        ('type', StringType),
        ('subreddit', StringType),
        ('id', StringType),
        ('text', StringType),
        ('created_utc', DoubleType),
        ('author', StringType),
    )
])

# Streaming DataFrame backed by the socket source.
raw_lines = (
    spark.readStream
    .format('socket')
    .option('port', PORT)
    .option('host', HOST)
    .load()
)

# Parse the `value` column as JSON against `schema` and flatten the parsed
# struct into top-level columns.
json_df = (
    raw_lines
    .select(F.from_json(F.col('value'), schema).alias('data'))
    .select('data.*')
)

### Write data to memory

In [None]:
# Continuously append the parsed records to the in-memory table `raw`,
# which can then be queried with spark.sql('select * from raw').
query_memory = (
    json_df.writeStream
    .queryName('raw')
    .format('memory')
    .outputMode('append')
    .start()
)

### Get reference to users, subreddits and URLs

In [None]:
# Column expressions that collect every user (/u/...), subreddit (/r/...) and
# URL mention found in the `text` column, each as an array of matched strings.
#
# Two details matter for regexp_extract_all:
#   * the pattern is wrapped in F.lit() because a bare Python string in the
#     `regexp` position is resolved as a column name, not as a literal;
#   * the group index is 0 (the whole match) because the default index is 1,
#     which is rejected for patterns that contain no capture group.
user_refs = F.regexp_extract_all(F.col("text"), F.lit(r"/u/[^\s]+"), 0)
subreddit_refs = F.regexp_extract_all(F.col("text"), F.lit(r"/r/[^\s]+"), 0)
url_refs = F.regexp_extract_all(F.col("text"), F.lit(r"https?://[^\s]+"), 0)

### Create dataframes of references on a sliding window basis.

In [None]:
# Count each type of reference per record and tag the counts with a
# `created_ts` timestamp for time-based filtering and aggregation.
#
# Records whose `text` is NULL are dropped first: with Spark's default
# (non-ANSI) settings size() returns -1 for a NULL array, so such records
# would subtract from the windowed sums instead of contributing zero.
refs_df = (json_df
    .where(F.col('text').isNotNull())
    .select(
        F.col('created_utc').cast('timestamp').alias('created_ts'),
        F.size(user_refs).alias('user_ref_count'),
        F.size(subreddit_refs).alias('subreddit_ref_count'),
        F.size(url_refs).alias('url_ref_count')
    )
)

In [None]:
# Total references per sliding window: each window spans 60 seconds and a new
# window starts every 5 seconds. The watermark on `created_ts` is 1 minute.
windowed_refs = (
    refs_df
    .withWatermark('created_ts', '1 minute')
    .groupBy(
        F.window('created_ts', '60 seconds', '5 seconds')
    )
    .sum(
        'user_ref_count',
        'subreddit_ref_count',
        'url_ref_count',
    )
)

In [None]:
# Print the windowed reference counts to the console sink in 'update' mode,
# with column truncation turned off so values are shown in full.
ref_query = (
    windowed_refs.writeStream
    .format('console')
    .option('truncate', False)
    .outputMode('update')
    .start()
)

### Function to compute TF-IDF and find top 10 most important words in the window.

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

def compute_tfidf():
    """Show the 10 words with the highest TF-IDF score in the `raw` table.

    Each row of `raw` is treated as one document. The text is tokenized and
    stop words are removed; every remaining word is then scored as

        tf * log((n_docs + 1) / (doc_freq + 1))

    where `tf` is the word's count inside a document, `doc_freq` is the number
    of documents containing the word and `n_docs` is the number of documents
    (the same smoothing Spark ML's IDF uses). A word's final score is its
    highest score across all documents.

    The scores are computed with plain DataFrame aggregations instead of
    HashingTF/IDF: the ML stages emit a Vector indexed by hash bucket, which
    is not an array and cannot be zipped position-by-position with the word
    list.

    Returns:
        None. The top words are printed with `show`.
    """
    # Tokenizer cannot process NULL text, so drop those rows up front.
    raw_df = spark.sql('select * from raw').where(F.col('text').isNotNull())

    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    filtered = remover.transform(tokenizer.transform(raw_df))

    # Give every document an id so term counts can be taken per document.
    docs = filtered.select(
        F.monotonically_increasing_id().alias('doc_id'),
        'filtered',
    )
    n_docs = docs.count()

    # One row per (document, word) occurrence; consecutive whitespace makes
    # the tokenizer emit empty tokens, which are not real words.
    terms = (docs
        .select('doc_id', F.explode('filtered').alias('word'))
        .where(F.col('word') != ''))

    # Term frequency: occurrences of each word inside each document.
    term_freq = terms.groupBy('doc_id', 'word').agg(F.count(F.lit(1)).alias('tf'))

    # term_freq has exactly one row per (document, word), so the row count per
    # word is its document frequency. IDF is constant per word and never
    # negative, hence max(tf * idf) == max(tf) * idf.
    word_stats = term_freq.groupBy('word').agg(
        F.max('tf').alias('max_tf'),
        F.count(F.lit(1)).alias('doc_freq'),
    )
    idf = F.log(F.lit(float(n_docs + 1)) / (F.col('doc_freq') + 1))
    word_scores = word_stats.select('word', (F.col('max_tf') * idf).alias('score'))

    top_words = word_scores.orderBy(F.desc('score')).limit(10)
    top_words.show(truncate=False)


### TextBlob UDF for sentiment analysis of text.

In [None]:
@udf(returnType=DoubleType())
def sentiment_udf(text):
    """Return TextBlob's sentiment polarity for `text`.

    Missing or empty text is scored as 0.0 without invoking TextBlob.
    """
    if not text:
        return 0.0
    return TextBlob(text).sentiment.polarity

#### Batch Processing of Streaming Data.
- TODO:
    - Requires references in window created previously
    - Requires top 10 words in TF-IDF
    - Write data to processed memory

In [None]:
def _top_tfidf_words(text_df, limit=10):
    """Return a DataFrame of the `limit` highest-scoring TF-IDF words.

    Each row of `text_df` (which must have a non-NULL `text` column) is one
    document. A word's score is its highest per-document
    tf * log((n_docs + 1) / (doc_freq + 1)), the smoothing Spark ML's IDF uses.
    The result has the columns `word` and `score`.
    """
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    filtered = remover.transform(tokenizer.transform(text_df))

    # Give every document an id so term counts can be taken per document.
    docs = filtered.select(
        F.monotonically_increasing_id().alias('doc_id'),
        'filtered',
    )
    n_docs = docs.count()

    # Consecutive whitespace makes the tokenizer emit empty tokens; skip them.
    terms = (docs
        .select('doc_id', F.explode('filtered').alias('word'))
        .where(F.col('word') != ''))
    term_freq = terms.groupBy('doc_id', 'word').agg(F.count(F.lit(1)).alias('tf'))

    # One row per (document, word) in term_freq, so the per-word row count is
    # the document frequency. IDF is constant per word and never negative,
    # hence max(tf * idf) == max(tf) * idf.
    word_stats = term_freq.groupBy('word').agg(
        F.max('tf').alias('max_tf'),
        F.count(F.lit(1)).alias('doc_freq'),
    )
    idf = F.log(F.lit(float(n_docs + 1)) / (F.col('doc_freq') + 1))
    word_scores = word_stats.select('word', (F.col('max_tf') * idf).alias('score'))
    return word_scores.orderBy(F.desc('score')).limit(limit)


def process_batch(batch_df, batch_id=None):
    """Summarise one micro-batch: reference counts and top TF-IDF words.

    Intended as a `foreachBatch` callback. Registers three temp views:
    `current_batch` (the batch itself), `batch_references` (summed user,
    subreddit and URL reference counts) and `batch_tfidf` (top 10 words).

    Args:
        batch_df: DataFrame holding the records of the current micro-batch.
        batch_id: Micro-batch identifier. foreachBatch always passes it as the
            second argument, so the callback has to accept it; it is optional
            so the function can still be called with a DataFrame alone.

    Returns:
        None. The summaries are printed and exposed through the temp views.
    """
    # NOTE(review): the batch is cached and never unpersisted here because the
    # temp views registered below are lazy and may be queried after this
    # function returns - confirm whether an explicit unpersist is wanted.
    batch_df.cache()
    batch_df.createOrReplaceTempView('current_batch')
    count = batch_df.count()
    print(f'Processing batch with {count} records')

    # Records without text carry no references or words. Dropping them also
    # avoids size() returning -1 for a NULL array (Spark's default, non-ANSI
    # behaviour) and the Tokenizer being handed NULL input.
    texts = batch_df.where(F.col('text').isNotNull())
    text_col = F.col('text')

    # The pattern is wrapped in F.lit() because a bare string in the `regexp`
    # position is resolved as a column name; group index 0 selects the whole
    # match (the default index 1 requires a capture group).
    refs = texts.select(
        F.size(F.regexp_extract_all(text_col, F.lit(r'/u/\w+'), 0)).alias('user_refs'),
        F.size(F.regexp_extract_all(text_col, F.lit(r'/r/\w+'), 0)).alias('sub_refs'),
        F.size(F.regexp_extract_all(text_col, F.lit(r'https?://[^\s]+'), 0)).alias('url_refs'),
    )
    refs_summary = refs.groupBy().sum('user_refs', 'sub_refs', 'url_refs')
    refs_summary.show(truncate=False)
    refs_summary.createOrReplaceTempView('batch_references')

    top_words = _top_tfidf_words(texts, limit=10)
    top_words.show(truncate=False)
    top_words.createOrReplaceTempView('batch_tfidf')


In [None]:
# Hand every micro-batch of parsed records to process_batch.
process_query = (
    json_df.writeStream
    .foreachBatch(process_batch)
    .start()
)

### Terminating all streams.

In [None]:
# Block on each streaming query in turn until all of them have terminated.
for stream in (query_memory, ref_query, process_query):
    stream.awaitTermination()