In [1]:
import json
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, from_unixtime
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

In [None]:
# Initialize Spark
spark_conf = SparkConf().setAppName('RedditPipeline')
ss1 = SparkSession.builder.config(conf=spark_conf).getOrCreate()

# Command to run spark server on docker to plug into kernel for running notebook
# docker run -it -p 4040:4040 -p 8080:8080 -p 8081:8081 -p 8888:8888 -p 5432:5432 --cpus=2 --memory=2048m -h spark -w /mnt/host_home/ pyspark_container jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser --allow-root 

# Configuration
RAW_CHECKPOINT_PATH = './checkpoints/raw'
RAW_PATH = './data/raw'
METRICS_CHECKPOINT_PATH = './checkpoints/metrics'
METRICS_PATH = './data/metrics'

HOST = "host.docker.internal" # change to '127.0.0.1' if not using Docker
PORT = '9998'

USERS_REGEX = r'/u/([a-zA-Z0-9_-]+)' 
SUBREDDITS_REGEX = r'/r/([a-zA-Z0-9_-]+)' 
URLS_REGEX = r'(https?://[^\s]+)'

print(f"→ Connecting to producer at {HOST}:{PORT}")

# Define schema
schema = StructType([
    StructField('type', StringType(), True),
    StructField('subreddit', StringType(), True),
    StructField('id', StringType(), True),
    StructField('text', StringType(), True),
    StructField('created_utc', DoubleType(), True),
    StructField('author', StringType(), True)
])


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/08 22:05:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


→ Connecting to producer at host.docker.internal:9998


### Function to extract references

In [None]:
def extract_references(batch):
    """
    Extract references of users, subreddits and URLs from the text of each post or comment.
    """
    references = batch \
        .withColumn('users', F.regexp_extract_all(F.col('text'), F.lit(USERS_REGEX), 1)) \
        .withColumn('subreddits', F.regexp_extract_all(F.col('text'), F.lit(SUBREDDITS_REGEX), 1)) \
        .withColumn('urls', F.regexp_extract_all(F.col('text'), F.lit(URLS_REGEX), 1))

    references = references.withWatermark('created_utc', '1 minute') \
        .groupBy(F.window(F.col("timestamp"), windowDuration='60 seconds', slideDuration='5 seconds')) \
        .agg(
            F.collect_list('users').alias('users'),
            F.collect_list('subreddits').alias('subreddits'),
            F.collect_list('urls').alias('urls'),
            F.sum(F.size('users')).alias('users_count'),
            F.sum(F.size('subreddits')).alias('subreddits_count'),
            F.sum(F.size('urls')).alias('urls_count')
        )
    
    return references
        

### Custom batch processing function

In [4]:
def process_batch(batch_df, batch_id):
    print(f"==== Processing batch {batch_id} ====")

    # create temporary table for raw data and save it to disk
    batch_df.createOrReplaceTempView("raw")
    batch_df.write.mode("append").json(RAW_PATH)

    # convert created_utc to timestamp
    batch_df = batch_df.withColumn("timestamp", from_unixtime(col("created_utc")).cast("timestamp"))

    # TODO: Implement the references in a window of 60 seconds with a sliding window of 5 seconds
    references_df = extract_references(batch_df)
    references_df.show(5, truncate=False)
    # TODO: implement tf-idf to find the top 10 most relevant words in the text
    # TODO: perform sentiment analysis on the text and add a column with the sentiment score
    
    # create temporary table for metrics and save it to disk
    batch_df.createOrReplaceTempView("metrics")
    batch_df.write.mode("append").json(METRICS_PATH)

    batch_df.write \
        .mode("append") \
        .json(METRICS_PATH)



### Stream data from Producer into Consumer

In [5]:
# Read from socket without schema (will get string data)
streaming_df = ss1.readStream \
	.format("socket") \
	.option("host", HOST) \
	.option("port", PORT) \
	.load()
# Parse the JSON string and apply schema after loading

streaming_df = streaming_df.select(F.from_json(F.col("value"), schema).alias("data")).select("data.*")

25/06/08 22:05:50 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


### Start processing stream in batches

In [None]:

streaming_df.writeStream \
	.foreachBatch(process_batch) \
    .option("checkpointLocation", METRICS_CHECKPOINT_PATH) \
    .start()

25/06/08 22:05:53 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0xffc8c9807450>

==== Processing batch 0 ====


                                                                                

+------+-----+----------+----+
|window|users|subreddits|urls|
+------+-----+----------+----+
+------+-----+----------+----+

==== Processing batch 1 ====


                                                                                

+------------------------------------------+--------+----------+--------+
|window                                    |users   |subreddits|urls    |
+------------------------------------------+--------+----------+--------+
|{2025-06-08 18:50:35, 2025-06-08 18:51:35}|[[]]    |[[]]      |[[]]    |
|{2025-06-08 19:08:15, 2025-06-08 19:09:15}|[[]]    |[[]]      |[[]]    |
|{2025-06-08 19:57:05, 2025-06-08 19:58:05}|[[]]    |[[]]      |[[]]    |
|{2025-06-08 20:49:30, 2025-06-08 20:50:30}|[[], []]|[[], []]  |[[], []]|
|{2025-06-08 19:24:50, 2025-06-08 19:25:50}|[[]]    |[[]]      |[[]]    |
+------------------------------------------+--------+----------+--------+
only showing top 5 rows

==== Processing batch 2 ====
+------------------------------------------+------------+------------+------------+
|window                                    |users       |subreddits  |urls        |
+------------------------------------------+------------+------------+------------+
|{2025-06-08 21:29:15, 2025-

                                                                                

+------------------------------------------+-----+----------+----+
|window                                    |users|subreddits|urls|
+------------------------------------------+-----+----------+----+
|{2025-06-08 22:08:55, 2025-06-08 22:09:55}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:08:40, 2025-06-08 22:09:40}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:08:15, 2025-06-08 22:09:15}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:08:10, 2025-06-08 22:09:10}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:08:45, 2025-06-08 22:09:45}|[[]] |[[]]      |[[]]|
+------------------------------------------+-----+----------+----+
only showing top 5 rows

==== Processing batch 15 ====


                                                                                

+------------------------------------------+-----+----------+----+
|window                                    |users|subreddits|urls|
+------------------------------------------+-----+----------+----+
|{2025-06-08 22:10:15, 2025-06-08 22:11:15}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:09:45, 2025-06-08 22:10:45}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:09:35, 2025-06-08 22:10:35}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:10:20, 2025-06-08 22:11:20}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:10:25, 2025-06-08 22:11:25}|[[]] |[[]]      |[[]]|
+------------------------------------------+-----+----------+----+
only showing top 5 rows

==== Processing batch 16 ====


                                                                                

+------------------------------------------+-----+----------+----+
|window                                    |users|subreddits|urls|
+------------------------------------------+-----+----------+----+
|{2025-06-08 22:12:35, 2025-06-08 22:13:35}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:12:05, 2025-06-08 22:13:05}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:11:55, 2025-06-08 22:12:55}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:12:20, 2025-06-08 22:13:20}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:12:10, 2025-06-08 22:13:10}|[[]] |[[]]      |[[]]|
+------------------------------------------+-----+----------+----+
only showing top 5 rows

==== Processing batch 17 ====


                                                                                

+------------------------------------------+-----+----------+----+
|window                                    |users|subreddits|urls|
+------------------------------------------+-----+----------+----+
|{2025-06-08 22:13:35, 2025-06-08 22:14:35}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:13:20, 2025-06-08 22:14:20}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:13:40, 2025-06-08 22:14:40}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:13:05, 2025-06-08 22:14:05}|[[]] |[[]]      |[[]]|
|{2025-06-08 22:13:30, 2025-06-08 22:14:30}|[[]] |[[]]      |[[]]|
+------------------------------------------+-----+----------+----+
only showing top 5 rows



In [9]:
ss1.stop()