In [1]:
from pyspark.sql import SparkSession
from operator import add
import json
import re
import datetime

# Build (or reuse) the Spark session used by every later cell.
# The configuration values are kept exactly as originally specified:
# dynamic allocation is enabled with shuffle tracking, the external shuffle
# service is switched off, and the driver / block-manager ports are pinned.
spark_session = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("test2_Junming_Ma")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# The analysis works on RDDs, which are reached through the SparkContext.
spark_context = spark_session.sparkContext
# Only show ERROR-level log messages from here on.
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/07 16:32:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Path of the Reddit comments file, relative to the HDFS root.
DATA_PATH = 'data/reddit.json'
hdfs_uri = f"hdfs://spark-master:9000/{DATA_PATH}"

# Read the file as an RDD of text lines and peek at the first one.
lines = spark_context.textFile(hdfs_uri)
first_line = lines.take(1)
print("First line of file:", first_line)

[Stage 0:>                                                          (0 + 1) / 1]

First line of file: ['{"author":"raysofdarkmatter","body":"I think it should be fixed on either UTC standard or UTC+1 year around, with the current zone offsets.\\n\\nMoving timescales add a lot of complexity to the implementation of timekeeping systems and have [dubious value]( \\n\\nI think seasonal shifting time made sense in the pre-electric past, when timekeeping was more flexible and artificial light was inefficient and often dangerous. \\n\\nNow we have machines that work easily with simple timekeeping rules, and it\'s more beneficial to spend a small amount on energy for lighting, and save the larger cost of engineering things to work with the complex timekeeping rules, as well as saving the irritation to humans.\\n\\nLighting has gotten much more efficient over time; we can squeeze out a lot more photons per unit of energy from a 2012 CFL or LED than a candle could in 1780, or a lightbulb could in 1950. \\n\\nThere\'s a lot of room for improvement in how we use lights as well;

                                                                                

In [3]:
# ------------------------------
# 1. Parse the JSON Lines
# ------------------------------
def parse_json(line):
    """Decode one JSON-lines record, returning ``None`` when it is unusable.

    Args:
        line: One line of text expected to contain a single JSON object.

    Returns:
        The decoded ``dict``, or ``None`` when ``line`` is not valid JSON or
        decodes to anything other than a JSON object (number, list, string,
        ``null``, ...).
    """
    try:
        record = json.loads(line)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError (malformed text), TypeError
        # covers non-string input, RecursionError covers pathologically deep
        # nesting. Bad lines are dropped rather than failing the whole job.
        return None
    # Downstream code calls record.get(...), so only JSON objects are usable;
    # any other valid JSON value is treated like a malformed line.
    return record if isinstance(record, dict) else None

# Parse every line, then keep only the records that parse_json accepted
# (it returns None for lines it cannot decode).
maybe_records = lines.map(parse_json)
parsed_rdd = maybe_records.filter(lambda record: record is not None)

first_parsed = parsed_rdd.take(1)
print("Parsed first record:", first_parsed)


Parsed first record: [{'author': 'raysofdarkmatter', 'body': "I think it should be fixed on either UTC standard or UTC+1 year around, with the current zone offsets.\n\nMoving timescales add a lot of complexity to the implementation of timekeeping systems and have [dubious value]( \n\nI think seasonal shifting time made sense in the pre-electric past, when timekeeping was more flexible and artificial light was inefficient and often dangerous. \n\nNow we have machines that work easily with simple timekeeping rules, and it's more beneficial to spend a small amount on energy for lighting, and save the larger cost of engineering things to work with the complex timekeeping rules, as well as saving the irritation to humans.\n\nLighting has gotten much more efficient over time; we can squeeze out a lot more photons per unit of energy from a 2012 CFL or LED than a candle could in 1780, or a lightbulb could in 1950. \n\nThere's a lot of room for improvement in how we use lights as well; as light

                                                                                

In [4]:
# Total number of comments: counts the records in parsed_rdd, i.e. the input
# lines that were parsed and survived the `is not None` filter.
total_comments = parsed_rdd.count()
print("Total comments:", total_comments)

def missing_value(record, field):
    """Report whether ``field`` is absent from ``record``, null, or empty.

    Args:
        record: A parsed comment (a dict).
        field: Name of the key to inspect.

    Returns:
        ``True`` when the key is missing, or its value is ``None`` or the
        empty string; ``False`` otherwise.
    """
    # .get() folds "key absent" and "value is None" into one case.
    value = record.get(field)
    return value is None or value == ""

# Count, for each key field, how many records lack a usable value.
# One Spark job runs per field, in the order body -> author -> subreddit.
# The field name is bound as a lambda default so each filter checks its own field.
missing_body, missing_author, missing_subreddit = [
    parsed_rdd.filter(lambda r, f=field: missing_value(r, f)).count()
    for field in ("body", "author", "subreddit")
]

print("Missing 'body':", missing_body)
print("Missing 'author':", missing_author)
print("Missing 'subreddit':", missing_subreddit)


25/03/07 16:33:48 ERROR TransportClient: Failed to send RPC RPC 5058954677989605909 to /192.168.2.135:45648: io.netty.channel.StacklessClosedChannelException
io.netty.channel.StacklessClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(Object, ChannelPromise)(Unknown Source)
25/03/07 16:33:48 ERROR TransportClient: Failed to send RPC RPC 6620663130107817154 to /192.168.2.135:45648: io.netty.channel.StacklessClosedChannelException
io.netty.channel.StacklessClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(Object, ChannelPromise)(Unknown Source)
                                                                                

Total comments: 3848330




Missing 'body': 0
Missing 'author': 0
Missing 'subreddit': 136


                                                                                

In [5]:
# ------------------------------
# Comment Length Analysis
# ------------------------------
# Length (in characters) of each comment's 'body'.
# `or ""` makes a body that is absent *or* explicitly null count as length 0;
# a plain .get("body", "") default would pass None to len() and raise TypeError.
comment_lengths = parsed_rdd.map(lambda r: len(r.get("body") or ""))

# stats() computes count, mean, stdev, min and max in a single pass.
length_stats = comment_lengths.stats()
print("Comment Length Stats:")
print("  Count:", length_stats.count())
print("  Mean:", length_stats.mean())
print("  Stdev:", length_stats.stdev())
print("  Min:", length_stats.min())
print("  Max:", length_stats.max())



Comment Length Stats:
  Count: 3848330
  Mean: 1622.758239548064
  Stdev: 1542.569038521586
  Min: 12
  Max: 40216


                                                                                

In [6]:
# Count comments per subreddit.
# Records whose 'subreddit' is absent, null, or empty are all grouped under
# "unknown". A plain .get("subreddit", "unknown") default would only catch the
# absent case and leave None / "" as separate keys.
subreddit_counts = (
    parsed_rdd
    .map(lambda r: (r.get("subreddit") or "unknown", 1))
    .reduceByKey(add)
)

# Negating the count makes takeOrdered return the largest counts first.
top_subreddits = subreddit_counts.takeOrdered(10, key=lambda x: -x[1])
print("Top 10 Subreddits by comment count:")
for sub, cnt in top_subreddits:
    print("  ", sub, cnt)



Top 10 Subreddits by comment count:
   AskReddit 589947
   relationships 352049
   leagueoflegends 109307
   tifu 52219
   relationship_advice 50416
   trees 47286
   gaming 43851
   atheism 43268
   AdviceAnimals 40783
   funny 40171


                                                                                

In [None]:
# Stop the Spark session created at the top of the notebook; run this last,
# since the earlier cells need the session to be active.
spark_session.stop()