In [None]:
import time

# Import the PySpark packages needed to create the session and process the data

from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, regexp_replace, lower, explode, trim, split, lit, row_number, year, dense_rank, desc, monotonically_increasing_id
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.window import Window

# In some cases we need to differentiate PySpark functions from Python built-in functions

import pyspark.sql.functions as py_func

In [None]:
# Build (or reuse) the Spark session used by every later cell.
# The builder chain is wrapped in parentheses, so no line-continuation
# backslashes are needed.

spark = (
    SparkSession.builder
    .master("spark://192.168.2.97:7077")
    .appName("Question-3-Final")
    # Dynamic-allocation settings
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Resource limits
    .config("spark.executor.memory", "2048m")
    .config("spark.cores.max", 8)
    .getOrCreate()
)

In [None]:
# Wall-clock start of the timed section; read back when the elapsed time is printed
start_time = time.time()

In [None]:
# Function for parsing the subreddit field of a JSON file

def parse_json_subreddit(file):
    """Read a JSON file and return one row per subreddit token.

    Parameters
    ----------
    file : str
        Path handed to ``spark.read.json`` (the caller in this notebook
        passes an HDFS URI).

    Returns
    -------
    pyspark.sql.DataFrame
        A single-column DataFrame named ``subreddits``. The ``subreddit``
        value of every record is split on runs of whitespace and exploded
        into one row per token; rows equal to ``'reddit.com'`` are removed.
    """
    # Only the `subreddit` field is used, so give Spark an explicit one-column
    # schema: this avoids the schema-inference pass over the whole file.
    # Note: records (or files) without a `subreddit` field produce NULL here,
    # and `explode` emits no row for a NULL array.
    subreddit_schema = StructType([StructField("subreddit", StringType(), True)])
    df = spark.read.json(file, schema=subreddit_schema)

    # Split the subreddit column into an array of strings.
    # Raw string: "\s" inside a normal literal is an invalid escape sequence.
    df = df.withColumn("subreddit_array", split(df.subreddit, r"\s+"))

    # Make a row for each subreddit, dropping the 'reddit.com' entries
    df_subreddits = (
        df.select(explode("subreddit_array").alias("subreddits"))
        .filter("subreddits != 'reddit.com'")
    )

    return df_subreddits

In [None]:
# The function that retrieves the data from HDFS and preprocesses it

def get_word_count_and_df_all_years(
    years,
    base_path="hdfs://192.168.2.97:50000/user/ubuntu",
    months=None,
):
    """Load the monthly JSON files for each year and count subreddit rows.

    Parameters
    ----------
    years : iterable of str
        Two-digit year suffixes, e.g. ``"07"`` for 2007.
    base_path : str, optional
        Directory that holds the ``RC_20YY-MM.json`` files. The default is
        the HDFS location this notebook has always read from.
    months : iterable of str, optional
        Two-digit month strings to load for every year. Defaults to all
        twelve months, ``"01"`` through ``"12"``.

    Returns
    -------
    tuple
        ``(word_count, df_all_years)`` where ``word_count`` maps
        ``"20" + year`` to a DataFrame with columns ``subreddits`` and
        ``count`` ordered by descending count, and ``df_all_years`` is the
        union of the ``subreddits`` rows of every requested year.
    """
    if months is None:
        months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
    else:
        # Materialise once: the months are iterated again for every year
        months = list(months)

    # Result dictionary, plus an empty single-column frame used as union seed
    word_count = {}
    schema = StructType([StructField("subreddits", StringType(), True)])
    df_all_years = spark.createDataFrame([], schema)

    # The loop variable is not called `year` so that it does not shadow the
    # `year` function imported from pyspark.sql.functions.
    for year_suffix in years:
        df_current_year = spark.createDataFrame([], schema)

        # Append all the months to the dataframe of the current year
        for month in months:
            month_path = base_path + "/RC_20" + year_suffix + "-" + month + ".json"
            df_current_month = parse_json_subreddit(month_path)
            df_current_year = df_current_year.union(df_current_month)

        # Count the rows per subreddit, most frequent first
        word_count["20" + year_suffix] = (
            df_current_year.groupBy("subreddits")
            .count()
            .orderBy("count", ascending=False)
        )

        # Add the current year to the combined dataframe
        df_all_years = df_all_years.union(df_current_year)

    return word_count, df_all_years

In [None]:
# The years we are going to use, as two-digit suffixes ("07" means 2007).
# Full range, currently disabled:
#years = ["06", "07", "08", "09", "10"]
years = ["07"] # Single year, used for the time test

# Get the word_count dictionary; the combined dataframe (second return value) is not used here
word_count, _ = get_word_count_and_df_all_years(years)

In [None]:
# Show the 10 most active subreddits for each year.
# The loop variable is deliberately not called `year`: at notebook level that
# would overwrite the `year` function imported from pyspark.sql.functions.
for year_suffix in years:
    print("The 10 most active subreddits for year: 20" + year_suffix)
    # The per-year frames are ordered by descending count, so the first 10 rows are the top 10
    word_count["20" + year_suffix].show(10)

In [None]:
# Stop the clock and report the wall-clock time elapsed since start_time was set
end_time = time.time()
print(f"Elapsed time: {end_time - start_time} seconds")

In [None]:
# Stop the Spark session created at the top of the notebook
spark.stop()