In [None]:
import time

# Import necessary packages for spark, and processing the data with spark

from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, regexp_replace, lower, explode, trim, split
from pyspark.sql.types import StructType, StructField, StringType

In [None]:
# Build (or reuse) the Spark session used by every cell below.
# Dynamic allocation is enabled with shuffle tracking and the external
# shuffle service, idle executors time out after 30 s, each executor gets
# 2048 MB of memory, and the application is capped at 8 cores in total.

spark = (
    SparkSession.builder
    .master("spark://192.168.2.97:7077")
    .appName("Question-1-Final")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.memory", "2048m")
    .config("spark.cores.max", 8)
    .getOrCreate()
)

In [None]:
# Wall-clock start of the timed section; the last timing cell subtracts this
# from its own time.time() reading. It is taken after the Spark session has
# been created, so session start-up is not part of the measurement.
start_time = time.time()

In [None]:
# Parse the "body" field of JSON records into one word per row

def parse_json_body(file):
    """Read JSON records and return their ``body`` text as individual words.

    The body is lower-cased, a fixed set of punctuation/symbol characters is
    deleted, and the remaining text is split on runs of whitespace.

    Parameters
    ----------
    file : str or list of str
        Path(s) handed straight to ``spark.read.json``.

    Returns
    -------
    pyspark.sql.DataFrame
        A single string column ``word`` with one row per non-empty token.
        Records whose ``body`` is null contribute no rows.
    """
    # Only "body" is used, so declare it up front: this spares Spark the extra
    # pass over the whole file that schema inference would otherwise need.
    body_schema = StructType([StructField("body", StringType(), True)])
    df = spark.read.schema(body_schema).json(file)

    # Lowercase the body
    df_lowercase_body = df.select(lower("body").alias("lowercase_body"))

    # Character class (Java regex) of the characters to delete from the body.
    # A raw string keeps the backslashes intact without relying on Python
    # passing unknown escape sequences such as "\." through unchanged.
    characters_to_delete = r'''[\.,\[\]\(\):_\-!?'\+=;/&{}@$#*"\\%><|~¨´¤]'''

    # Remove those characters from the body
    df_clean = df_lowercase_body.withColumn(
        "lowercase_body",
        regexp_replace("lowercase_body", characters_to_delete, ""),
    )

    # Split the body into words and make a row for each word
    df_words = df_clean.select(explode(split("lowercase_body", r"\s+")).alias("word"))

    # Leading/trailing whitespace, or a body that is empty after cleaning,
    # makes split() emit "" tokens; drop them so they are not counted as a word.
    return df_words.where(df_words["word"] != "")

In [None]:
# Retrieve the monthly JSON files from HDFS and pre-process them into words

def get_word_count_and_df_all_years(
    years,
    base_path="hdfs://192.168.2.97:50000/user/ubuntu",
    months=("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"),
):
    """Load the monthly files of each year and build per-year word counts.

    For every year, the files ``<base_path>/RC_20<year>-<month>.json`` are
    parsed with ``parse_json_body`` and unioned into one word DataFrame.

    Parameters
    ----------
    years : iterable of str
        Two-digit year strings, e.g. ``"07"`` for 2007.
    base_path : str, optional
        Directory URL holding the monthly files (no trailing slash).
    months : iterable of str, optional
        Two-digit month strings to load for each year; defaults to all twelve.

    Returns
    -------
    tuple
        ``(word_count, df_all_years)``. ``word_count`` maps ``"20<year>"`` to
        a DataFrame with columns ``word`` and ``count`` ordered by descending
        count. ``df_all_years`` is a one-column (``word``) DataFrame holding
        the words of every requested year.
    """
    # Empty, correctly-typed frames act as the starting point for the unions,
    # so an empty `years` (or `months`) still yields a valid result.
    word_schema = StructType([StructField("word", StringType(), True)])
    word_count = {}
    df_all_years = spark.createDataFrame([], word_schema)

    for year in years:
        df_current_year = spark.createDataFrame([], word_schema)

        # Append every month of this year to the year's dataframe
        for month in months:
            month_path = f"{base_path}/RC_20{year}-{month}.json"
            df_current_year = df_current_year.union(parse_json_body(month_path))

        # Count the words of this year, most frequent first
        word_count[f"20{year}"] = (
            df_current_year.groupBy("word").count().orderBy("count", ascending=False)
        )

        # Add this year's (uncounted, unsorted) words to the all-years frame
        df_all_years = df_all_years.union(df_current_year)

    return word_count, df_all_years

In [None]:
# The years to analyse, as two-digit strings ("07" is expanded to 2007 when
# the file names and dictionary keys are built).
# Full run:
#years = ["06", "07", "08", "09", "10"]
years = ["07"] # For time test

# Get the per-year word_count dictionary and the all-years word dataframe
word_count, df_all_years = get_word_count_and_df_all_years(years)

In [None]:
# Find the 500 most used words across all years
word_count_all_years = df_all_years.groupBy("word").count().orderBy("count", ascending=False)
# This small frame is reused by every join below and by show(); caching it
# avoids recomputing the whole all-years pipeline each time and lets every
# consumer work from the same 500 rows.
most_common_words = word_count_all_years.limit(500).cache()

# Remove the most common words from the word count for each year.
# A join does not guarantee the row order of its left input, so the result is
# sorted again afterwards; otherwise show(10) need not return the top 10.
word_count_unique = {}
for year in years:
    word_count_unique["20" + year] = (
        word_count["20" + year]
        .join(most_common_words, ["word"], "leftanti")
        .orderBy("count", ascending=False)
    )

# Print the most common words for all the years
print("The 10 most common words for all years")
most_common_words.show(10)

# Print the most frequent remaining ("semi-unique") words for each year
for year in years:
    print("The 10 most semi-unique words for year: 20" + year)
    word_count_unique["20" + year].show(10)

In [None]:
# Report the wall-clock time elapsed since start_time was recorded
end_time = time.time()
print(f"Elapsed time: {end_time - start_time} seconds")

In [None]:
# Stop the Spark session now that all results have been shown
spark.stop()