In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, regexp_replace, lower, explode, trim, split
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# Two-digit suffixes that select which monthly dumps (RC_20<year>-<month>.json) are loaded.
# Caveat from the author: running with every year and month selected was reported
# as not currently possible; shrink the lists (e.g. months = ["01"]) for a smaller run.
years = [f"{year:02d}" for year in range(6, 11)]
months = [f"{month:02d}" for month in range(1, 13)]

In [3]:
# Create (or reuse) the Spark session on the master below. Dynamic allocation
# with shuffle tracking is switched on, idle executors time out after 30s and
# the application is capped at 5 cores.
spark = (
    SparkSession.builder
    .master("spark://192.168.2.97:7077")
    .appName("claude_carlsson_hdfs")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.cores.max", 5)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/13 09:33:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
def parse_json_body_to_string(json_filename):
    """Load JSON records and return their ``body`` text as one cleaned word per row.

    The ``body`` column is lowercased, stripped of punctuation/symbol
    characters and split on whitespace.

    Args:
        json_filename: Location of the JSON data, passed straight to
            ``spark.read.json``. The records must contain a ``body`` field.

    Returns:
        A DataFrame with a single column ``word``. Empty tokens are removed.
    """
    # Read the json file
    df = spark.read.json(json_filename)

    # Make everything lowercase
    my_df_lower = df.select(lower("body").alias("lowercase_body"))

    # Character class of everything that is deleted from the text. Raw string so
    # the backslashes reach the regex engine as written instead of depending on
    # Python passing unknown escape sequences through (which raises a warning).
    characters_to_delete = r"[\.,\[\]\(\):_\-!?'\+=;/&{}@$#*\"\\%><|~¨´¤]"

    my_df_clean = my_df_lower.withColumn(
        "lowercase_body",
        regexp_replace(my_df_lower.lowercase_body, characters_to_delete, ""),
    )

    # Parse the body to words
    my_df_words = my_df_clean.select(explode(split("lowercase_body", r"\s+")).alias("word"))

    # Splitting text that starts or ends with whitespace (or that is empty after
    # cleaning) produces "" tokens; drop them so they are not counted as a word.
    return my_df_words.filter(my_df_words["word"] != "")

In [5]:
def get_word_count_and_df_all_years(
    selected_years=None,
    selected_months=None,
    base_path="hdfs://192.168.2.97:50000/user/ubuntu",
):
    """Build per-year word counts and one DataFrame holding the words of all years.

    Args:
        selected_years: Two-digit year suffixes (e.g. ``"06"``). Defaults to the
            notebook-level ``years`` list.
        selected_months: Two-digit month strings. Defaults to the notebook-level
            ``months`` list.
        base_path: Directory containing the ``RC_20<year>-<month>.json`` files.

    Returns:
        A tuple ``(word_count, df_all_years)``. ``word_count`` maps ``"20<year>"``
        to a DataFrame of ``word``/``count`` ordered by descending count;
        ``df_all_years`` is the union of the ``word`` rows of every loaded file.
    """
    if selected_years is None:
        selected_years = years
    if selected_months is None:
        selected_months = months

    word_count = {}
    empty_schema = StructType([StructField("word", StringType(), True)])
    # Empty seed frames keep the result well-defined when nothing is selected
    df_all_years = spark.createDataFrame([], empty_schema)

    for year in selected_years:
        df_this_year = spark.createDataFrame([], empty_schema)

        # Append all the months to the dataframe
        for month in selected_months:
            month_path = f"{base_path}/RC_20{year}-{month}.json"
            df_this_year = df_this_year.union(parse_json_body_to_string(month_path))

        # Count and sort the words. Save the result in word_count
        year_string = "20" + year
        word_count[year_string] = (
            df_this_year.groupBy("word").count().orderBy("count", ascending=False)
        )

        # Add this year's (unsorted, uncounted) words to the all-years frame
        df_all_years = df_all_years.union(df_this_year)
    return word_count, df_all_years

In [6]:
# Build the per-year word counts (dict keyed by "20<year>") and the DataFrame
# holding the words of all selected years
word_count, df_all_years = get_word_count_and_df_all_years()

                                                                                

In [7]:
# How many of the globally most frequent words are treated as "common"
NUM_COMMON_WORDS = 500

# Find the NUM_COMMON_WORDS most used words across all years
word_count_all_years = df_all_years.groupBy("word").count().orderBy("count", ascending=False)
most_common_words = word_count_all_years.limit(NUM_COMMON_WORDS)

# Remove the most common words from the word count for each year
word_count_unique = {}
for year in years:
    year_string = "20" + year
    # The anti-join keeps only words that are not in most_common_words. A join does
    # not guarantee the input row order, so sort again to keep the most frequent
    # remaining words first.
    word_count_unique[year_string] = (
        word_count[year_string]
        .join(most_common_words, ["word"], "leftanti")
        .orderBy("count", ascending=False)
    )

In [8]:
# Show the 10 most common words overall, then the top 10 remaining words of each
# year. Iterating the dict (instead of hard-coded "2006".."2010" keys) keeps this
# cell working when the year selection is changed.
most_common_words.show(10)
for year_string, yearly_unique_counts in word_count_unique.items():
    print(year_string)
    yearly_unique_counts.show(10)

                                                                                

+----+---------+
|word|    count|
+----+---------+
| the|100762469|
|  to| 65846163|
|   a| 60236289|
|   i| 53540807|
| and| 50730510|
|  of| 46658892|
| you| 37819126|
|that| 37324476|
|  is| 35889144|
|  it| 34511854|
+----+---------+
only showing top 10 rows



                                                                                

+--------+-----+
|    word|count|
+--------+-----+
| removed|32413|
|language| 8955|
|    code| 8034|
|    bush| 7574|
|    page| 6739|
|  israel| 5939|
|       c| 5932|
| perhaps| 5583|
|  google| 5560|
|    lisp| 5411|
+--------+-----+
only showing top 10 rows



                                                                                

+--------+------+
|    word| count|
+--------+------+
| removed|208600|
|    blog| 47863|
|    paul| 46595|
|    spam| 44608|
|    bush| 40833|
|     ron| 36043|
|    vote| 31417|
|    code| 30274|
|  states| 30000|
|language| 28938|
+--------+------+
only showing top 10 rows



                                                                                

+---------+------+
|     word| count|
+---------+------+
|    obama|174574|
|     vote|121155|
|   mccain| 99636|
|     bush| 85053|
|  america| 71386|
|   market| 69809|
|   states| 69174|
|     paul| 65798|
|president| 63432|
|   rights| 60091|
+---------+------+
only showing top 10 rows



                                                                                

+-----------+------+
|       word| count|
+-----------+------+
|     health|167849|
|    science|141079|
|      obama|134412|
|     market|133046|
|    perhaps|133007|
|      words|132869|
|   comments|128901|
|      woman|128162|
|information|125155|
|     cannot|124672|
+-----------+------+
only showing top 10 rows





+--------+------+
|    word| count|
+--------+------+
|     hit|335308|
|    wish|325164|
|together|321631|
|    huge|319151|
|   close|317905|
|    body|315029|
|     lol|314831|
|    dude|312322|
|   woman|311797|
|    past|311009|
+--------+------+
only showing top 10 rows



                                                                                

In [9]:
# Stop the Spark session now that all results have been displayed
spark.stop()