In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
spark_session = SparkSession.builder \
    .master("spark://192.168.2.47:7077") \
    .appName("Group40_Project") \
    .config("spark.dynamicAllocation.enabled", True) \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True) \
    .config("spark.shuffle.service.enabled", False) \
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s") \
    .config("spark.executor.memory", "5G") \
    .config("spark.cores.max", 12) \
    .getOrCreate()

# RDD API
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/13 02:31:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/13 02:31:49 WARN StandaloneSchedulerBackend: Dynamic allocation enabled without spark.executor.cores explicitly set, you may get more executors allocated than expected. It's recommended to set spark.executor.cores explicitly. Please check SPARK-30299 for more details.


## Loading the data as a DataFrame

In [3]:
# Loading the dataset without the corrupted values
df = spark_session.read.option("mode", "DROPMALFORMED").json("hdfs://192.168.2.47:9000/data-project/corpus-webis-tldr-17.json")

                                                                                

In [4]:
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)



In [5]:
df.show(10)

+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|            author|                body|             content|content_len|     id|      normalizedBody|           subreddit|subreddit_id|             summary|summary_len|   title|
+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|  raysofdarkmatter|I think it should...|I think it should...|        178|c69al3r|I think it should...|                math|    t5_2qh0n|Shifting seasonal...|          8|    NULL|
|           Stork13|Art is about the ...|Art is about the ...|        148|c6a9nxd|Art is about the ...|               funny|    t5_2qh33|Personal opinions...|          4|    NULL|
|     Cloud_dreamer|Ask me what I thi...|Ask me what I thi...|         76|c6acx4l|Ask me what I thi.

In [6]:
df_prep = df.drop('body', 'content', 'id', 'subreddit_id', 'title')
df_prep = df_prep.withColumn("content_len", F.col("content_len").cast("int"))
df_prep = df_prep.withColumn("summary_len", F.col("summary_len").cast("int"))

In [7]:
df_prep.printSchema()

root
 |-- author: string (nullable = true)
 |-- content_len: integer (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: integer (nullable = true)



In [8]:
df_prep.show(10)

+------------------+-----------+--------------------+--------------------+--------------------+-----------+
|            author|content_len|      normalizedBody|           subreddit|             summary|summary_len|
+------------------+-----------+--------------------+--------------------+--------------------+-----------+
|  raysofdarkmatter|        178|I think it should...|                math|Shifting seasonal...|          8|
|           Stork13|        148|Art is about the ...|               funny|Personal opinions...|          4|
|     Cloud_dreamer|         76|Ask me what I thi...|         Borderlands|insults and slack...|         73|
|     NightlyReaper|        213|In Mechwarrior On...|            gamingpc|Yes, Joysticks in...|         19|
|    NuffZetPand0ra|        404|You are talking a...|              Diablo|Class only items ...|          7|
|beatlecreedcabaret|        130|All but one of my...|   RedditLaqueristas|      OPI Nail Envy!|          3|
|      nobodysdiary|        

In [9]:
df_prep.createTempView("reddit")

In [10]:
spark_session.sql(
    """
    SELECT subreddit, ROUND(AVG(content_len), 2)
    FROM reddit
    GROUP BY subreddit
    """
).show(10)



+--------------------+--------------------------+
|           subreddit|round(avg(content_len), 2)|
+--------------------+--------------------------+
|               anime|                    235.91|
|          MensRights|                    289.63|
|              travel|                    263.82|
|londonfootballmeetup|                     284.5|
|               HPMOR|                    285.16|
|     youtubecomments|                    157.13|
|        SaltLakeCity|                    190.72|
| UnresolvedMysteries|                    391.01|
|          MLBTheShow|                    215.68|
|           metro2033|                    101.63|
+--------------------+--------------------------+
only showing top 10 rows



                                                                                