## Load Reddit Comments Data into Parquet <a class="tocSkip">
This notebook loads the raw [Reddit comments dataset](http://academictorrents.com/details/85a5bd50e4c365f8df70240ffd4ecc7dec59912b) into a parquet file format. It augments the data with several improved time columns, and then partitions the data by year/month/day. The file paths in this notebook should be modified for your system.

In [None]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pyspark.sql.utils as U
from pyspark.sql import SparkSession
from pyspark.sql.window import Window as W

import pandas as pd

# Do not truncate long string cells (e.g. comment bodies) when a result is
# displayed through pandas.
pd.set_option('display.max_colwidth', None)

# Build the Spark session used by every cell below. SparkSession comes from
# pyspark.sql and has to be imported explicitly so that the notebook also runs
# on a plain Python kernel, not only inside a pre-initialised pyspark shell.
# getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("RedditCommentsLoadToParquet")
    .getOrCreate()
)

In [None]:
# Explicit schema for the raw comment JSON, passed to spark.read.json in the
# load loop. Both retrieved_utc and retrieved_on are declared because the load
# loop merges them into a single retrieved_on column.
# NOTE(review): `edited` is declared boolean here; if the raw data ever carries
# a non-boolean value in that field (e.g. an edit timestamp), confirm how the
# JSON reader treats those records.
_comment_fields = (
    ("id", T.StringType),
    ("parent_id", T.StringType),
    ("author", T.StringType),
    ("link_id", T.StringType),
    ("subreddit", T.StringType),
    ("subreddit_id", T.StringType),
    ("edited", T.BooleanType),
    ("score", T.LongType),
    ("body", T.StringType),
    ("created_utc", T.LongType),
    ("retrieved_utc", T.LongType),
    ("retrieved_on", T.LongType),
)

reddit_comments_schema = T.StructType(
    [T.StructField(field_name, field_type()) for field_name, field_type in _comment_fields]
)

In [None]:
import gc

# Pin the session time zone to UTC. The load loop below derives created_date
# and day from created_utc with F.from_unixtime, which formats timestamps in
# the session time zone, so this keeps the day partitions aligned to UTC.
spark.conf.set("spark.sql.session.timeZone", "UTC")

def has_column(df, col_name):
    """Return a boolean literal Column saying whether ``df`` has ``col_name``.

    The membership test is an exact-name lookup in ``df.columns`` evaluated in
    Python; the resulting True/False is wrapped in ``F.lit`` so it can be used
    inside Spark column expressions.

    Args:
        df: object exposing a ``columns`` collection of column names.
        col_name: name to look for.

    Returns:
        ``F.lit(True)`` if the name is present, otherwise ``F.lit(False)``.
    """
    return F.lit(col_name in df.columns)

# (year, month) pairs to process on this run. Each pair selects one
# RC_<year>-<month>* input glob and one year=<year>/month=<month> output
# directory in the loop below; the output directory is overwritten.
# NOTE(review): the commented-out entries are presumably months handled in
# earlier runs and kept for reference - confirm before re-enabling them.
load_months = [
#     (2021, 7),
#     (2021, 8),
#     (2021, 9),
#     (2021, 10),
#     (2021, 11),
#     (2021, 12),
#     (2022, 1),
#     (2022, 2),
#     (2022, 3),
#     (2022, 4),
    (2022, 8),
]

# For every requested (year, month): read the raw bz2 JSON, normalise the
# retrieval timestamp, derive the date columns, and write the month out as
# parquet partitioned by day under year=<year>/month=<month>.
for year, month in load_months:
    # The trailing * picks up every raw file whose name starts with RC_<year>-<month>.
    file_path = 'qfs:///data/reddit/comments/raw/RC_{0}-{1:02d}*.bz2'.format(year, month)
    print('loading data for year-month {0}-{1:02d} at file path {2}'.format(year, month, file_path))
    reddit_df = (
        spark.read.json(
            file_path,
            schema=reddit_comments_schema,
        )
        # Use retrieved_utc when it is present, otherwise keep the existing
        # retrieved_on value (first non-null of the two).
        .withColumn(
            'retrieved_on',
            F.coalesce(F.col('retrieved_utc'), F.col('retrieved_on')),
        )
    )

    reddit_finalized = (
        reddit_df
        .select(
            'author',
            'link_id',
            'retrieved_on',
            'subreddit',
            'subreddit_id',
            'id',
            'parent_id',
            'edited',
            'score',
            'body',
            'created_utc',
            # Both derived columns are formatted in the session time zone.
            F.from_unixtime('created_utc', 'yyyy-MM-dd').alias('created_date'),
            F.from_unixtime('created_utc', 'dd').alias('day')
        )
        # Co-locate each day's rows before the partitioned write.
        .repartition('day')
    ).cache()

    try:
        # count() materialises the cache, so the write below reuses it instead
        # of re-reading and re-parsing the raw files.
        print('    There are {0} total rows in month data set.'.format(reddit_finalized.count()))

        out_path = 'qfs:///data/reddit/comments/processed/year={0}/month={1:02d}'.format(year, month)
        print('    writing to: {0}'.format(out_path))
        reddit_finalized.write.partitionBy(
            'day'
        ).parquet(
            out_path,
            mode='overwrite'
        )
        print('\n')
    finally:
        # Always release the cached month, even when the count or the write
        # fails, so a failed run does not leave the data pinned in the cluster.
        reddit_finalized.unpersist()

    # Drop the driver-side references before moving on to the next month.
    del reddit_finalized
    del reddit_df
    gc.collect()
    

In [None]:
# Read back the whole processed tree written by the load loop. The loop writes
# to .../year=<year>/month=<month> and partitions by day, so year, month and day
# live in the directory names; Spark's partition discovery is expected to
# surface them as columns. printSchema() is the check that they are there.
reddit_processed = spark.read.parquet('qfs:///data/reddit/comments/processed/')
reddit_processed.printSchema()

In [None]:
# Sanity check: number of comments and of distinct authors per year over the
# whole processed data set, shown as a small pandas table.
yearly_totals = reddit_processed.groupBy('year').agg(
    F.count('*').alias('count'),
    F.countDistinct('author').alias('authors'),
)
yearly_totals.sort('year').toPandas()

In [None]:
# Comments and distinct authors per year.
# NOTE(review): this computes the same per-year summary as the previous cell,
# which means a second full pass over the data; drop this cell, or change the
# grouping if a different breakdown was intended.
year_col = F.col('year')
(
    reddit_processed
    .groupBy(year_col)
    .agg(
        F.count('*').alias('count'),
        F.countDistinct(F.col('author')).alias('authors'),
    )
    .orderBy(year_col)
    .toPandas()
)

In [None]:
# Per-day comment and distinct-author counts for August 2022 (year == 2022,
# month == 8), to check that every day partition of that month has data.
partition_keys = ['year', 'month', 'day']
in_august_2022 = (F.col('year') == 2022) & (F.col('month') == 8)
(
    reddit_processed
    .where(in_august_2022)
    .groupBy(*partition_keys)
    .agg(
        F.count('*').alias('count'),
        F.countDistinct('author').alias('authors'),
    )
    .sort(*partition_keys)
    .toPandas()
)