In [27]:
# Load the Reddit dump from parquet.
# Believed to be subreddit submissions only: comments (the children of
# submissions) do not seem to be included.
# NOTE(review): the schema printed below also lists comment-style fields such
# as `body` and `controversiality` -- confirm which record types are present.
reddit = (
    spark.read
    .parquet("/var/reddit-parquet")
)

In [14]:
# Confirm what kind of object the parquet read returned.
type(reddit)

pyspark.sql.dataframe.DataFrame

In [19]:
# Count the columns of the DataFrame.
len(reddit.columns)

70

In [15]:
# Print the schema tree: each column's name, type, and nullability.
reddit.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- adserver_click_url: string (nullable = true)
 |-- adserver_imp_pixel: string (nullable = true)
 |-- approved_by: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- banned_by: string (nullable = true)
 |-- body: string (nullable = true)
 |-- body_html: string (nullable = true)
 |-- clicked: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- disable_comments: boolean (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- from: string (nullable = true)
 |-- from_id: string (nullable = true)
 |-- from_kind: string (nullable = true)
 |-- gilded: long 

In [123]:
reddit.count() # total number of records; a 1% sample of this is roughly 28.6 million rows

2859977347

> ### The DataFrame we created has a fairly large number of columns (70), is deeply nested in several instances (up to 8 layers deep), and contains a very large number of records (about 2.86 billion).

In [62]:
# Columns kept for exploratory data analysis (EDA).
# The list order is the column order of the resulting selection.
these_cols = [
    "id", "parent_id",        # record id and the id of its parent
    "subreddit", "author",
    "created",                # bigint in the source schema
    "body",
    "num_comments", "score",
]

In [111]:
# Narrow the full DataFrame to the EDA columns, then show each kept
# column's name and type.
cols_df = reddit.select(*these_cols)
cols_df.dtypes

[('id', 'string'),
 ('parent_id', 'string'),
 ('subreddit', 'string'),
 ('author', 'string'),
 ('created', 'bigint'),
 ('body', 'string'),
 ('num_comments', 'bigint'),
 ('score', 'bigint')]

In [112]:
from pyspark.sql.functions import from_unixtime

In [113]:
# Re-type "created" from bigint to string, keeping the same column name.
# NOTE(review): the next step feeds this column to from_unixtime, which works
# on epoch seconds -- this string round-trip may be unnecessary; confirm
# before removing, since later cells may expect a string here.
cols_df = cols_df.withColumn("created", cols_df.created.cast("string"))

In [114]:
# Derive "created_year" by formatting the epoch value in "created" with the
# pattern "yyyy" (a full timestamp would be yyyy-MM-dd HH:mm:ss.SS).
# NOTE(review): the per-year tally further down shows most rows ending up with
# a null year. The source schema also has a separate string column
# `created_utc` -- check whether it is populated where `created` is null.
cols_df = cols_df.withColumn(
    "created_year",
    from_unixtime(cols_df.created, format="yyyy"),
)

### take stratified sample of data (stratified by year)

In [126]:
# stratify by created year

# cols_df.sampleBy("created_year", {1: 0.01}).count()
# NOTE(review): the fractions dict is keyed by stratum value, and strata that
# are not listed get a fraction of zero. "created_year" holds year strings, so
# the key 1 matches no stratum; the dict likely needs one entry per year,
# e.g. {"2006": 0.01, "2007": 0.01, ...}, plus a fixed seed for repeatability.

### show min start & max end dates (before removing records)
### show subreddits by count by year (before removing records)

In [130]:
# Tally records per creation year, then print the tally sorted by year.
years_df = (
    cols_df
    .groupBy("created_year")
    .count()
)
years_df.sort("created_year").show()

+------------+----------+
|created_year|     count|
+------------+----------+
|        null|2694014185|
|        2006|      1817|
|        2007|    279724|
|        2008|   2527732|
|        2009|   4854283|
|        2010|   7064885|
|        2011|  15047383|
|        2012|  18504969|
|        2013|     24296|
|        2014|  52893169|
|        2015|  64764904|
+------------+----------+



> ### The data spans 10 calendar years, from 2006 through 2015, although the vast majority of records (2,694,014,185 of 2,859,977,347, roughly 94%) are missing a created year. Some years also have several orders of magnitude fewer records than others (e.g. 2006 and 2013).

### find empty or questionable records we don't want to include (e.g. id is null, or _corrupt_record is non-null) 

### check how many records have body = [deleted] & author = "None"

### show subreddits by count by year