# Load data

In [55]:
reddit = spark.read.parquet("/var/reddit-parquet") # these records are posts/Submissions (in PRAW) can tell by existence of 'num_comments', 'name', 'locked' & other attributes that are common to posts/Submissions 

In [5]:
type(reddit)

pyspark.sql.dataframe.DataFrame

In [6]:
len(reddit.columns)

70

In [7]:
reddit.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- adserver_click_url: string (nullable = true)
 |-- adserver_imp_pixel: string (nullable = true)
 |-- approved_by: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- banned_by: string (nullable = true)
 |-- body: string (nullable = true)
 |-- body_html: string (nullable = true)
 |-- clicked: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- disable_comments: boolean (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- from: string (nullable = true)
 |-- from_id: string (nullable = true)
 |-- from_kind: string (nullable = true)
 |-- gilded: long 

In [8]:
record_count = reddit.count()

In [9]:
record_count # 2859977347; 1% of this is +28 million

2859977347

> ### The DataFrame we created has a fairly large number of columns (70), is deeply nested in several instances (up to 8 layers deep), and contains a significant number of records (+2.8 billion).

In [10]:
# select a subset of columns for EDA

these_cols = [
    "id",
    "parent_id",
    "subreddit",
    "author",
    "created_utc",
    "body",
    "num_comments",
    "score"
] # "created" is not as populated as created_utc, better to use created_utc

> ### For our initial EDA we primarily care about the columns indicated above so we'll subset the data appropriately.

In [11]:
cols_df = reddit.select(these_cols)
cols_df.dtypes

[('id', 'string'),
 ('parent_id', 'string'),
 ('subreddit', 'string'),
 ('author', 'string'),
 ('created_utc', 'string'),
 ('body', 'string'),
 ('num_comments', 'bigint'),
 ('score', 'bigint')]

> ### It's worthwhile to check the data types of the columns. You may notice that "created_utc" is a string. It actually contains the created date in Unix time/Epoch time (the number of seconds since January 1, 1970). We'll want to convert that to a more human readable format.

# Created date

In [12]:
from pyspark.sql.functions import from_unixtime

In [13]:
# add new col showing just created year
cols_df = cols_df.withColumn(
    "created_utc_year", 
    from_unixtime(
        cols_df["created_utc"], 
        "yyyy" # full timestamp: yyyy-MM-dd HH:mm:ss.SS
    )
)

In [14]:
years_df = cols_df.groupby("created_utc_year").count()
years_df.orderBy("created_utc_year").show()

+----------------+---------+
|created_utc_year|    count|
+----------------+---------+
|            null|        1|
|            2005|     1086|
|            2006|   419341|
|            2007|  2745064|
|            2008|  9773673|
|            2009| 23726352|
|            2010| 55571522|
|            2011|138398080|
|            2012|289689090|
|            2013|441953216|
|            2014|584765776|
|            2015|738997386|
|            2016|573936760|
+----------------+---------+



> ### The data covers 11 years from 2005 to 2016. Reddit was founded in 2005 and we can see a hint of the platform's growth just in terms of the number of posts in the dataset.

In [15]:
null_count = cols_df[cols_df["body"].isNull()].count()

In [16]:
null_count

279383793

In [17]:
round((float(null_count)/record_count)*100,2)

9.77

> ### While there are a large number of records with null values in "body" (+279 million), this is only ~10% of the overall data.

In [18]:
clean_df = cols_df[
    (cols_df["created_utc_year"].isNotNull())&
    (cols_df["body"].isNotNull())
]

In [19]:
clean_df.count() # 2,580,593,554

2580593554

> ### We are still left with +2.5 billion records!
> ### However, using all the records would be excessive and time-intensive.
> ### A stratified, one percent sample of the total should be sufficient. 

In [20]:
these_years = years_df.select("created_utc_year").distinct()

In [21]:
yearly_portion = these_years.count()/100.0

In [22]:
fraction_dict = {str(i["created_utc_year"]):yearly_portion for i in these_years.collect()}

In [23]:
# stratified sample
one_perct_df = clean_df.sampleBy(
    col = "created_utc_year",
    fractions = fraction_dict
)

In [56]:
one_perct_df.count() # ~333 million

333750701

In [25]:
one_perct_df.groupby("created_utc_year").count().orderBy("created_utc_year").show()

+----------------+--------+
|created_utc_year|   count|
+----------------+--------+
|            2005|     135|
|            2006|   54188|
|            2007|  318437|
|            2008|  937903|
|            2009| 2439411|
|            2010| 6274915|
|            2011|15954730|
|            2012|33677435|
|            2013|52037146|
|            2014|68783521|
|            2015|86458768|
|            2016|66814112|
+----------------+--------+



# Subreddits

In [26]:
# from pyspark.sql.functions import round, datediff, from_utc_timestamp

In [36]:
# add new col showing year month day
cols_df = cols_df.withColumn(
    "created_utc_yearMonthDay", 
    from_unixtime(
        cols_df["created_utc"], 
        "yyyy-MM-dd" # full timestamp: yyyy-MM-dd HH:mm:ss.SS
    )
)

In [37]:
yearsMonthDay_df = cols_df.groupby(["subreddit","created_utc_yearMonthDay"]).count()
avg_df = yearsMonthDay_df.groupby("subreddit").avg()
avg_df.orderBy("avg(count)", ascending = False).show(10)

+---------------+------------------+
|      subreddit|        avg(count)|
+---------------+------------------+
|      AskReddit| 82573.53356665606|
|          funny|21205.329404268876|
|           pics| 19387.07508749602|
|leagueoflegends|       19302.51375|
|  AdviceAnimals|17288.425298329355|
|     The_Donald| 14156.92824074074|
|         gaming| 12662.88141809291|
|       politics|11853.600784550392|
|            WTF| 11560.75676536135|
|      thebutton| 11310.20588235294|
+---------------+------------------+
only showing top 10 rows



In [96]:
max_df = yearsMonthDay_df.groupby("subreddit").max()
max_df.orderBy("max(count)", ascending = False).show(10)

+-----------------+----------+
|        subreddit|max(count)|
+-----------------+----------+
|        AskReddit|    217147|
|         politics|    156499|
|              nfl|    125586|
|           gaming|    122276|
|     pcmasterrace|    112017|
|        pokemongo|    107412|
|millionairemakers|    107299|
|              nba|    104318|
|             news|    103747|
|    SquaredCircle|    101708|
+-----------------+----------+
only showing top 10 rows



> ### These are extraordinarily high counts for daily posts, check when these occurred, may be impacting my stats here

In [115]:
these_subreddits = [
    "AskReddit","politics","nfl",
    "gaming","pcmasterrace","pokemongo",
    "millionairemakers","nba","news"
]
temp_df = yearsMonthDay_df.filter(yearsMonthDay_df.subreddit.isin(these_subreddits))

In [116]:
# yearsMonthDay_df.freqItems(['created_utc_yearMonthDay'], support = 0.01).show(3, False)
temp_df.freqItems(['created_utc_yearMonthDay'], support = 0.01).show(3,False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|created_utc_yearMonthDay_freqItems                                                                                                                                                                                      

In [118]:
# yearsMonthDay_df.where("created_utc_yearMonthDay='2015-12-13'").orderBy("count", ascending = False).show(10)
temp_df.where("created_utc_yearMonthDay='2010-05-24'").orderBy("count", ascending = False).show(10)

+---------+------------------------+-----+
|subreddit|created_utc_yearMonthDay|count|
+---------+------------------------+-----+
|AskReddit|              2010-05-24|23216|
| politics|              2010-05-24| 5269|
|   gaming|              2010-05-24| 4582|
|     news|              2010-05-24|  468|
|      nba|              2010-05-24|    5|
+---------+------------------------+-----+



In [120]:
these_dates = [
    "2010-05-24", "2009-02-18", "2011-05-11",
    "2010-07-12", "2008-04-19", "2009-04-03",
    "2008-03-17", "2009-12-01"    
]

In [125]:
temp_df.filter(temp_df.created_utc_yearMonthDay.isin(these_dates)).orderBy("count", ascending = False).show()

+---------+------------------------+-----+
|subreddit|created_utc_yearMonthDay|count|
+---------+------------------------+-----+
|AskReddit|              2011-05-11|49603|
|AskReddit|              2010-07-12|25401|
|AskReddit|              2010-05-24|23216|
|AskReddit|              2009-12-01|19060|
|   gaming|              2011-05-11|13371|
| politics|              2011-05-11| 9452|
|   gaming|              2010-07-12| 8816|
| politics|              2009-12-01| 6081|
| politics|              2010-05-24| 5269|
| politics|              2010-07-12| 5121|
|AskReddit|              2009-04-03| 4603|
|   gaming|              2010-05-24| 4582|
| politics|              2009-02-18| 4196|
|AskReddit|              2009-02-18| 3897|
|   gaming|              2009-12-01| 3338|
| politics|              2008-03-17| 3285|
| politics|              2009-04-03| 2302|
|   gaming|              2009-02-18| 1503|
| politics|              2008-04-19| 1308|
|      nba|              2011-05-11| 1087|
+---------+

# Can still evaluate distributions of scores & body_len for certain (common) Subreddits