# <font color='magenta'>EDA on Reddit 1% Sample </font>

In [25]:
import pandas as pd

ImportError: No module named 'pandas'

In [24]:
!pip install --user pandas

[33mYou are using pip version 18.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
# Read the sampled Reddit data from its parquet file.
# `spark` is not defined in this notebook section; it is expected to be
# provided by the kernel.
sample_path = "one_perct_sample.parquet"
one_perct_df = spark.read.parquet(sample_path)

> The data is a nominal one percent sample drawn from the Reddit parquet file, which contains 2.8+ billion records.
> The records are posts/Submissions (in PRAW terms). This is evident from the presence of 'num_comments', 'name', 'locked' & other attributes that are common to posts/Submissions.

In [4]:
# Total number of rows in the sample (~300+ million).
total_rows = one_perct_df.count()
total_rows

309811130

In [5]:
%%time
# Number of rows per creation year, listed in chronological order.
rows_per_year = one_perct_df.groupBy("created_utc_year").count()
rows_per_year.sort("created_utc_year").show()

+----------------+--------+
|created_utc_year|   count|
+----------------+--------+
|            2005|     121|
|            2006|   49806|
|            2007|  295787|
|            2008|  869100|
|            2009| 2267457|
|            2010| 5823105|
|            2011|14809391|
|            2012|31257433|
|            2013|48294467|
|            2014|63861731|
|            2015|80257957|
|            2016|62024775|
+----------------+--------+

CPU times: user 7.22 ms, sys: 9.85 ms, total: 17.1 ms
Wall time: 9.04 s


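> This 1% stratified sample is off by roughly a factor of ten: it holds ~310 million records, which is about 11% (not 1%) of the 2.8+ billion records in the full file. The actual yearly counts in the sample are therefore greater than the expected yearly counts by about an order of magnitude.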

# <font color='magenta'>Subreddits</font>

In [28]:
from pyspark.sql.functions import from_unixtime

In [29]:
# Add a calendar-day string column derived from the `created_utc` column.
# NOTE(review): from_unixtime formats using the Spark session time zone --
# confirm the session is set to UTC if UTC days are intended.
day_format = "yyyy-MM-dd"  # a full timestamp would be: yyyy-MM-dd HH:mm:ss.SS
created_day = from_unixtime(one_perct_df.created_utc, day_format)
one_perct_df = one_perct_df.withColumn("created_utc_yearMonthDay", created_day)

In [30]:
from pyspark.sql.functions import length

In [31]:
# Add the character length of each record's `body` column as `body_len`.
body_length = length(one_perct_df.body)
one_perct_df = one_perct_df.withColumn("body_len", body_length)

In [32]:
%%time
# Row count for every (subreddit, calendar day) pair.
day_keys = ["subreddit", "created_utc_yearMonthDay"]
yearsMonthDay_df = one_perct_df.groupBy(day_keys).count()

# Mean of the daily counts per subreddit; show the ten largest.
avg_post_df = yearsMonthDay_df.groupBy("subreddit").avg()
avg_post_df.sort("avg(count)", ascending = False).show(10)

+---------------+------------------+
|      subreddit|        avg(count)|
+---------------+------------------+
|      AskReddit| 9625.758874320434|
|          funny|2303.9110468850354|
|leagueoflegends|2222.1374468085105|
|           pics|2138.2788033099937|
|  AdviceAnimals|1866.3457497612226|
|     The_Donald|1725.2114882506528|
|            nfl|1411.4748329621382|
|         gaming|1408.0648119841028|
|       politics|1341.2522631261315|
|   pcmasterrace|1336.1035767511178|
+---------------+------------------+
only showing top 10 rows

CPU times: user 21 ms, sys: 3.3 ms, total: 24.3 ms
Wall time: 44.5 s


> Average number of posts/Submissions per day for each Subreddit (top 10 shown).

In [33]:
%%time
# Largest single-day count per subreddit; show the ten largest.
max_post_df = yearsMonthDay_df.groupBy("subreddit").max()
max_post_df.sort("max(count)", ascending = False).show(10)

+-----------------+----------+
|        subreddit|max(count)|
+-----------------+----------+
|        AskReddit|     25487|
|         politics|     18517|
|              nfl|     14949|
|           gaming|     13775|
|     pcmasterrace|     13304|
|millionairemakers|     12874|
|             news|     12298|
|              nba|     12160|
|    SquaredCircle|     12033|
|              CFB|     11263|
+-----------------+----------+
only showing top 10 rows

CPU times: user 8.01 ms, sys: 8.35 ms, total: 16.4 ms
Wall time: 17.6 s


In [34]:
# Hand-picked subreddits examined individually in the cells below.
these_subreddits = [
    "worldnews",
    "news",
    "AskCulinary",
    "AskHistorians",
    "howto",
    "todayilearned",
    "conspiracy",
    "MilitaryConspiracy",
    "PedoGate",
    "FalseFlagWatch",
    "skeptic",
    "politicalfactchecking",
    "MensRights",
    "MRActivism",
    "glasgow",
    "melbourne",
    "travel",
    "photography",
]

In [49]:
# Keep only the per-day count rows that belong to the hand-picked subreddits.
is_selected = yearsMonthDay_df["subreddit"].isin(these_subreddits)
subreddits_df = yearsMonthDay_df.where(is_selected)

## Calculate IQR bounds (Q1, median, Q3) of daily post counts for select Subreddits

In [60]:
import numpy as np

In [99]:
# IQR_arr = np.zeros(shape=[len(these_subreddits), 4])
# IQR_arr = np.empty(shape = [len(these_subreddits),4], dtype = 'U, f, f, f')
# IQR_arr = np.empty(shape = [0,4], dtype = 'U, f, f, f')

In [100]:
# Quartiles (Q1, median, Q3) of the per-day post count for each selected
# subreddit. Results are printed and also kept in `quartiles_by_subreddit`
# (subreddit name -> list returned by approxQuantile) so later cells can reuse
# them without re-running the Spark jobs.
# relativeError = 0 requests exact quantiles, which Spark documents as
# potentially very expensive.
quartiles_by_subreddit = {}
for entry in these_subreddits:
    temp_df = subreddits_df.filter(subreddits_df.subreddit == entry)
    IQR_result = temp_df.approxQuantile(col = "count", probabilities = [0.25, 0.5, 0.75], relativeError = 0)
    # The result can be an empty list (seen for "PedoGate"), presumably because
    # the subreddit has no rows in the sample; it is stored as-is.
    quartiles_by_subreddit[entry] = IQR_result
    print(entry, IQR_result)

worldnews [265.0, 758.0, 1844.0]
news [41.0, 165.0, 1348.0]
AskCulinary [22.0, 30.0, 38.0]
AskHistorians [72.0, 91.0, 111.0]
howto [3.0, 6.0, 9.0]
todayilearned [226.0, 1296.0, 1839.0]
conspiracy [29.0, 148.0, 275.0]
MilitaryConspiracy [1.0, 1.0, 1.0]
PedoGate []
FalseFlagWatch [1.0, 1.0, 1.0]
skeptic [18.0, 32.0, 46.0]
politicalfactchecking [1.0, 2.0, 4.0]
MensRights [37.0, 125.0, 180.0]
MRActivism [1.0, 1.0, 1.0]
glasgow [3.0, 7.0, 13.0]
melbourne [15.0, 39.0, 93.0]
travel [25.0, 56.0, 86.0]
photography [37.0, 94.0, 135.0]
