# Get Reddit submissions data

We'll use the [psaw](https://psaw.readthedocs.io/en/latest/) Python wrapper for the [Pushshift API](https://github.com/pushshift/api) to query the [AskDocs subreddit](https://www.reddit.com/r/AskDocs/) for submissions posted from 2017 through Jan 21st, 2022.

In [None]:
import datetime as dt
import pickle

In [None]:
# for chime alerts for query success/fails
import chime

In [None]:
# Select the 'zelda' sound theme for the chime alerts used below
chime.theme('zelda')

In [None]:
from psaw import PushshiftAPI

In [None]:
# Pushshift client object; the query cells below call api.search_submissions(...)
api = PushshiftAPI()

In [None]:
# First and last calendar years to collect (both inclusive).
# The first year is pulled on its own below; the rest are pulled in a loop.
start_year, end_year = 2017, 2022

The data we want to query is pretty big, and it will take some time to pull all of it. Let's query for 2017 first, because it is the smallest data chunk by year, and we can use it to do some EDA while the rest of the querying finishes.

In [None]:
# Collected submissions, keyed by year; each value is a list of the
# `.d_` payloads of the results returned for that year
d = {}

Note: it's unclear whether `end_epoch` (passed as `before`) is inclusive or exclusive in the Pushshift API, so we'll assume it's exclusive here and check for duplicates in EDA.

In [None]:
%%time

year = 2017

# Query window in UTC epoch seconds: from midnight on Jan 1 of `year`
# up to midnight on Jan 1 of the following year.
window_open = dt.datetime(year, 1, 1, tzinfo=dt.timezone.utc)
window_close = dt.datetime(year + 1, 1, 1, tzinfo=dt.timezone.utc)
start_epoch = int(window_open.timestamp())
end_epoch = int(window_close.timestamp())

gen = api.search_submissions(
    subreddit='askdocs',
    after=start_epoch,
    before=end_epoch,
)

# Keep only the `.d_` payload of every result yielded by the search.
d[year] = [submission.d_ for submission in gen]

print('=== Finished ====')

In [None]:
# Number of submissions collected for 2017
len(d[2017])

In [None]:
# Write the 2017 submissions to their own pickle file.
fname = 'reddit_askdocs_submissions_2017.pkl'
with open(fname, mode='wb') as pkl_out:
    pickle.dump(d[2017], pkl_out)

Now let's query for the remaining years of the data sample.

In [None]:
%%time

import time

# Retry policy for a single month's query. Without a cap and a pause, a
# persistent failure would retry forever in a tight loop.
MAX_TRIES = 8          # attempts per month before giving up
RETRY_BASE_DELAY = 2   # seconds to wait after the first failure; doubles each time
RETRY_MAX_DELAY = 60   # upper bound on the wait between attempts, in seconds


def _month_epoch_bounds(year, month):
    """Return (start, end) UTC epoch seconds for one calendar month.

    `start` is midnight on the 1st of `month`; `end` is midnight on the 1st
    of the following month, rolling over to January of `year + 1` when
    `month` is 12.
    """
    if month == 12:
        next_year, next_month = year + 1, 1
    else:
        next_year, next_month = year, month + 1
    start = dt.datetime(year, month, 1, tzinfo=dt.timezone.utc)
    end = dt.datetime(next_year, next_month, 1, tzinfo=dt.timezone.utc)
    return int(start.timestamp()), int(end.timestamp())


for year in range(start_year+1, end_year+1):
    print(f'\n\n=== Collecting data for year {year} ====')
    chime.info()
    d[year] = []

    # Getting a lot of connection errors when querying by year, so querying by month instead
    for month in range(1, 13):
        start_epoch, end_epoch = _month_epoch_bounds(year, month)

        for n_tries in range(1, MAX_TRIES + 1):
            print(f'-- Attempt #{n_tries} to get the data for {year}-{month:02}.')
            try:
                gen = api.search_submissions(
                    subreddit='askdocs',
                    after=start_epoch,
                    before=end_epoch
                )
                # Iterating `gen` stays inside the try so that errors raised
                # while reading the results are retried as well.
                data_ym = [thing.d_ for thing in gen]
            except Exception as e:
                chime.error()
                print(e)
                if n_tries == MAX_TRIES:
                    # Stop instead of looping forever; d[year] only holds the
                    # months collected before this one.
                    raise RuntimeError(
                        f'Giving up on {year}-{month:02} after {MAX_TRIES} failed attempts.'
                    ) from e
                # Exponential backoff before the next attempt.
                time.sleep(min(RETRY_BASE_DELAY * 2 ** (n_tries - 1), RETRY_MAX_DELAY))
            else:
                print(f'-- Got the data for {year}-{month:02} after {n_tries} attempts.')
                chime.success()
                break

        d[year].extend(data_ym)

    print(f'=== Finished {year} ====')

In [None]:
# Years for which submissions are now stored in `d`
d.keys()

In [None]:
# Write the whole `d` mapping (year -> list of submissions) to one pickle file.
# NOTE(review): if the 2017 cell was run in this session, `d` also contains the
# 2017 key, so the file holds 2017 data even though its name starts at 2018 -
# confirm whether that is intended.
fname = 'reddit_askdocs_submissions_2018_to_20220121.pkl'
with open(fname, mode='wb') as pkl_out:
    pickle.dump(d, pkl_out)