# Reddit Data retrieval

### Authenticating with Reddit

In [46]:
import yaml
from praw import Reddit

In [47]:
# Read the Reddit credentials from the local YAML file into `auth_obj`.
# NOTE(review): yaml.full_load resolves more YAML tags than yaml.safe_load;
# if auth.yaml only holds plain strings, safe_load would be enough — confirm.
with open('auth.yaml') as credentials_file:
    auth_obj = yaml.full_load(credentials_file)

In [48]:
# Collect the account and app credentials loaded from auth.yaml, then hand
# them to the PRAW client together with the fixed user agent string.
reddit_credentials = {
    'username': auth_obj['username'],
    'password': auth_obj['password'],
    'client_id': auth_obj['client-id'],
    'client_secret': auth_obj['client-secret'],
}
reddit = Reddit(user_agent='aita_prediction_bot', **reddit_credentials)

In [49]:
# Sanity check: display the account the client is authenticated as.
reddit.user.me()

Redditor(name='aita_prediction_bot')

### Use pushshift.io to retrieve posts

In [53]:
from datetime import datetime, timedelta


def datetime_tuple_range(start=None, end=None):
    """Yield consecutive one-day ``(window_start, window_end)`` pairs.

    The first window begins at ``start`` and each following window begins one
    day later. The day that begins ``(end - start).days`` days after ``start``
    is included, so when the two bounds are a whole number of days apart the
    last pair is ``(end, end + 1 day)``. Any partial day in ``end - start`` is
    dropped because only ``timedelta.days`` is used. Nothing is yielded when
    ``end`` is earlier than ``start``.

    Args:
        start: First window start (e.g. a ``datetime``). Required despite the
            ``None`` default.
        end: Start of the last window. Required despite the ``None`` default.

    Yields:
        Tuples ``(window_start, window_start + 1 day)``.

    Raises:
        TypeError: If ``start`` or ``end`` is ``None``. As this is a
            generator, the error surfaces on first iteration.
    """
    if start is None or end is None:
        # Without this guard the subtraction below fails with an opaque
        # "unsupported operand" TypeError.
        raise TypeError('datetime_tuple_range() requires both start and end')

    one_day = timedelta(days=1)
    day_count = (end - start).days + 1  # +1 keeps the end day itself
    for offset in range(day_count):
        window_start = start + timedelta(days=offset)
        yield (window_start, window_start + one_day)

# Build one (day_start, day_end) pair per day of the collection period and
# preview the first few.
collection_start = datetime(2018, 8, 23)
collection_end = datetime(2020, 8, 23)
date_tuples = list(datetime_tuple_range(start=collection_start, end=collection_end))
date_tuples[:5]

[(datetime.datetime(2018, 8, 23, 0, 0), datetime.datetime(2018, 8, 24, 0, 0)),
 (datetime.datetime(2018, 8, 24, 0, 0), datetime.datetime(2018, 8, 25, 0, 0)),
 (datetime.datetime(2018, 8, 25, 0, 0), datetime.datetime(2018, 8, 26, 0, 0)),
 (datetime.datetime(2018, 8, 26, 0, 0), datetime.datetime(2018, 8, 27, 0, 0)),
 (datetime.datetime(2018, 8, 27, 0, 0), datetime.datetime(2018, 8, 28, 0, 0))]

In [59]:
import time
import requests
from tqdm import tqdm


# Pushshift submission-search endpoint; constant across requests, so it is
# defined once instead of on every iteration.
url = 'https://api.pushshift.io/reddit/submission/search/'
# Upper bound (seconds) on a single request so a stalled connection cannot
# hang the whole multi-hour crawl.
REQUEST_TIMEOUT_SECONDS = 60
# Pause between requests to stay gentle on the API.
REQUEST_PAUSE_SECONDS = 10

# One entry per successfully fetched day; each entry is the list found under
# the response's 'data' key.
submissions = []

# NOTE(review): every request asks for a single page with no 'size' or
# pagination parameter; the counts shown further down (25 items in the last
# response, ~25 per day overall) suggest only a default-sized page is kept
# per day — confirm this sampling is intended.
for time_after, time_before in tqdm(date_tuples):

    # time.mktime interprets the naive datetimes as local time, so the day
    # boundaries follow the timezone of the machine running the notebook.
    request_params = {
        'before': int(time.mktime(time_before.timetuple())),
        'after': int(time.mktime(time_after.timetuple())),
        'subreddit': 'AmITheAsshole'
    }

    try:
        response = requests.get(url, params=request_params, timeout=REQUEST_TIMEOUT_SECONDS)
        # Same success criterion as truth-testing the response (status < 400),
        # but the raised error carries the status code for the log line below.
        response.raise_for_status()
        day_submissions = response.json()['data']
    except (requests.RequestException, ValueError, KeyError) as error:
        # Best effort: report the failed day and keep crawling rather than
        # losing hours of progress to one bad request or malformed body.
        print(f'request failed for {time_after:%Y-%m-%d}: {error!r}')
    else:
        submissions.append(day_submissions)
    time.sleep(REQUEST_PAUSE_SECONDS)

100%|██████████| 732/732 [2:16:12<00:00, 11.16s/it]  


In [60]:
# Number of submissions contained in the most recent response.
len(response.json()['data'])

25

In [61]:
import json

# Persist the per-day submission lists under a single top-level key.
submissions_payload = {'submissions': submissions}

with open('../data/submissions.json', 'w') as output_file:
    json.dump(submissions_payload, output_file)

In [62]:
# Number of per-day result lists collected (one per successful request).
len(submissions)

732

In [65]:
# Total number of submissions across all per-day lists.
sum(map(len, submissions))

18289