# Reddit Data retrieval

### Authenticating with reddit

In [1]:
import yaml
from praw import Reddit

In [2]:
with open('auth.yaml') as yaml_f:
    auth_obj = yaml.full_load(yaml_f)

In [3]:
reddit = Reddit(
    username=auth_obj['username'],
    password=auth_obj['password'],
    client_id=auth_obj['client-id'],
    client_secret=auth_obj['client-secret'],
    user_agent='aita_prediction_bot'
)

KeyError: 'client-id'

In [4]:
reddit.user.me()

Redditor(name='aita_prediction_bot')

### Use pushift.io to retrieve posts

In [4]:
from datetime import datetime, timedelta


def datetime_tuple_range(start=None, end=None):
    span = end - start
    for i in range(span.days + 1):
        yield (start + timedelta(days=i), start + timedelta(days=i + 1))

date_tuples = list(datetime_tuple_range(start=datetime(2016, 8, 23), end=datetime(2020, 8, 23)))
date_tuples[:5]

[(datetime.datetime(2016, 8, 23, 0, 0), datetime.datetime(2016, 8, 24, 0, 0)),
 (datetime.datetime(2016, 8, 24, 0, 0), datetime.datetime(2016, 8, 25, 0, 0)),
 (datetime.datetime(2016, 8, 25, 0, 0), datetime.datetime(2016, 8, 26, 0, 0)),
 (datetime.datetime(2016, 8, 26, 0, 0), datetime.datetime(2016, 8, 27, 0, 0)),
 (datetime.datetime(2016, 8, 27, 0, 0), datetime.datetime(2016, 8, 28, 0, 0))]

In [17]:
import time
import requests
from tqdm import tqdm


submissions = []
failed = []

for time_after, time_before in tqdm(date_tuples):

    request_params = {
        'before': int(time.mktime(time_before.timetuple())),
        'after': int(time.mktime(time_after.timetuple())),
        'subreddit': 'AmITheAsshole',
        'size': 500
    }
    url = 'https://api.pushshift.io/reddit/submission/search/'

    response = requests.get(url, params=request_params)
    if response:
        submissions.extend(response.json()['data'])
    else:
        failed.append((time_after, time_before))
    time.sleep(3) # Avoid rate limiting

 17%|█▋        | 255/1462 [16:17<1:17:08,  3.83s/it]


ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [None]:
import json

with open('../data/submissions-2016-2020-500-per-day.json', 'w') as json_f:
    json.dump(
        {
            'submissions': submissions
        },
        json_f
    )

In [None]:
len(submissions)

In [None]:
import random

index = random.randint(0, len(submissions))

In [None]:
submissions[index]

In [None]:
import time
import requests
from tqdm import tqdm


for time_after, time_before in tqdm(failed):

    request_params = {
        'before': int(time.mktime(time_before.timetuple())),
        'after': int(time.mktime(time_after.timetuple())),
        'subreddit': 'AmITheAsshole'
    }
    url = 'https://api.pushshift.io/reddit/submission/search/'

    response = requests.get(url, params=request_params)
    if response:
        submissions.extend(response.json()['data'])
    else:
        print('request failed')
    time.sleep(4) # Avoid rate limiting

In [None]:
with open('../data/submissions-2016-2020-500-per-day-with-failed.json', 'w') as json_f:
    json.dump(
        {
            'submissions': submissions
        },
        json_f
    )

In [15]:
len(submissions)

11955

In [16]:
len(failed)

1