# Reddit Data retrieval

### Authenticating with Reddit

In [1]:
import yaml
from praw import Reddit

In [2]:
# Read the account and API-client settings from a local YAML file so they are
# not written into the notebook itself. The keys used further down are
# 'username', 'password', 'client-id' and 'client-secret'.
# NOTE(review): if auth.yaml only holds plain strings, yaml.safe_load would be
# sufficient and is the more restrictive loader -- confirm before switching.
with open('auth.yaml') as yaml_f:
    auth_obj = yaml.full_load(yaml_f)

In [3]:
# Create the PRAW client. Every secret comes from `auth_obj`; only the
# user agent string is spelled out here.
reddit = Reddit(
    # OAuth application identity
    client_id=auth_obj['client-id'],
    client_secret=auth_obj['client-secret'],
    # Account the client logs in as
    username=auth_obj['username'],
    password=auth_obj['password'],
    user_agent='aita_prediction_bot',
)

In [4]:
# Confirm the login worked by asking Reddit which account this client is
# authenticated as.
reddit.user.me()

Redditor(name='aita_prediction_bot')

### Use pushshift.io to retrieve posts

In [5]:
from datetime import datetime, timedelta


def datetime_tuple_range(start=None, end=None):
    """Yield consecutive one-day ``(window_open, window_close)`` pairs.

    The first window opens at ``start``. ``(end - start).days + 1`` windows
    are produced, so the last one opens ``(end - start).days`` days after
    ``start`` and closes one day later. Nothing is yielded when that count
    is zero or negative.

    Args:
        start: Opening instant of the first window (e.g. a ``datetime``).
        end: Instant used, together with ``start``, to count the windows.

    Yields:
        Tuples ``(window_open, window_open + 1 day)``; each window opens
        where the previous one closed.
    """
    one_day = timedelta(days=1)
    window_count = (end - start).days + 1
    window_open = start
    for _ in range(window_count):
        window_close = window_open + one_day
        yield (window_open, window_close)
        window_open = window_close

# Build one (after, before) window per day from 2016-08-23 through 2020-08-23.
# The generator emits `span.days + 1` windows, so the 2020-08-23 day is
# included and the last pair closes on 2020-08-24.
date_tuples = list(datetime_tuple_range(start=datetime(2016, 8, 23), end=datetime(2020, 8, 23)))
# Preview the first five windows.
date_tuples[:5]

[(datetime.datetime(2016, 8, 23, 0, 0), datetime.datetime(2016, 8, 24, 0, 0)),
 (datetime.datetime(2016, 8, 24, 0, 0), datetime.datetime(2016, 8, 25, 0, 0)),
 (datetime.datetime(2016, 8, 25, 0, 0), datetime.datetime(2016, 8, 26, 0, 0)),
 (datetime.datetime(2016, 8, 26, 0, 0), datetime.datetime(2016, 8, 27, 0, 0)),
 (datetime.datetime(2016, 8, 27, 0, 0), datetime.datetime(2016, 8, 28, 0, 0))]

In [22]:
import time
import requests
from tqdm import tqdm


# Download r/AmITheAsshole submissions from Pushshift, one request per daily
# window in `date_tuples`.
#
# NOTE(review): no `size` or pagination parameter is sent, so at most one page
# of results is kept per day -- confirm the page size against the Pushshift
# API documentation.
submissions = []

# Windows whose request did not succeed. Keeping them means a retry can target
# exactly these windows instead of guessing a slice of `date_tuples`.
failed_windows = []

url = 'https://api.pushshift.io/reddit/submission/search/'

# Seconds to wait for the server before giving up, so that a stalled
# connection counts as a failed window rather than blocking the run forever.
REQUEST_TIMEOUT_S = 30

for time_after, time_before in tqdm(date_tuples):

    request_params = {
        # time.mktime interprets the naive datetimes in the machine's local
        # timezone when converting them to epoch seconds.
        'before': int(time.mktime(time_before.timetuple())),
        'after': int(time.mktime(time_after.timetuple())),
        'subreddit': 'AmITheAsshole'
    }

    try:
        response = requests.get(url, params=request_params, timeout=REQUEST_TIMEOUT_S)
        # Raises for 4xx/5xx responses, i.e. the cases where the response
        # object would evaluate as falsy.
        response.raise_for_status()
        submissions.extend(response.json()['data'])
    except requests.RequestException as error:
        # Network errors and timeouts are recorded like HTTP errors so that a
        # single bad request does not abort the whole download.
        failed_windows.append((time_after, time_before))
        print('request failed', time_after.date(), error)
    time.sleep(5) # Avoid rate limiting

  2%|▏         | 2/110 [00:13<12:05,  6.72s/it]


KeyboardInterrupt: 

In [18]:
import json

# Persist everything collected so far as a JSON object with a single
# 'submissions' key.
with open('../data/submissions-2016-2020.json', 'w') as json_f:
    json.dump({'submissions': submissions}, json_f)

In [19]:
# Number of submission records currently held in memory.
len(submissions)

22864

In [20]:
import random

# Pick a random valid position in `submissions`.
# random.randint includes its upper bound, so randint(0, len(submissions))
# could return len(submissions) and make `submissions[index]` raise
# IndexError. randrange excludes the upper bound.
index = random.randrange(len(submissions))

In [21]:
# Inspect the randomly chosen record to see which fields Pushshift returns.
submissions[index]

{'author': 'kirbykazzkarina',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_13f02zmd',
 'author_patreon_flair': False,
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1540422257,
 'domain': 'self.AmItheAsshole',
 'full_link': 'https://www.reddit.com/r/AmItheAsshole/comments/9r4pjq/aita_because_i_cant_control_my_emotions_and_my/',
 'gildings': {'gid_1': 0, 'gid_2': 0, 'gid_3': 0},
 'id': '9r4pjq',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 17,
 'num_crossposts': 0,
 'over_18': False,
 'parent_whitelist_status': 'house_only',
 'permalink': '/r

In [28]:
from praw.models import Submission

# Look the sampled record up once, then build a PRAW Submission from the URL
# stored in it.
sampled_record = submissions[index]
submission = Submission(reddit, url=sampled_record['url'])

In [31]:
# Flair text attached to the fetched submission.
submission.link_flair_text

'Not the A-hole'

In [24]:
### The last few requests failed, so let's retry them

import time
import requests
from tqdm import tqdm

# Reload the submissions saved earlier, then re-request the tail of
# `date_tuples` and append whatever comes back.
#
# NOTE(review): the counts recorded in this notebook (25614 - 22864 = 2750,
# i.e. exactly 25 per retried window) suggest each request is capped at 25
# results because no `size`/pagination parameter is sent -- confirm against
# the Pushshift API documentation.
with open('../data/submissions-2016-2020.json') as json_f:
    submissions = json.load(json_f)['submissions']

# Index of the first window to retry. Hard-coded from the earlier run.
# TODO confirm that every window from here on actually failed.
RETRY_FROM_INDEX = 1352

# Windows whose request did not succeed this time, so a further retry can
# target exactly these.
failed_windows = []

url = 'https://api.pushshift.io/reddit/submission/search/'

# Seconds to wait for the server before giving up, so that a stalled
# connection counts as a failed window rather than blocking the run forever.
REQUEST_TIMEOUT_S = 30

for time_after, time_before in tqdm(date_tuples[RETRY_FROM_INDEX:]):

    request_params = {
        # time.mktime interprets the naive datetimes in the machine's local
        # timezone when converting them to epoch seconds.
        'before': int(time.mktime(time_before.timetuple())),
        'after': int(time.mktime(time_after.timetuple())),
        'subreddit': 'AmITheAsshole'
    }

    try:
        response = requests.get(url, params=request_params, timeout=REQUEST_TIMEOUT_S)
        # Raises for 4xx/5xx responses, i.e. the cases where the response
        # object would evaluate as falsy.
        response.raise_for_status()
        submissions.extend(response.json()['data'])
    except requests.RequestException as error:
        # Network errors and timeouts are recorded like HTTP errors so that a
        # single bad request does not abort the whole retry pass.
        failed_windows.append((time_after, time_before))
        print('request failed', time_after.date(), error)
    time.sleep(4) # Avoid rate limiting

100%|██████████| 110/110 [09:11<00:00,  5.01s/it]


In [25]:
# Write the combined list (original download plus retried windows) to its own
# file, again as a JSON object with a single 'submissions' key.
with open('../data/submissions-2016-2020-with-failed.json', 'w') as json_f:
    json.dump({'submissions': submissions}, json_f)

In [26]:
# Record count after the retried windows were appended.
len(submissions)

25614