In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pymongo import MongoClient
import json
import pprint
import praw

import pprint
import bs4 as bs
import urllib.request
import bz2,shutil

import boto3
import os
import time
import subprocess

In [3]:
def get_download_links():
    """Scrape the pushshift comment index and list the monthly archives.

    Fetches the directory listing at
    ``https://files.pushshift.io/reddit/comments/``, keeps every listed file
    whose name starts with ``RC`` and builds the bookkeeping table used by the
    rest of the notebook.

    Returns:
        pandas.DataFrame with one row per archive and the columns
        ``month`` (the part of the file name between the 3-character prefix
        and the first dot), ``link`` (absolute download URL), ``downloaded``
        (initialised to False), ``size_in_bytes`` and ``comment_count``
        (both initialised to 0).

    Raises:
        ValueError: if the fetched page contains no ``<table>`` element.
    """
    url_source = 'https://files.pushshift.io/reddit/comments/'

    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url_source) as response:
        source = response.read()
    soup = bs.BeautifulSoup(source, 'lxml')
    table = soup.table
    if table is None:
        raise ValueError('No file table found at {}'.format(url_source))

    url_dict = {'month': [], 'link': []}
    for row in table.find_all('tr', class_='file'):
        anchor = row.find('a')
        if anchor is None:
            # A listing row without a link carries nothing to download.
            continue
        rel_url = anchor.text
        if rel_url.startswith('RC'):
            dot = rel_url.find('.')
            url_dict['month'].append(rel_url[3:dot])
            url_dict['link'].append(url_source + rel_url)

    links_df = pd.DataFrame(url_dict)
    # Bookkeeping columns that the processing loop fills in per month.
    links_df['downloaded'] = False
    links_df['size_in_bytes'] = 0
    links_df['comment_count'] = 0

    return links_df

In [28]:
# Get download links from website and save to csv

# links_df = get_download_links()
# links_df.to_csv('comment_files/links_dataframe.csv', index=False)

In [36]:
# Load the previously saved bookkeeping table (one row per monthly archive)
# from disk and preview its first rows.

links_df = pd.read_csv('comment_files/links_dataframe.csv')
links_df.head(15)

Unnamed: 0,month,link,downloaded,size_in_bytes,comment_count
0,2005-12,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
1,2006-01,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
2,2006-02,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
3,2006-03,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
4,2006-04,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
5,2006-05,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
6,2006-06,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
7,2006-07,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
8,2006-08,https://files.pushshift.io/reddit/comments/RC_...,False,0,0
9,2006-09,https://files.pushshift.io/reddit/comments/RC_...,False,0,0


In [30]:
def download_file(download_url, s3_bucket = None, remove_file=True):
    """Fetch one compressed archive into ``./comment_files/`` and extract it.

    Args:
        download_url: URL of the archive. Only the text after the last ``/``
            (the file name) is used when ``s3_bucket`` is given.
        s3_bucket: Name of an S3 bucket to download from via boto3. When
            ``None`` the file is fetched over HTTP from ``download_url``.
        remove_file: Forwarded to ``extract_file``.

    Returns:
        Whatever ``extract_file`` returns for the downloaded path.
    """
    # Everything after the final slash is the object key / local file name.
    filename = download_url.rsplit('/', 1)[-1]
    filepath = './comment_files/' + filename
    print('Downloading file...')

    if s3_bucket is not None:
        boto3.client('s3').download_file(s3_bucket, filename, filepath)
    else:
        urllib.request.urlretrieve(download_url, filepath)

    return extract_file(filepath, remove_file)

In [31]:
def extract_file(fp, remove_file):
    """Decompress a downloaded archive next to itself as a ``.json`` file.

    Args:
        fp: Path of the compressed archive. Only ``.bz2`` is supported.
        remove_file: When true, the compressed archive is deleted after
            extraction. Deletion is best effort: a failure is printed and the
            function still returns.

    Returns:
        Tuple ``(fileout, filesize)``: the path of the decompressed file
        (``fp`` with its extension replaced by ``.json``) and its size in
        bytes.

    Raises:
        ValueError: if the extension of ``fp`` is not ``.bz2``. Nothing is
            written or removed in that case.
    """
    print('Extracting file...')
    # splitext only inspects the final path component, so a dot in a
    # directory name is never mistaken for the start of the extension.
    stem, extension = os.path.splitext(fp)
    fileout = stem + '.json'

    if extension == '.bz2':
        # copyfileobj streams in chunks; the dump is never held in memory whole.
        with bz2.BZ2File(fp) as fr, open(fileout, 'wb') as fw:
            shutil.copyfileobj(fr, fw)
    else:
        # Other formats (e.g. .xz) are not handled yet.
        raise ValueError('Cannot decompress files of type {}'.format(extension))

    if remove_file:
        print('Removing file...')
        try:
            os.remove(fp)
        except OSError as err:
            # Only filesystem errors are tolerated here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            print("Error while deleting file ", fp, err)

    filesize = os.path.getsize(fileout)
    return fileout, filesize

In [32]:
def mongo_import(month, fp):
    """Load a decompressed comment dump into MongoDB with ``mongoimport``.

    The file is imported into database ``reddit``, collection
    ``comments-<month>``, and then deleted from disk (best effort).

    Args:
        month: Month label used to build the collection name.
        fp: Path of the JSON file handed to ``mongoimport --file``.

    Returns:
        dict with the keys ``db_name``, ``collection_name``, ``filepath`` and
        ``month`` describing where the data was loaded.

    Raises:
        subprocess.CalledProcessError: if ``mongoimport`` exits with a
            non-zero status (``check=True``). The file is left on disk then.
    """
    db_name = 'reddit'
    collection_name = 'comments-{}'.format(month)
    # Argument list (no shell), so the values are passed through verbatim.
    cmd = ['mongoimport', '-d', db_name, '-c', collection_name, '--file', fp]
    print('Loading to mongodb...')
    subprocess.run(cmd, check=True, text=True)

    try:
        os.remove(fp)
    except OSError as err:
        # Tolerate filesystem errors only; do not swallow KeyboardInterrupt etc.
        print("Error while deleting file ", fp, err)

    return {'db_name': db_name, 'collection_name': collection_name, 'filepath': fp, 'month': month}

In [33]:
def filter_comments(mongoinfo):
    """Copy the relevant comments of one month into ``myreddit`` and drop the source.

    Selects, from the imported collection, the comments that belong to one of
    the subreddits ``politics``, ``sports``, ``worldnews`` or ``The_Donald``
    and whose ``parent_id`` equals their ``link_id`` (presumably the comments
    that reply directly to the post). A fixed subset of fields is inserted
    into the same-named collection of the ``myreddit`` database, after which
    the source collection is dropped.

    Args:
        mongoinfo: dict with at least the keys ``db_name`` and
            ``collection_name`` (as returned by ``mongo_import``).

    Returns:
        Number of documents in the source collection before filtering.

    Note:
        The projection keeps the source ``_id``, so running this twice for the
        same month is expected to fail on duplicate keys.
    """
    db_name, collection_name = mongoinfo['db_name'], mongoinfo['collection_name']
    client = MongoClient()
    try:
        comments = client[db_name][collection_name]
        # Collection.count() is deprecated; count_documents({}) is the
        # supported way to count every document in the collection.
        comment_count = comments.count_documents({})

        cursor = comments.find({'subreddit':
                                    {'$in': ['politics', 'sports', 'worldnews', 'The_Donald']},
                                '$expr':
                                    {'$eq': ['$link_id', '$parent_id']}},
                               {'_id': 1, 'author': 1, 'body': 1, 'created_utc': 1, 'id': 1, 'link_id': 1,
                                    'parent_id': 1, 'score': 1, 'subreddit': 1})

        my_comments = client['myreddit'][collection_name]

        print('Saving relevant comments...')
        for doc in cursor:
            my_comments.insert_one(doc)

        print('{} comments saved.'.format(my_comments.count_documents({})))
        print('Dropping comments source collection...')
        comments.drop()
    finally:
        # Release the connection even if the copy fails part-way through.
        client.close()

    return comment_count

In [34]:
def get_posts(praw_reddit, month):
    """Fetch metadata for every post referenced by a month's saved comments.

    Reads the distinct ``link_id`` values from ``myreddit.comments-<month>``,
    asks the Reddit API for them through ``praw_reddit.info`` and stores one
    document per returned submission in ``myreddit.posts-<month>``.

    Args:
        praw_reddit: Object exposing ``info(ids)`` that yields submissions
            (the notebook passes a ``praw.Reddit`` instance).
        month: Month label used to build both collection names.

    Returns:
        None. The results are written to MongoDB.
    """
    client = MongoClient()
    try:
        db = client['myreddit']
        comments = db['comments-{}'.format(month)]
        post_ids = comments.distinct('link_id')
        distinct_posts = len(post_ids)
        print('There are {} distinct posts for the month'.format(distinct_posts))

        posts = db['posts-{}'.format(month)]

        praw_generator = praw_reddit.info(post_ids)
        print('Getting post data...')
        for submission in praw_generator:
            # NOTE(review): `submission.id` appears to be the bare id, whereas
            # the comments' `link_id` carries a `t3_` prefix — confirm before
            # joining posts to comments on this field.
            d = {'link_id': submission.id,
                 'title': submission.title,
                 'score': submission.score,
                 'is_self': submission.is_self,
                 'datetime': submission.created_utc,
                 'sub': submission.subreddit.display_name,
                 'permalink': submission.permalink}
            posts.insert_one(d)
    finally:
        # Release the connection even if the Reddit API or an insert fails.
        client.close()

In [None]:
# Reddit API credentials are read from files kept outside the notebook.
with open('../keys/reddit_appid.txt') as f:
    APP_ID = f.read().rstrip()
with open('../keys/reddit_secret.txt') as f:
    APP_SECRET = f.read().rstrip()
reddit = praw.Reddit(client_id=APP_ID,
                     client_secret=APP_SECRET,
                     user_agent='script:my.project:v1.0.0 (by /u/Someone)')

# Bucket holding the monthly archives; named once instead of repeated below.
S3_BUCKET = 'aust-galv-aust-finalcap'

# Per month: download -> extract -> mongoimport -> filter -> fetch posts,
# then persist progress so an interrupted run can be inspected or resumed.
# for idx, row in links_df[links_df['downloaded'] == False].iloc[12:13].iterrows():
for idx, row in links_df.iloc[10:50].iterrows():
    start = time.time()
    month = row['month']
    url = row['link']
    print('Reddit comments month: {}'.format(month))

    filename = url[url.rfind('/') + 1:]
    s3_url = 's3://' + S3_BUCKET + '/' + filename
    fp, filesize = download_file(s3_url, s3_bucket=S3_BUCKET)
    mongoinfo = mongo_import(month, fp)

    comment_count = filter_comments(mongoinfo)
    get_posts(reddit, month)

    # `idx` is an index *label* from iterrows(), so address the row with .loc
    # and name the columns instead of relying on their positions.
    links_df.loc[idx, 'downloaded'] = True
    links_df.loc[idx, 'size_in_bytes'] = filesize
    links_df.loc[idx, 'comment_count'] = comment_count
    links_df.to_csv('comment_files/links_dataframe.csv', index=False)

    print('Done! Time elapsed: {:1.2f}'.format(time.time() - start))
    print('')

Reddit comments month: 2006-10
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 3.54

Reddit comments month: 2006-11
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 4.02

Reddit comments month: 2006-12
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 3.95

Reddit comments month: 2007-01
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 5.18

Reddit comments month: 2007-02
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 6.17

Reddit comments month: 2007-03
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 6.62

Reddit comments month: 2007-04
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 16.19

Reddit comments month: 2007-05
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 9.42

Reddit comments month: 2007-06
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 9.67

Reddit comments month: 2007-07
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...
0 comments saved.
Dropping comments source collection...
There are 0 distinct posts for the month
Getting post data...
Done! Time elapsed: 10.93

Reddit comments month: 2007-08
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...




6711 comments saved.
Dropping comments source collection...
There are 1802 distinct posts for the month
Getting post data...
Done! Time elapsed: 25.52

Reddit comments month: 2007-09
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...




11987 comments saved.
Dropping comments source collection...
There are 2945 distinct posts for the month
Getting post data...
Done! Time elapsed: 35.11

Reddit comments month: 2007-10
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...




19588 comments saved.
Dropping comments source collection...
There are 5047 distinct posts for the month
Getting post data...
Done! Time elapsed: 52.98

Reddit comments month: 2007-11
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...




34909 comments saved.
Dropping comments source collection...
There are 8351 distinct posts for the month
Getting post data...
Done! Time elapsed: 81.71

Reddit comments month: 2007-12
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...




38973 comments saved.
Dropping comments source collection...
There are 7884 distinct posts for the month
Getting post data...
Done! Time elapsed: 81.16

Reddit comments month: 2008-01
Downloading file...
Extracting file...
Removing file...
Loading to mongodb...


  


Saving relevant comments...




51442 comments saved.
Dropping comments source collection...
There are 10514 distinct posts for the month
Getting post data...
Done! Time elapsed: 104.27

Reddit comments month: 2008-02
Downloading file...
Extracting file...


In [26]:
# client = MongoClient()
# db = client['myreddit']
# comments = db['comments-{}'.format(month)]

# links = ['t3_7mofk']

# praw_generator = reddit.info(links)
# for submission in praw_generator:
#     d = {'link_id': submission.id, 
#          'title': submission.title, 
#          'score': submission.score, 
#          'is_self': submission.is_self, 
#          'datetime': submission.created_utc, 
#          'sub': submission.subreddit.display_name, 
#          'permalink': submission.permalink}
#     pprint.pprint(d)

{'datetime': 1230747395.0,
 'is_self': False,
 'link_id': '7mofk',
 'permalink': '/r/politics/comments/7mofk/the_day_israel_used_a_boy_aged_13_as_a_human/',
 'score': 86,
 'sub': 'politics',
 'title': 'The day Israel used a boy aged 13 as a human shield '}


In [13]:
# Show the bookkeeping row at position 43 (sliced so it displays as a one-row DataFrame).
links_df.iloc[43:44, :]

Unnamed: 0,month,link,downloaded,size_in_bytes,comment_count
43,2009-07,https://files.pushshift.io/reddit/comments/RC_...,False,0,0


In [27]:
# Sanity check: import a single month and count what landed in MongoDB.
mongoinfo = mongo_import('2006-01', './comment_files/RC_2006-01.json')

client = MongoClient()
temp_db = client['reddit']
temp_col = temp_db[mongoinfo['collection_name']]

# Collection.count() is deprecated; count_documents({}) counts every document.
temp_col.count_documents({})

Loading to mongodb...


  import sys


3666

In [28]:
# Look at one raw imported document to see which fields a comment carries.
temp_col.find_one()

{'_id': ObjectId('5e542549d70d406b412f6299'),
 'link_id': 't3_22515',
 'stickied': False,
 'subreddit_id': 't5_6',
 'controversiality': 0,
 'body': "Microsoft hates it's own products?\r\nWho knew?",
 'author_flair_css_class': None,
 'created_utc': 1136078623,
 'ups': 2,
 'score': 2,
 'subreddit': 'reddit.com',
 'id': 'c2718',
 'parent_id': 't3_22515',
 'edited': False,
 'author_flair_text': None,
 'author': 'Pichu0102',
 'retrieved_on': 1473821517,
 'distinguished': None,
 'gilded': 0}

In [37]:
# Count the filtered comments stored for August 2011.
client = MongoClient()
db = client['myreddit']
comments = db['comments-2011-08']
# Collection.count() is deprecated; count_documents({}) counts every document.
comments.count_documents({})

  after removing the cwd from sys.path.


128764

In [71]:
# Re-run post retrieval for whatever `month` currently holds
# (NOTE(review): relies on a variable left over from an earlier cell — confirm the intended month).
get_posts(reddit, month)

There are 16462 distinct posts for the month
Getting post data...


In [72]:
# `posts` only exists as a local inside get_posts(), so look the collection up
# explicitly; otherwise this cell raises NameError on a fresh kernel.
posts = MongoClient()['myreddit']['posts-{}'.format(month)]
posts.find_one()

{'_id': ObjectId('5e5419474d3ce5d53043c5e0'),
 'link_id': 'fnf5q',
 'title': 'UW Milwaukee protests- Not as intense as Madison, but we do what we can.',
 'score': 12,
 'is_self': False,
 'datetime': 1297974010.0,
 'sub': 'politics',
 'permalink': '/r/politics/comments/fnf5q/uw_milwaukee_protests_not_as_intense_as_madison/'}