In [4]:
import pandas as pd           
import praw                   
import re                     
import datetime as dt
import seaborn as sns
import requests
import json
import sys
import time
## acknowledgements
'''
https://stackoverflow.com/questions/48358837/pulling-reddit-comments-using-python-praw-and-creating-a-dataframe-with-the-resu
https://www.reddit.com/r/redditdev/comments/2e2q2l/praw_downvote_count_always_zero/
https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

For navigating pushshift: https://github.com/Watchful1/Sketchpad/blob/master/postDownloader.py

# traffic = reddit.subreddit(subreddit).traffic() is not available to us, sadly.
'''

# Load the Reddit API credentials from a file kept outside the notebook.
# NOTE(review): exec() runs the whole file as Python code, so ../API.env must
# be trusted. It is expected to define client_id, client_secret and
# user_agent, which are consumed just below.
with open("../API.env") as file:
    exec(file.read())

# Shared PRAW client used by the sampling helpers and cells below.
# No username/password is passed here -- presumably this yields a read-only
# client; confirm against the PRAW authentication docs.
reddit = praw.Reddit(
  client_id = client_id,
  client_secret = client_secret,
  user_agent = user_agent
)


'''
Helper functions for extracting data from Reddit API (PRAW) objects.
'''

def extract_num_rewards(awardings_data):
    """Return the total number of awards described by ``awardings_data``.

    Parameters
    ----------
    awardings_data : iterable of dict, or None
        Award records; the ``"count"`` entry of each record is added to the
        total. ``None`` is treated like an empty collection.

    Returns
    -------
    int
        Sum of the ``"count"`` values, or 0 when there are no records.

    Raises
    ------
    KeyError
        If a record has no ``"count"`` entry.
    """
    # ``or ()`` makes a missing awards field (None) count as zero awards
    # instead of raising TypeError when it is iterated.
    return sum(award["count"] for award in (awardings_data or ()))

def extract_data(submission, comments = False):
    """Flatten a submission into a plain dict of the fields used for analysis.

    Parameters
    ----------
    submission
        Submission-like object exposing the attributes read below
        (``title``, ``selftext``, ``score``, ``all_awardings``, ...).
    comments : bool, default False
        When true, the submission's top-level comments are collected as well;
        otherwise the ``"comments"`` entry is an empty list.

    Returns
    -------
    dict
        One record per submission. ``"downs"`` is always ``None`` and
        ``"total_awards"`` is computed from ``"awards"`` via
        ``extract_num_rewards``.
    """
    top_level_comments = []

    if comments:
        # limit=0: per the PRAW docs this drops the "load more comments"
        # placeholders instead of fetching them, so iterating the forest
        # yields only real top-level comments.
        submission.comments.replace_more(limit=0)
        top_level_comments = [
            {
                'author': reply.author,
                'body': reply.body,
                'awards': reply.all_awardings,
                'score': reply.score,
                'parent_id': reply.parent_id,
                'id': reply.id,
            }
            for reply in submission.comments
        ]

    # Key order is kept stable because it becomes the DataFrame column order.
    content = {
        "title": submission.title,
        "self": submission.is_self,
        "text": submission.selftext,
        "comments": top_level_comments,
        "author": submission.author,
        "name": submission.name,
        "time_created": submission.created_utc,
        "upvote_ratio": submission.upvote_ratio,
        "ups": submission.score,  # same value as submission.ups
        "downs": None,  # never populated here
        "awarders": submission.awarders,
        "awards": submission.all_awardings,
        "total_awards": None,  # filled in right below
        "url": submission.url,  # only meaningful for non-self posts
    }

    content.update(total_awards=extract_num_rewards(content["awards"]))
    return content

In [2]:
'''
Sample num_samples random submissions, or fetch the first num_samples submissions of the top, rising or controversial listings, and put them into DataFrames.

Opted instead to scrape the entire thing.
'''

def random_sample(num_samples, subreddit):
    """Collect up to ``num_samples`` random submissions from ``subreddit``.

    Parameters
    ----------
    num_samples : int
        Number of random draws to attempt.
    subreddit : str
        Subreddit name passed to ``reddit.subreddit``.

    Returns
    -------
    pandas.DataFrame
        One row per submission that was actually returned (see
        ``extract_data`` for the columns). It can hold fewer than
        ``num_samples`` rows, and is empty if no draw succeeded.
    """
    # The subreddit handle does not change between draws, so build it once.
    source = reddit.subreddit(subreddit)

    records = []
    skipped = 0
    for _ in range(num_samples):
        submission = source.random()
        # PRAW documents that random() returns None for subreddits that do not
        # expose a random submission; extract_data(None) would raise
        # AttributeError, so skip those draws instead of crashing.
        if submission is None:
            skipped += 1
            continue
        records.append(extract_data(submission))

    if skipped:
        print(f"Skipped {skipped} random draws that returned no submission.")
    return pd.DataFrame(records)

def sample(source):
    """Turn an iterable of submissions into a DataFrame.

    Parameters
    ----------
    source : iterable
        Submissions to convert; each one is passed through ``extract_data``.

    Returns
    -------
    pandas.DataFrame
        One row per submission yielded by ``source``.

    Prints how many submissions were collected.
    """
    submissions = [extract_data(item) for item in source]
    print(f"Got {len(submissions)} submissions. (This can be less than num_samples.)")
    frame = pd.DataFrame(submissions)
    return frame

def top_sample(num_samples, subreddit):
    """Return a DataFrame built from the ``top`` listing of ``subreddit``.

    At most ``num_samples`` submissions are requested (passed as ``limit``).
    """
    listing = reddit.subreddit(subreddit).top(limit=num_samples)
    return sample(listing)

def rising_sample(num_samples, subreddit):
    """Return a DataFrame built from the ``rising`` listing of ``subreddit``.

    At most ``num_samples`` submissions are requested (passed as ``limit``).
    """
    listing = reddit.subreddit(subreddit).rising(limit=num_samples)
    return sample(listing)

def controversial_sample(num_samples, subreddit):
    """Return a DataFrame built from the ``controversial`` listing of ``subreddit``.

    At most ``num_samples`` submissions are requested (passed as ``limit``).
    """
    listing = reddit.subreddit(subreddit).controversial(limit=num_samples)
    return sample(listing)


In [5]:

# Sample size and target subreddit shared by the (commented-out) sampling
# calls in this cell and by the inspection cells further down.
num_samples = 10
subreddit ='wallstreetbets'



#random_wsb = random_sample(num_samples, subreddit)
#top_wsb = top_sample(num_samples,subreddit)
#rising_wsb = rising_sample(num_samples, subreddit)
#controversial_wsb = controversial_sample(num_samples, subreddit)

#random_wsb.to_pickle("random_wsb.pkl")
#top_wsb.to_pickle("top_wsb.pkl")
#rising_wsb.to_pickle("rising_wsb.pkl")
#controversial_wsb.to_pickle("controversial_wsb.pkl")

# other commands here: https://praw.readthedocs.io/en/latest/code_overview/models/subreddit.html#praw.models.Subreddit.rising
# NB: The subreddit stream option seems useful.
# NB: There is also rising_random


Got 10 submissions. (This can be less than num_samples.)


In [6]:
# Grab one random submission to inspect interactively in the cells below.
# NOTE(review): PRAW documents that random() can return None for some
# subreddits -- the following cells assume a real submission came back.
submission = reddit.subreddit(subreddit).random() 

In [8]:
# Peek at a single attribute; no output was recorded for this cell, and the
# vars() dump further down lists approved_at_utc as None.
submission.approved_at_utc

In [11]:
# Dump the submission's instance attributes to see which fields are available.
vars(submission)

{'comment_limit': 2048,
 'comment_sort': 'confidence',
 'id': 'n79brp',
 '_reddit': <praw.reddit.Reddit at 0x209b5dbb280>,
 '_fetched': True,
 '_comments_by_id': {'t1_gxbmyyu': Comment(id='gxbmyyu'),
  't1_gxbwl4d': Comment(id='gxbwl4d'),
  't1_gxcgbf4': Comment(id='gxcgbf4'),
  't1_gxcl2le': Comment(id='gxcl2le'),
  't1_gxd1p7e': Comment(id='gxd1p7e')},
 'approved_at_utc': None,
 'subreddit': Subreddit(display_name='wallstreetbets'),
 'selftext': '',
 'user_reports': [],
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': '$3.4K gain from SPY calls, paper handed these but diamond handed some puts expiring on Monday 🤡 🤡🤡',
 'link_flair_richtext': [{'e': 'text', 't': 'Gain'}],
 'subreddit_name_prefixed': 'r/wallstreetbets',
 'hidden': False,
 'pwls': 7,
 'link_flair_css_class': 'profit',
 'downs': 0,
 'thumbnail_height': 55,
 'top_awarded_type': None,
 'parent_whitelist_status': 'some_ads',
 'hide_score': False,
 'name': 't3_n79brp',
 'quarantine': Fal

In [15]:
# NOTE: str() of the flair attribute only gives the default object repr (see
# the output below), not the flair text; the vars() dump above shows flair
# details under keys such as 'link_flair_richtext' and 'link_flair_css_class'.
str(submission.flair)

'<praw.models.reddit.submission.SubmissionFlair object at 0x00000209BACB3580>'