In [135]:
import pandas as pd           
import praw                   
import re                     
import datetime as dt
import seaborn as sns
import requests
import json
import sys
import time
## acknowledgements
'''
https://stackoverflow.com/questions/48358837/pulling-reddit-comments-using-python-praw-and-creating-a-dataframe-with-the-resu
https://www.reddit.com/r/redditdev/comments/2e2q2l/praw_downvote_count_always_zero/
https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

For navigating pushshift: https://github.com/Watchful1/Sketchpad/blob/master/postDownloader.py
'''

'\nhttps://stackoverflow.com/questions/48358837/pulling-reddit-comments-using-python-praw-and-creating-a-dataframe-with-the-resu\nhttps://www.reddit.com/r/redditdev/comments/2e2q2l/praw_downvote_count_always_zero/\nhttps://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91\n'

In [4]:
# NOTE(security): exec() runs the whole of API.env as Python code inside this
# notebook's namespace, so the file must be fully trusted and kept out of
# version control.
with open("API.env") as file:
    exec(file.read())

# Presumably API.env assigns client_id, client_secret and user_agent (they are
# not defined anywhere else in this notebook) -- TODO confirm.
reddit = praw.Reddit(
  client_id = client_id,
  client_secret = client_secret,
  user_agent = user_agent
)

Version 7.0.0 of praw is outdated. Version 7.2.0 was released Wednesday February 24, 2021.


In [255]:
'''
Some helper functions for the reddit API.
'''

def extract_num_rewards(awardings_data):
    """Add up the "count" entry of every awarding in awardings_data.

    awardings_data is an iterable of mappings that each have a "count" key;
    an empty iterable gives 0.
    """
    total = 0
    for awarding in awardings_data:
        total += awarding["count"]
    return total

def extract_data(submission, comments = True):
    """Flatten a submission (and optionally its top-level comments) into a dict.

    Parameters
    ----------
    submission
        Submission-like object. The attributes read are title, is_self,
        selftext, author, name, upvote_ratio, score, awarders, all_awardings,
        url and, when ``comments`` is true, comments.
    comments : bool
        When true, also collect the top-level comments.

    Returns
    -------
    dict
        Keys: title, self, text, comments, author, name, upvote_ratio, ups,
        downs, awarders, awards, total_awards, url. "comments" is a list of
        dicts with keys author, body, awards, score, parent_id and id (empty
        when ``comments`` is false). "downs" is None when it cannot be
        estimated because the upvote ratio is 0 or missing.
    """
    postlist = []

    # extracts top level comments
    if comments:
        # Per the PRAW docs, limit=0 strips the "load more comments"
        # placeholders instead of fetching them.
        submission.comments.replace_more(limit=0)
        postlist = [
            {
                'author': comment.author,
                'body': comment.body,
                'awards': comment.all_awardings,
                'score': comment.score,
                'parent_id': comment.parent_id,
                'id': comment.id,
            }
            for comment in submission.comments
        ]

    content = {
        "title" : submission.title,
        "self" : submission.is_self,
        "text" : submission.selftext,
        "comments" : postlist,
        "author" : submission.author,
        "name" : submission.name,
        "upvote_ratio" : submission.upvote_ratio,
        "ups" : submission.score, #this is the same as submission.ups,
        "downs" : None,
        "awarders" : submission.awarders,
        "awards" : submission.all_awardings,
        "total_awards" : None,
        "url" : submission.url # Only relevant if not a self post
    }

    content["total_awards"] = extract_num_rewards(content["awards"])

    # Estimate the downvotes from the ratio, treating "ups" as the upvote count:
    # ratio = ups / (ups + downs)  =>  downs = (1 - ratio) * ups / ratio.
    # A ratio of 0 (or a missing ratio) would divide by zero, so "downs" is
    # left as None in that case.
    ratio = content["upvote_ratio"]
    if ratio:
        content["downs"] = (1 - ratio) * content["ups"] / ratio
    return content

In [44]:
'''
Sample num_samples random submissions, and get the top num_samples submissions, and put them into dataframes.
'''

def random_sample(num_samples, subreddit):
    """Draw random submissions from a subreddit and return them as a DataFrame.

    Makes ``num_samples`` calls to ``Subreddit.random()`` and runs each result
    through ``extract_data``. PRAW documents that ``random()`` returns None for
    subreddits that do not expose a random submission; those draws are skipped,
    so the frame can hold fewer than ``num_samples`` rows (possibly none).
    """
    source = reddit.subreddit(subreddit)
    rows = []  # not called "sample", which would shadow the sample() helper
    for _ in range(num_samples):
        submission = source.random()
        if submission is None:
            # No random submission available; extract_data(None) would raise.
            continue
        rows.append(extract_data(submission))
    return pd.DataFrame(rows)

def sample(source):
    """Run extract_data over every submission in source and tabulate the results.

    Prints how many submissions were collected, then returns one DataFrame row
    per submission.
    """
    submissions = [extract_data(item) for item in source]
    print(f"Got {len(submissions)} submissions. (This can be less than num_samples.)")
    frame = pd.DataFrame(submissions)
    return frame

def top_sample(num_samples, subreddit):
    """Tabulate up to num_samples submissions from the subreddit's "top" listing."""
    listing = reddit.subreddit(subreddit).top(limit=num_samples)
    return sample(listing)

def rising_sample(num_samples, subreddit):
    """Tabulate up to num_samples submissions from the subreddit's "rising" listing."""
    listing = reddit.subreddit(subreddit).rising(limit=num_samples)
    return sample(listing)

def controversial_sample(num_samples, subreddit):
    """Tabulate up to num_samples submissions from the subreddit's "controversial" listing."""
    listing = reddit.subreddit(subreddit).controversial(limit=num_samples)
    return sample(listing)


# Sampling parameters shared by the four calls below.
num_samples = 10
subreddit ='wallstreetbets'

# One DataFrame per listing type. Each call goes to the Reddit API through
# PRAW, and the resulting frames are pickled in the next cell.
random_wsb = random_sample(num_samples, subreddit)
top_wsb = top_sample(num_samples,subreddit)
rising_wsb = rising_sample(num_samples, subreddit)
controversial_wsb = controversial_sample(num_samples, subreddit)
# Other listing methods are documented here: https://praw.readthedocs.io/en/latest/code_overview/models/subreddit.html#praw.models.Subreddit.rising
# NB: The subreddit stream option seems useful.
# NB: There is also random_rising



Got 10 submissions. (This can be less than num_samples.)
Got 10 submissions. (This can be less than num_samples.)
Got 10 submissions. (This can be less than num_samples.)


In [45]:
# Persist each sample frame next to the notebook, one pickle file per listing type.
for _frame, _pickle_path in (
    (random_wsb, "random_wsb.pkl"),
    (top_wsb, "top_wsb.pkl"),
    (rising_wsb, "rising_wsb.pkl"),
    (controversial_wsb, "controversial_wsb.pkl"),
):
    _frame.to_pickle(_pickle_path)

In [307]:
'''
Code for getting all submissions created between two Unix timestamps.
'''


def get_all_submissions(start_time, end_time, subreddit="wallstreetbets", pause=1):
    """Download the Pushshift submission records created between two Unix timestamps.

    Pages backwards from ``end_time``: each request asks for up to 1000
    submissions sorted by ``created_utc`` descending, then the ``before``
    cursor is moved to the oldest ``created_utc`` received. Paging stops when
    the cursor reaches ``start_time``, when a page comes back empty, or when
    the cursor stops moving.

    Parameters
    ----------
    start_time, end_time : int
        Window bounds as Unix timestamps (sent as ``after`` / ``before``).
    subreddit : str
        Subreddit to query; defaults to the previously hard-coded value.
    pause : float
        Seconds to sleep before each request (requests are rate limited).

    Returns
    -------
    pandas.DataFrame
        All pages concatenated with a fresh integer index; an empty frame
        when nothing was returned.

    Raises
    ------
    requests.HTTPError
        If a response has an error status (e.g. when rate limited).
    """
    end = end_time
    pages = []
    while end > start_time:
        time.sleep(pause) # Requests are rate limited
        print(f"Target time: {start_time}, current end point {end}, remaining {end - start_time}")

        url = f"https://api.pushshift.io/reddit/submission/search/?after={start_time}&before={end}&sort_type=created_utc&sort=desc&subreddit={subreddit}&limit=1000"

        # A timeout keeps a stalled connection from hanging the notebook, and an
        # explicit status check surfaces API errors instead of a confusing JSON
        # decoding failure further down.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        records = response.json()['data']
        if len(records) == 0:
            # break if there is no returned data
            break

        page = pd.DataFrame(records)
        pages.append(page)

        oldest = int(page["created_utc"].min())
        if oldest >= end:
            # The cursor did not move; stop rather than request the same page forever.
            break
        end = oldest

    if not pages:
        return pd.DataFrame()
    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the whole frame on every iteration.
    return pd.concat(pages, ignore_index=True)


# Query window: from midnight UTC on 2021-05-06 up to midnight UTC on 2021-05-07,
# expressed as integer Unix timestamps.
start = dt.datetime(2021, 5, 6)
start_time = int(start.replace(tzinfo=dt.timezone.utc).timestamp())

end = dt.datetime(2021, 5, 7) #dt.datetime.now()
end_time = int(end.replace(tzinfo=dt.timezone.utc).timestamp())

# now_timestamp was not defined anywhere in the notebook (it only existed as
# leftover kernel state), so this cell raised NameError on a fresh kernel.
now_timestamp = int(dt.datetime.now(dt.timezone.utc).timestamp())
print(f"time is currently {now_timestamp}")

df = get_all_submissions(start_time, end_time)

# Get the current score from praw
print(f"Got {len(df)} submissions.")
print("Getting the updated values.")

# Based on this: https://www.reddit.com/r/redditdev/comments/aoe4pk/praw_getting_multiple_submissions_using_by_id/
# reddit.info() wants fullnames, i.e. submission ids carrying the "t3_" prefix.
ids2 = [i if i.startswith('t3_') else f't3_{i}' for i in list(df.id)]

# Key the live values by fullname instead of relying on position: per the PRAW
# docs, info() does not yield items it cannot match, so its output can be
# shorter than df. Positional assignment would then fail, or silently attach
# scores to the wrong rows.
live_values = {}
for submission in reddit.info(fullnames=ids2): # Batched lookup, much faster than fetching submissions one by one.
    live_values[f't3_{submission.id}'] = (
        submission.score,
        extract_num_rewards(submission.all_awardings),
    )

# One entry per row of df, in df order; rows Reddit did not return get None.
scores = [live_values.get(fullname, (None, None))[0] for fullname in ids2]
total_awards_received = [live_values.get(fullname, (None, None))[1] for fullname in ids2]

df['score'], df['total_awards_received'] = scores, total_awards_received

wsb_cleaned = df[['title', 'id', 'created_utc', 'author_fullname', 'is_self', 'score', 'upvote_ratio', 'total_awards_received', 'selftext']]

time is currently 1620330846
Target time: 1620259200, current end point 1620345600, remaining 86400
Target time: 1620259200, current end point 1620341606, remaining 82406
Target time: 1620259200, current end point 1620338080, remaining 78880
Target time: 1620259200, current end point 1620334037, remaining 74837
Target time: 1620259200, current end point 1620331474, remaining 72274
Target time: 1620259200, current end point 1620328796, remaining 69596
Target time: 1620259200, current end point 1620326097, remaining 66897
Target time: 1620259200, current end point 1620323077, remaining 63877
Target time: 1620259200, current end point 1620321017, remaining 61817
Target time: 1620259200, current end point 1620318313, remaining 59113
Target time: 1620259200, current end point 1620315568, remaining 56368
Target time: 1620259200, current end point 1620312887, remaining 53687
Target time: 1620259200, current end point 1620310667, remaining 51467
Target time: 1620259200, current end point 16203

In [308]:
# Show the ten highest-scoring submissions, best first.
(
    wsb_cleaned
    .sort_values(by="score", ascending=False)
    .head(10)
)

Unnamed: 0,title,id,created_utc,author_fullname,is_self,score,upvote_ratio,total_awards_received,selftext
2493,HOLDGME,n5vldf,1620263260,t2_5syqfl4h,False,59823,1.0,199,
1719,The Insurrection of the Apes,n63q7n,1620294990,t2_a2jxbzsv,False,40149,1.0,251,
514,When will the dip dip???,n6f0w5,1620328286,t2_5ulprski,False,36801,1.0,108,
1071,We’ve been compromised!,n69ajb,1620313307,t2_11cvrlt5,False,29714,1.0,164,
467,No FOMO,n6fig8,1620329606,t2_462jbyck,False,14293,1.0,96,
2044,Got my first letter from Fidelity asking if I ...,n605gl,1620279374,t2_110ph2,False,13096,1.0,88,
725,"US Congress on today's GameStop hearing: ""We'v...",n6csms,1620322543,t2_e0s1you,False,7973,1.0,34,
1854,WSB mods sleeping... AMC &amp; GME to the mooo...,n62fi3,1620289149,t2_a0qxqff0,True,6171,1.0,45,AMC and GME let's go.
2173,BRING GME BACK!,n5z4si,1620275340,t2_4b5sdbo4,True,5823,0.5,76,"Enough with the new pump and dump every week, ..."
956,"100% concentration, all in GME! Final purchase...",n6am1n,1620316800,t2_ah5h0y1b,False,5055,1.0,28,


In [305]:



# Debug view: materialise everything reddit.info() yields for the requested fullnames.
[item for item in reddit.info(ids2)]

[Submission(id='n6lg1w'),
 Submission(id='n6lfdp'),
 Submission(id='n6lfa5'),
 Submission(id='n6lefj'),
 Submission(id='n6lecl'),
 Submission(id='n6ldw9'),
 Submission(id='n6lckw'),
 Submission(id='n6lcfq'),
 Submission(id='n6lc76'),
 Submission(id='n6lc4g'),
 Submission(id='n6lc0t'),
 Submission(id='n6lbqn'),
 Submission(id='n6lb1x'),
 Submission(id='n6lb1o'),
 Submission(id='n6lav4'),
 Submission(id='n6l9l3'),
 Submission(id='n6l84d'),
 Submission(id='n6l7lp'),
 Submission(id='n6l7ex'),
 Submission(id='n6l72x'),
 Submission(id='n6l72o'),
 Submission(id='n6l6nx'),
 Submission(id='n6l5j3'),
 Submission(id='n6l56i'),
 Submission(id='n6l492'),
 Submission(id='n6l3od'),
 Submission(id='n6l3eg'),
 Submission(id='n6l2mq'),
 Submission(id='n6l2dz'),
 Submission(id='n6l16e'),
 Submission(id='n6l134'),
 Submission(id='n6l0ph'),
 Submission(id='n6l0ln'),
 Submission(id='n6l0fd'),
 Submission(id='n6kzz3'),
 Submission(id='n6kyyl'),
 Submission(id='n6kyuy'),
 Submission(id='n6kyum'),
 Submission(

In [306]:
# Debug pass: rebuild the live score / award lists and echo every score as it arrives.
scores, total_awards_received = [], []

# reddit.info() is queried with "t3_"-prefixed ids.
ids2 = [f't3_{i}' if not i.startswith('t3_') else i for i in df.id]
for submission in reddit.info(ids2):
    current_score = submission.score
    scores.append(current_score)
    total_awards_received.append(extract_num_rewards(submission.all_awardings))
    print(current_score)

1
1
1
5
34
0
1
1
1
1
0
1
1
1
1
1
1
1
1
1
1
1
111
1
0
16
3
1
1
38
1
2
1
1
1
1
2
1
1
8
1
1
1
1
1
1
1
1
23
9
1
2
1
1
1
13
1
1
0
1
1
1
1
1
1
1
0
2
1
1
1
1
1
1
1
0
1
1
1
1
1
1
1
0
0
1
1
1
2
16
1
1
1
17
14
1
1
1
2
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
8
1
21
1
1
1
12
1
1
1
1
1
1
1
1
2
0
1
1
8
1
836
1
1
2
1
1
87
1
1
1
1
1
1
1
1
0
1
1
1
1
1
1
2
1
1
1
1
1
1
0
1
0
2
1
1
1
1
1
1
13
1
1
0
1
1
48
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
8
19
1
1
172
1
1
1
1
1
1
46
2
1
86
1
1
1
1
19
1
1
1
0
7
3
1
1
1
1
1
2
1
1
1
1
0
1
14
1
12
1
1
1
1
1
1
1
1
1
14
3
1742
1
1
17
1
1
1
211
1
1
1
0
1
102
0
1
1
1
1
1
8
1
1
1
0
2
1
2
1
2
1
1
2
1
1
0
2
1
25
1
1
4
1
26
1
0
143
1
1
1
1
1
1
1
1
46
0
1
1
1
5
0
1
1
2
1
1
1
1
1
1
79
1
1
1
1
183
1
1
4
1
1
87
1
1
1
123
12
2
1
1
1
1
1
1
1
2
8
1
1
1
3
1
1
30
1
1
1
2
1
1
22
1
1
1
1
1
5
1
1
1
1
1
1
1
4
1
2
1
1
1
2
2
43
1
1
1
1
1
1
1
0
1
12
84
2
1
1
1
1
1
1
1
49
143
24
1
0
1
1
1
1
1
1
3
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
0
1
928
1
8
1
1
1
1
1
2
1
0
1
0
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
14227
1
1

In [297]:
# Debug view: every submission id in df as a plain Python list.
df["id"].tolist()

['n6lg1w',
 'n6lfdp',
 'n6lfa5',
 'n6lefj',
 'n6lecl',
 'n6ldw9',
 'n6lckw',
 'n6lcfq',
 'n6lc76',
 'n6lc4g',
 'n6lc0t',
 'n6lbqn',
 'n6lb1x',
 'n6lb1o',
 'n6lav4',
 'n6l9l3',
 'n6l84d',
 'n6l7lp',
 'n6l7ex',
 'n6l72x',
 'n6l72o',
 'n6l6nx',
 'n6l5j3',
 'n6l56i',
 'n6l492',
 'n6l3od',
 'n6l3eg',
 'n6l2mq',
 'n6l2dz',
 'n6l16e',
 'n6l134',
 'n6l0ph',
 'n6l0ln',
 'n6l0fd',
 'n6kzz3',
 'n6kyyl',
 'n6kyuy',
 'n6kyum',
 'n6kyrg',
 'n6kyfy',
 'n6ky9r',
 'n6kxo9',
 'n6kxmf',
 'n6kx34',
 'n6kwx5',
 'n6kwae',
 'n6kvr9',
 'n6kvm1',
 'n6ktxj',
 'n6ks7d',
 'n6krq9',
 'n6kre5',
 'n6krbw',
 'n6krbk',
 'n6kqqj',
 'n6kpbv',
 'n6ko7g',
 'n6ko74',
 'n6kmqy',
 'n6kmkd',
 'n6km1y',
 'n6klfi',
 'n6kkez',
 'n6kkdn',
 'n6kk97',
 'n6kj6b',
 'n6kiu4',
 'n6khoq',
 'n6khmi',
 'n6kgwc',
 'n6kgqs',
 'n6kg0q',
 'n6kflu',
 'n6kfk5',
 'n6kfi2',
 'n6kfab',
 'n6kf45',
 'n6kepf',
 'n6keik',
 'n6kebi',
 'n6kdf7',
 'n6kd3j',
 'n6kd0u',
 'n6kcte',
 'n6kcm9',
 'n6kc9e',
 'n6kagz',
 'n6k9y8',
 'n6k9wd',
 'n6k9r9',
 'n6k8vt',

In [241]:
#sns.lmplot( data = wsb_cleaned, x = "total_awards_received", y = "score")
# Rows whose title matches the pattern "dip dip".
wsb_cleaned.loc[wsb_cleaned["title"].str.contains("dip dip")]

Unnamed: 0,title,id,created_utc,author_fullname,is_self,score,upvote_ratio,total_awards_received,selftext
514,When will the dip dip???,n6f0w5,1620328286,t2_5ulprski,False,1,1.0,0,
518,I thought the dip dipped...,n6ezzd,1620328218,t2_5ulprski,False,1,1.0,0,


In [13]:
# traffic = reddit.subreddit(subreddit).traffic() is not available to us, sadly.