-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_tickers_from_reddit.py
97 lines (82 loc) · 4.01 KB
/
parse_tickers_from_reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# This script connects to reddit's API and retrieves a count of stock $ticker mentions to gauge rising popularity.
import praw
import json
import time
import datetime
from datetime import timezone
import re
import boto3
import os
import pytz
import sys
def find_occurrences_of_stock_ticker(arg_ticker, arg_text_to_search):
    """Count whole-word, case-insensitive mentions of a stock ticker.

    Both the bare form (``GME``) and the dollar-prefixed form (``$GME``) are
    counted: ``$`` is not a word character, so a ticker that directly follows
    it still starts on a word boundary and is found by a single search.

    Args:
        arg_ticker: Ticker symbol to look for. It is treated as literal text,
            so symbols containing regex metacharacters (e.g. ``BRK.B``) match
            only themselves.
        arg_text_to_search: Text to scan.

    Returns:
        The number of non-overlapping mentions. An empty ticker yields 0.
    """
    # An empty ticker would otherwise "match" at every word boundary.
    if not arg_ticker:
        return 0
    # Lookarounds act like \b for alphanumeric tickers, but stay correct when
    # the symbol itself begins or ends with a non-word character.
    pattern = r"(?<!\w){}(?!\w)".format(re.escape(arg_ticker))
    return sum(1 for _ in re.finditer(pattern, arg_text_to_search, re.IGNORECASE))
# Build the Reddit API client. All three credentials are read from the
# environment; a missing variable raises KeyError before any client is created.
reddit_client_id = os.environ['REDDIT_API_CLIENT_ID']
reddit_client_secret = os.environ['REDDIT_API_CLIENT_SECRET']
reddit_user_agent = os.environ['REDDIT_API_USER_AGENT']
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    user_agent=reddit_user_agent,
)
# Accumulators filled in by the steps below
posts_in_last_day = []
text_blob = ''

# Retrieve subreddit name from terminal argument
if len(sys.argv) < 2:
    sys.exit("Usage: parse_tickers_from_reddit.py <subreddit_name>")
subreddit_name = str(sys.argv[1])

# Collect every post created within the last 24 hours (the listing is capped
# at 900 posts). Ages are measured against `created_utc`, the UTC epoch
# timestamp, so no manual timezone offset is needed.
seconds_per_day = 24 * 60 * 60
# Sample "now" once so every post is compared against the same cutoff.
current_epoch_time = time.time()
for post in reddit.subreddit(subreddit_name).new(limit=900):
    if current_epoch_time - post.created_utc < seconds_per_day:
        posts_in_last_day.append(post)
# Define metrics for posts and comments in the last 24 hours
post_count_in_last_day = len(posts_in_last_day)
comments_in_last_day = 0

# Gather every post title and comment body from the acquired posts.
# Fragments are collected in a list and joined with newlines: gluing them
# together directly would fuse the last word of one fragment with the first
# word of the next, hiding tickers from whole-word matching.
text_fragments = [text_blob] if text_blob else []
for post in posts_in_last_day:
    text_fragments.append(post.title)
    # Expand at most one "load more comments" placeholder per post.
    post.comments.replace_more(limit=1)
    for comment in post.comments.list():
        if comment.body:
            comments_in_last_day += 1
            text_fragments.append(comment.body)
text_blob = "\n".join(text_fragments)
# The text_blob is an amalgamation of all posts and comments from the last 24 hours.
# Count the mentions of every curated ticker in it, keeping only tickers that
# were mentioned at least once.
dictionary = {}
with open("curated_stock_tickers.txt") as f:
    for line in f:
        # Strip all surrounding whitespace (including '\r' from CRLF files),
        # otherwise the stray characters become part of the search pattern.
        ticker = line.strip()
        if not ticker:
            # Blank lines are not tickers.
            continue
        print("Currently counting: " + str(ticker))
        occurrences = find_occurrences_of_stock_ticker(ticker, text_blob)
        if occurrences > 0:
            dictionary[ticker] = occurrences
# Get the current time (UTC) and the format used to display it in US Eastern time
current_time = datetime.datetime.now(timezone.utc)
est = pytz.timezone('US/Eastern')
date_format = "%d %B %I:%M %p"

# Write out the data in .json format for consumption by the frontend.
# "data" is a list of (ticker, count) pairs, most-mentioned first.
json_data = {
    "posts": post_count_in_last_day,
    "comments": comments_in_last_day,
    "time": current_time.astimezone(est).strftime(date_format),
    "data": sorted(dictionary.items(), key=lambda x: x[1], reverse=True),
}
# The context manager closes the file even if serialization or the write fails.
with open(subreddit_name + '_most_mentioned_stocks.json', 'w') as fp:
    fp.write(json.dumps(json_data))
# Open connection to AWS S3 using the key pair taken from the environment.
aws_credentials = {
    'aws_access_key_id': os.environ['S3_KEY'],
    'aws_secret_access_key': os.environ['S3_SECRET'],
}
# NOTE(review): the resource handle `s3` is not referenced in the visible
# part of this script; only `s3_client` is used for the upload.
s3 = boto3.resource('s3', **aws_credentials)
s3_client = boto3.client('s3', **aws_credentials)

# Upload the .json file to S3. Making it public so anyone can use it.
local_json_path = subreddit_name + '_most_mentioned_stocks.json'
remote_object_key = subreddit_name + 'PopIndex.json'
upload_options = {'ContentType': "application/json", 'ACL': 'public-read'}
s3_client.upload_file(local_json_path, 'wsb-pop-index', remote_object_key,
                      ExtraArgs=upload_options)