Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
RedditApiStockParser/parse_tickers_from_reddit.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
97 lines (82 sloc)
4.01 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script connects to Reddit's API and counts mentions of stock tickers (with or without a leading $) to gauge rising popularity.
import praw | |
import json | |
import time | |
import datetime | |
from datetime import timezone | |
import re | |
import boto3 | |
import os | |
import pytz | |
import sys | |
def find_occurrences_of_stock_ticker(arg_ticker, arg_text_to_search):
    """Count case-insensitive mentions of a stock ticker in a body of text.

    A mention is the ticker standing alone as a whole token, optionally
    written with a leading dollar sign ("GME" and "$GME" both count). Each
    mention is counted exactly once.

    Args:
        arg_ticker: Ticker symbol to look for, e.g. "GME" or "BRK.B".
        arg_text_to_search: Text to scan.

    Returns:
        The number of mentions found; 0 when the ticker is empty.
    """
    # An empty ticker would otherwise match at every token boundary.
    if not arg_ticker:
        return 0
    # re.escape keeps regex metacharacters in the ticker literal (the "." in
    # "BRK.B"), and the optional "\$" is a literal dollar sign rather than the
    # end-of-string anchor. Lookarounds are used instead of \b so the
    # whole-token check also works next to "$" or punctuation in the ticker.
    pattern = r"(?<!\w)\$?{}(?!\w)".format(re.escape(arg_ticker))
    return sum(1 for _ in re.finditer(pattern, arg_text_to_search, re.IGNORECASE))
# Build the Reddit API client. All three credentials are read from environment
# variables, so a missing variable raises KeyError before the client is created.
reddit_credentials = dict(
    client_id=os.environ['REDDIT_API_CLIENT_ID'],
    client_secret=os.environ['REDDIT_API_CLIENT_SECRET'],
    user_agent=os.environ['REDDIT_API_USER_AGENT'],
)
reddit = praw.Reddit(**reddit_credentials)
# Collected state: submissions from the last 24 hours and the text gathered from them
posts_in_last_day = []
text_blob = ''
# The subreddit to scan is the first command-line argument; exit with a usage
# message instead of an IndexError traceback when it is missing
if len(sys.argv) < 2:
    sys.exit("Usage: {} <subreddit_name>".format(sys.argv[0]))
subreddit_name = str(sys.argv[1])
# created_utc is a UTC epoch timestamp, so it compares directly against
# time.time() with no hand-applied timezone offset. The cutoff is computed once
# so every submission is judged against the same instant.
SECONDS_PER_DAY = 24 * 60 * 60
oldest_allowed_epoch_time = time.time() - SECONDS_PER_DAY
# Request up to 900 of the newest submissions and keep those from the last 24 hours
# NOTE(review): the original author states no 24-hour period has reached 900 posts - confirm for busy subreddits
for post in reddit.subreddit(subreddit_name).new(limit=900):
    if post.created_utc > oldest_allowed_epoch_time:
        posts_in_last_day.append(post)
# Metrics for posts and comments in the last 24 hours
post_count_in_last_day = len(posts_in_last_day)
comments_in_last_day = 0
# Gather every post title and comment body. The pieces are joined with newlines
# so the last word of one text and the first word of the next cannot fuse into a
# single token, which would hide tickers from the whole-word search.
text_pieces = []
for post in posts_in_last_day:
    text_pieces.append(post.title)
    # Resolve at most one "load more comments" placeholder per post
    post.comments.replace_more(limit=1)
    for comment in post.comments.list():
        # Comments with an empty body are neither counted nor searched
        if comment.body:
            comments_in_last_day += 1
            text_pieces.append(comment.body)
text_blob = "\n".join(text_pieces)
# text_blob holds all post titles and comment bodies from the last 24 hours.
# Count the mentions of every curated ticker in it, keeping only tickers that
# were mentioned at least once.
dictionary = {}
with open("curated_stock_tickers.txt") as f:
    for line in f:
        # Strip all surrounding whitespace (including the "\r" left by Windows
        # line endings) and skip blank lines, which are not tickers
        line = line.strip()
        if not line:
            continue
        print("Currently counting: " + str(line))
        occurrences = find_occurrences_of_stock_ticker(line, text_blob)
        if occurrences > 0:
            dictionary[line] = occurrences
# Timestamp the run in US Eastern time, formatted like "05 March 02:30 PM"
current_time = datetime.datetime.now(timezone.utc)
est = pytz.timezone('US/Eastern')
date_format = "%d %B %I:%M %p"
# Payload for consumption by the frontend. "data" is the list of
# (ticker, count) pairs sorted by count, most mentioned first.
json_data = {"posts": post_count_in_last_day, "comments": comments_in_last_day,
             "time": current_time.astimezone(est).strftime(date_format),
             "data": (sorted(dictionary.items(), key=lambda x: x[1], reverse=True))}
# The context manager guarantees the file is flushed and closed even if
# serialization or the write raises
with open(subreddit_name + '_most_mentioned_stocks.json', 'w') as fp:
    fp.write(json.dumps(json_data))
# AWS credentials come from the environment and are shared by both S3 handles
aws_credentials = dict(
    aws_access_key_id=os.environ['S3_KEY'],
    aws_secret_access_key=os.environ['S3_SECRET'],
)
s3 = boto3.resource('s3', **aws_credentials)
s3_client = boto3.client('s3', **aws_credentials)
# Upload the generated .json file to the bucket with a public-read ACL so
# anyone can use it
local_json_path = subreddit_name + '_most_mentioned_stocks.json'
bucket_object_key = subreddit_name + 'PopIndex.json'
upload_options = {'ContentType': "application/json", 'ACL': 'public-read'}
s3_client.upload_file(local_json_path, 'wsb-pop-index', bucket_object_key,
                      ExtraArgs=upload_options)