# Main executeable

Results saved to results/

## Imports & API connection test & init

In [1]:
import praw                             #Reddit API
import nltk                             #natural language
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import yfinance as yf                   #yahoo finance

import sys  
import os                               #to import authentification
sys.path.append(os.path.abspath('docs')) 
from auth import *                      #authentification details - create an auth.py file in code/ with authentification details

import matplotlib.pyplot as plt
import math
import datetime as dt
import pandas as pd
import numpy as np
import requests         
from bs4 import BeautifulSoup as bs     #web scraper
import prawcore

reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT)

print(reddit.read_only)


True


## Get resources
Scrape updated list of tickers in S&P500 

In [2]:
#NOTE: this also takes a while (~1min)

import re

#Subreddits to parse:
SUBREDDITS = ["wallstreetbets", "investing", "trading", "stocks", "stockmarket"]


def get_spy():
    url = 'https://www.slickcharts.com/sp500'
    request = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(request.text, "lxml")

    stats = soup.find('table',class_='table table-hover table-borderless table-sm')

    df = pd.read_html(str(stats))[0]
    #FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.

    df['% Chg'] = df['% Chg'].str.strip('()-%')

    df['% Chg'] = pd.to_numeric(df['% Chg'])

    df['Chg'] = pd.to_numeric(df['Chg'])

    #df["Company"] = df["Company"].str.strip(" ")

    return df



business_suffixes = ["Corp", "Corporation", "Inc", "LLC", "Limited", "Ltd", "Inc.", "Class A", "Class B", "Class C", ".", ","]
pattern = re.compile(rf'\s*(?:{"|".join(business_suffixes)})(?:[.,]?)\s*$', re.IGNORECASE)

def clean_company_name(name):
    #removes common business suffixes
    cleaned_name = pattern.sub('', name).strip()
    return cleaned_name





#Get SP tickers as strings
df = get_spy()
#sp_tickers = df["Symbol"]   #list of tickers in sp500 as strings
sp_tickers = ['AMD', 'NVDA', 'AMC', 'SPY', 'TSLA', 'SMCI', 'AAPL', 'PYPL', 'DTE', 'JPM', 'SBUX', 'MSTR', 'GME', 'MSFT', 'LLY', 'QQQ', 'META', 'SNAP', 'INTC', 'MCD', 'GOOGL', 'GOOG', 'TSM', 'VOO', 'NFLX', 'PLTR', 'KO', 'RTX', 'LMT', 'IBM', 'BA']

stocks = {}
for ticker in sp_tickers:
    stocks[ticker] = []
    
    stock = yf.Ticker(ticker)
    stocks[ticker].append(stock) #0 is stock obj
    stocks[ticker].append(ticker) #search value1
    
    name = clean_company_name(stock.info.get("shortName", ""))
    
    if name != "":
        stocks[ticker].append(name)


#SAMPLE:
for i in range(0, 10):
    values = stocks[sp_tickers[i]]
    print(values[1:])


    



  df = pd.read_html(str(stats))[0]


['AMD', 'Advanced Micro Devices,']
['NVDA', 'NVIDIA']
['AMC', 'AMC Entertainment Holdings,']
['SPY', 'SPDR S&P 50']
['TSLA', 'Tesla,']
['SMCI', 'Super Micro Computer,']
['AAPL', 'Apple']
['PYPL', 'PayPal Holdings,']
['DTE', 'DTE Energy Compan']
['JPM', 'JP Morgan Chase & C']


## Functions

In [3]:
def postSentiment(urlT):
    try:
        post = reddit.submission(url=urlT)
        pbody = post.selftext
        #print(post.title)
    except:
        return 0
    

    sia = SIA()
    body_sentiments = sia.polarity_scores(pbody)
    
    title_sentiment = sia.polarity_scores(post.title)

    avg_sentiment = (body_sentiments['compound']+title_sentiment['compound'])/2
    
    '''print(avg_sentiment)
    
    for key in title_sentiment.keys():
        print(key, title_sentiment[key])
    
    for key in body_sentiments.keys():
        print(key, body_sentiments[key])'''
    
    return avg_sentiment

postSentiment('https://www.reddit.com/r/wallstreetbets/comments/1cbrwwz/goog_the_guy_who_killed_yahoo_search_is_now/')

def commentSentiment(urlT):
    #given post that mentions ticker, will calculate average sentiment of comments to that post and will return median comment date
    

    comments = [] 
    bodyComment = []
    comment_dates = []

    result = [0, -1]

    #get comments from sub
    try:
        post = reddit.submission(url=urlT)
        comments = post.comments            #returns iterable CommentForest object
    except:
        return result
    
    #save each comment to array
    for comment in comments:
        try: 
            bodyComment.append(comment.body)
            comment_dates.append(comment.created_utc)
        except:
            return result
    
    #median comments
    n = len(comment_dates)
    try:
        comment_dates.sort()
        mid = n // 2
        median = comment_dates[mid]
    except:
        return result
        
    sia = SIA()
    results = []
    for line in bodyComment:
        scores = sia.polarity_scores(line)
        scores['headline'] = line

        results.append(scores)
    
    df = pd.DataFrame.from_records(results)
    df.head()
    df['label'] = 0
    
    try:
        df.loc[df['compound'] > 0.1, 'label'] = 1
        df.loc[df['compound'] < -0.1, 'label'] = -1
    except:
        return result
    
    averageScore = 0
    position = 0
    while position < len(df.label)-1:
        averageScore = averageScore + df.label[position]
        position += 1

    averageScore = averageScore/len(df.label)

    
    result[0] = (averageScore)
    result[1] = median

    return result


date_format = "%Y-%m-%d"

def format_date(unix):
    #given date in epoch format, convert to YYYY-MM-DD format
    date = dt.datetime.fromtimestamp(unix)
    return date.strftime(date_format)


def add_days(date: str, t: int) -> str:
    date_obj = dt.datetime.strptime(date, date_format).date()

    new_date = date_obj + dt.timedelta(days=t) # Add t days

    return new_date.strftime(date_format)


def check_adjust_day(date):
    date_obj = dt.datetime.strptime(date, date_format).date()

    day = date_obj.weekday()

    if day == 5: #if the date is a saturday make it a friday
        date = add_days(date, -1)

    elif day == 6: #if date is a sunday make it monday
        date = add_days(date, 1)
    
    return date


def get_xday_ret(stock, date, x: int) -> float:
    date1 = add_days(date, 1)
    datex = check_adjust_day(add_days(date, x)) #x: 1,3,5,7,10
    

    counter = 0
    while True:
        stock_t0 = stock.history(period='1d', start=date, end=date1)

        if not stock_t0.empty or counter == 10:
            break

        date = date1
        date1 = add_days(date, 1)
        counter += 1
    
    counter = 0
    date1 = add_days(datex, 1)

    if (datex == date): ##if og date got incremented too much, then push date3 by one day
        datex = date1
        date1 = add_days(datex, 1)

    while True:
        stock_t1 = stock.history(period='1d', start=datex, end=date1)

        if not stock_t1.empty or counter == 10:
            break

        datex = date1
        date1 = add_days(datex, 1)
        counter += 1

    close0 = stock_t0["Close"].iloc[0]
    close1 = stock_t1["Close"].iloc[0]

    
    return round(close1/close0 -1, 4)

def filter_date(range, date) -> bool:
    #returns true if given submission date is within the time range
    
    #Convert dates to dt objects
    start = dt.datetime.strptime(range[0], "%Y-%m-%d").date()
    end = dt.datetime.strptime(range[1], "%Y-%m-%d").date()
    date_obj = dt.datetime.strptime(date, "%Y-%m-%d").date()

    return start <= date_obj <= end



spy = yf.Ticker("SPY")

date="2024-04-05"

print(get_xday_ret(spy, date, 3))



0.0006


## Computations

### Version 3
Iterate through each ticker in SP500, go through each subreddit analyzing each post that mentions the ticker. Measure the sentiment of the text in the post itself (if it exists) and calculate an average of sentiment of the comments in response to this post.

This cell produces a set of .csv files for each ticker with the following columns of data: 

Date (post), Sentiment of post, comment sentiment average, score, upvote ratio, number of crossposts, domain, intraday return, next day return, next to 5th trading day return, next to 10th trading return, next to 20th trading day return.

In [7]:
##VERSION 3

# NOTE: Running this takes a lot of time. (1 day for 500 tickers takes ~1hr)
submission_statistics = []
search_range = ["2024-05-21", "2024-05-21"] #1day range

counter = 1
try:
    for ticker in sp_tickers:
        n = len(sp_tickers)
        progress = round(counter/n, 4) * 100
        print(f"Progress: {progress}% - Ticker: {counter} out of {n} tickers.")
        
        d = {}
        query = f"(title:{stocks[ticker][1]} OR selftext:{stocks[ticker][1]})"  # OR selftext:{stocks[ticker][2]} OR title:{stocks[ticker][2]})"

        for subname in SUBREDDITS:
            for submission in reddit.subreddit(subname).search(query, syntax='lucene', limit=200, time_filter="year"):
                #Date of submission  
                sub_date = format_date(submission.created_utc)  
            
                if not filter_date(search_range, sub_date):     #skip post if not in date range
                    continue
                
                d = {}
                d['ticker'] = ticker
                d['date'] = sub_date
                                
                #Sentiment analysis:    
                d['post_sentiment'] = postSentiment(submission.url)
                commentinfo = commentSentiment(submission.url)
                d['comment_sentiment_average'] = commentinfo[0]
                if d['comment_sentiment_average'] == 0:
                    continue
                
                #Popularity factors:
                d['num_comments'] = submission.num_comments
                d['score'] = submission.score
                d['upvote_ratio'] = submission.upvote_ratio
                d['num_crossposts'] = submission.num_crossposts
                if commentinfo[1] != -1:
                    d['delta'] = (commentinfo[1] - submission.created_utc)/86400         #difference in submission post date and median comment in days
                else:
                    continue

                #Returns
                '''t0 = check_adjust_day(sub_date) #adjust date if weekend
                d['3day'] = get_xday_ret(stocks[ticker][0], t0, 3)
                d['5day'] = get_xday_ret(stocks[ticker][0], t0, 5)
                d['7day'] = get_xday_ret(stocks[ticker][0], t0, 7)
                d['10day'] = get_xday_ret(stocks[ticker][0], t0, 10)
                d['12day'] = get_xday_ret(stocks[ticker][0], t0, 12)'''

                #extra info
                d['domain'] = submission.domain
                d['url'] = submission.url
                submission_statistics.append(d)
        
        counter += 1
except prawcore.exceptions.TooManyRequests as e:
        print("too many requests broski.....\n\n\n\n")    

finally:
    dfSentimentStocks = pd.DataFrame(submission_statistics)
    print(df)

    #dfSentimentStocks.sort_values('comment_sentiment_average', axis = 0, ascending = True, inplace = True, na_position ='last') 

    dfSentimentStocks.to_csv(f"../results/OMYA{search_range[0]}_SA.csv", index=False) 

Progress: 3.2300000000000004% - Ticker: 1 out of 31 tickers.
Progress: 6.45% - Ticker: 2 out of 31 tickers.
Progress: 9.68% - Ticker: 3 out of 31 tickers.
Progress: 12.9% - Ticker: 4 out of 31 tickers.
Progress: 16.13% - Ticker: 5 out of 31 tickers.
Progress: 19.35% - Ticker: 6 out of 31 tickers.
Progress: 22.58% - Ticker: 7 out of 31 tickers.
Progress: 25.81% - Ticker: 8 out of 31 tickers.
Progress: 29.03% - Ticker: 9 out of 31 tickers.
Progress: 32.26% - Ticker: 10 out of 31 tickers.
Progress: 35.480000000000004% - Ticker: 11 out of 31 tickers.
Progress: 38.71% - Ticker: 12 out of 31 tickers.
Progress: 41.94% - Ticker: 13 out of 31 tickers.
Progress: 45.16% - Ticker: 14 out of 31 tickers.
Progress: 48.39% - Ticker: 15 out of 31 tickers.
Progress: 51.61% - Ticker: 16 out of 31 tickers.
Progress: 54.84% - Ticker: 17 out of 31 tickers.
Progress: 58.06% - Ticker: 18 out of 31 tickers.
Progress: 61.29% - Ticker: 19 out of 31 tickers.
Progress: 64.52% - Ticker: 20 out of 31 tickers.
Progre