# Main executeable

Results saved to results/

## Imports & API connection test & init

In [2]:
import praw                             #Reddit API
import nltk                             #natural language
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import yfinance as yf                   #yahoo finance
from auth import *                      #authentification details - create an auth.py file in code/ with authentification details
import matplotlib.pyplot as plt
import math
import datetime as dt
import pandas as pd
import numpy as np
import requests         
from bs4 import BeautifulSoup as bs     #web scraper

reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT)

print(reddit.read_only)

True


## Get resources
Scrape updated list of tickers in S&P500 

In [3]:
#NOTE: this also takes a while (~1min)

import re

#Subreddits to parse:
SUBREDDITS = ["wallstreetbets", "investing", "trading", "stocks", "stockmarket"]


def get_spy():
    url = 'https://www.slickcharts.com/sp500'
    request = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(request.text, "lxml")

    stats = soup.find('table',class_='table table-hover table-borderless table-sm')

    df = pd.read_html(str(stats))[0]
    #FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.

    df['% Chg'] = df['% Chg'].str.strip('()-%')

    df['% Chg'] = pd.to_numeric(df['% Chg'])

    df['Chg'] = pd.to_numeric(df['Chg'])

    #df["Company"] = df["Company"].str.strip(" ")

    return df



business_suffixes = ["Corp", "Corporation", "Inc", "LLC", "Limited", "Ltd", "Inc.", "Class A", "Class B", "Class C", ".", ","]
pattern = re.compile(rf'\s*(?:{"|".join(business_suffixes)})(?:[.,]?)\s*$', re.IGNORECASE)

def clean_company_name(name):
    #removes common business suffixes
    cleaned_name = pattern.sub('', name).strip()
    return cleaned_name





#Get SP tickers as strings
df = get_spy()
sp_tickers = df["Symbol"]   #list of tickers in sp500 as strings

stocks = {}
for ticker in sp_tickers:
    stocks[ticker] = []
    
    stock = yf.Ticker(ticker)
    stocks[ticker].append(stock) #0 is stock obj
    stocks[ticker].append(ticker) #search value1
    
    name = clean_company_name(stock.info.get("shortName", ""))
    
    if name != "":
        stocks[ticker].append(name)


#SAMPLE:
for i in range(0, 10):
    values = stocks[sp_tickers[i]]
    print(values[1:])


    



  df = pd.read_html(str(stats))[0]


['MSFT', 'Microsoft']
['AAPL', 'Apple']
['NVDA', 'NVIDIA']
['AMZN', 'Amazon.com,']
['META', 'Meta Platforms,']
['GOOGL', 'Alphabet']
['GOOG', 'Alphabet']
['BRK.B']
['LLY', 'Eli Lilly and Compan']
['AVGO', 'Broadcom']


## Functions

In [4]:
def postSentiment(urlT):
    try:
        post = reddit.submission(url=urlT)
        pbody = post.selftext
        #print(post.title)
    except:
        return 0
    

    sia = SIA()
    body_sentiments = sia.polarity_scores(pbody)
    
    title_sentiment = sia.polarity_scores(post.title)

    avg_sentiment = (body_sentiments['compound']+title_sentiment['compound'])/2
    
    '''print(avg_sentiment)
    
    for key in title_sentiment.keys():
        print(key, title_sentiment[key])
    
    for key in body_sentiments.keys():
        print(key, body_sentiments[key])'''
    
    return avg_sentiment

postSentiment('https://www.reddit.com/r/wallstreetbets/comments/1cbrwwz/goog_the_guy_who_killed_yahoo_search_is_now/')

def commentSentiment(urlT):
    #given post that mentions ticker, will calculate average sentiment of comments to that post and will return median comment date
    

    comments = [] 
    bodyComment = []
    comment_dates = []

    result = [0, -1]

    #get comments from sub
    try:
        post = reddit.submission(url=urlT)
        comments = post.comments            #returns iterable CommentForest object
    except:
        return result
    
    #save each comment to array
    for comment in comments:
        try: 
            bodyComment.append(comment.body)
            comment_dates.append(comment.created_utc)
        except:
            return result
    
    #median comments
    n = len(comment_dates)
    try:
        comment_dates.sort()
        mid = n // 2
        median = comment_dates[mid]
    except:
        return result
        
    sia = SIA()
    results = []
    for line in bodyComment:
        scores = sia.polarity_scores(line)
        scores['headline'] = line

        results.append(scores)
    
    df = pd.DataFrame.from_records(results)
    df.head()
    df['label'] = 0
    
    try:
        df.loc[df['compound'] > 0.1, 'label'] = 1
        df.loc[df['compound'] < -0.1, 'label'] = -1
    except:
        return result
    
    averageScore = 0
    position = 0
    while position < len(df.label)-1:
        averageScore = averageScore + df.label[position]
        position += 1

    averageScore = averageScore/len(df.label)

    
    result[0] = (averageScore)
    result[1] = median

    return result


date_format = "%Y-%m-%d"

def format_date(unix):
    #given date in epoch format, convert to YYYY-MM-DD format
    date = dt.datetime.fromtimestamp(unix)
    return date.strftime(date_format)


def add_days(date: str, t: int) -> str:
    date_obj = dt.datetime.strptime(date, date_format).date()

    new_date = date_obj + dt.timedelta(days=t) # Add t days

    return new_date.strftime(date_format)


def check_adjust_day(date):
    date_obj = dt.datetime.strptime(date, date_format).date()

    day = date_obj.weekday()

    if day == 5: #if the date is a saturday make it a friday
        date = add_days(date, -1)

    elif day == 6: #if date is a sunday make it monday
        date = add_days(date, 1)
    
    return date


def get_xday_ret(stock, date, x: int) -> float:
    date1 = add_days(date, 1)
    datex = check_adjust_day(add_days(date, x)) #x: 1,3,5,7,10
    

    while True:
        stock_t0 = stock.history(period='1d', start=date, end=date1)

        if not stock_t0.empty:
            break

        date = date1
        date1 = add_days(date, 1)
    
    date1 = add_days(datex, 1)

    if (datex == date): ##if og date got incremented too much, then push date3 by one day
        datex = date1
        date1 = add_days(datex, 1)

    while True:
        stock_t1 = stock.history(period='1d', start=datex, end=date1)

        if not stock_t1.empty:
            break

        datex = date1
        date1 = add_days(datex, 1)

    close0 = stock_t0["Close"].iloc[0]
    close1 = stock_t1["Close"].iloc[0]

    
    return round(close1/close0 -1, 4)

def filter_date(range, date) -> bool:
    #returns true if given submission date is within the time range
    
    #Convert dates to dt objects
    start = dt.datetime.strptime(range[0], "%Y-%m-%d").date()
    end = dt.datetime.strptime(range[1], "%Y-%m-%d").date()
    date_obj = dt.datetime.strptime(date, "%Y-%m-%d").date()

    return start <= date_obj <= end



spy = yf.Ticker("SPY")

date="2024-04-05"

print(get_xday_ret(spy, date, 3))



0.0006


## Computations

### Version 3
Iterate through each ticker in SP500, go through each subreddit analyzing each post that mentions the ticker. Measure the sentiment of the text in the post itself (if it exists) and calculate an average of sentiment of the comments in response to this post.

This cell produces a set of .csv files for each ticker with the following columns of data: 

Date (post), Sentiment of post, comment sentiment average, score, upvote ratio, number of crossposts, domain, intraday return, next day return, next to 5th trading day return, next to 10th trading return, next to 20th trading day return.

In [60]:
##VERSION 3

# NOTE: Running this takes a lot of time. (1 day for 500 tickers takes ~1hr)
submission_statistics = []
search_range = ["2023-05-05", "2023-06-02"] #1day range

counter = 1
for ticker in sp_tickers:
    progress = round(counter/503, 2) * 100
    print(f"Progress: {progress}% - Ticker: {counter}")
    
    d = {}
    query = f"(title:{stocks[ticker][1]} OR selftext:{stocks[ticker][1]})"  # OR selftext:{stocks[ticker][2]} OR title:{stocks[ticker][2]})"

    for subname in SUBREDDITS:
        for submission in reddit.subreddit(subname).search(query, syntax='lucene', limit=200, time_filter='year'):
            #Date of submission  
            sub_date = format_date(submission.created_utc)  
        
            if not filter_date(search_range, sub_date):     #skip post if not in date range
                continue
            
            d = {}
            d['ticker'] = ticker
            d['date'] = sub_date
                             
            #Sentiment analysis:    
            d['post_sentiment'] = postSentiment(submission.url)
            commentinfo = commentSentiment(submission.url)
            d['comment_sentiment_average'] = commentinfo[0]
            if d['comment_sentiment_average'] == 0:
                continue
            
            #Popularity factors:
            d['num_comments'] = submission.num_comments
            d['score'] = submission.score
            d['upvote_ratio'] = submission.upvote_ratio
            d['num_crossposts'] = submission.num_crossposts
            if commentinfo[1] != -1:
                d['delta'] = (commentinfo[1] - submission.created_utc)/86400         #difference in submission post date and median comment in days
            else:
                continue

            #Returns
            t0 = check_adjust_day(sub_date) #adjust date if weekend
            d['3day'] = get_xday_ret(stocks[ticker][0], t0, 3)
            d['5day'] = get_xday_ret(stocks[ticker][0], t0, 5)
            d['7day'] = get_xday_ret(stocks[ticker][0], t0, 7)
            d['10day'] = get_xday_ret(stocks[ticker][0], t0, 10)
            d['12day'] = get_xday_ret(stocks[ticker][0], t0, 12)

            #extra info
            d['domain'] = submission.domain
            d['url'] = submission.url
            submission_statistics.append(d)
        
dfSentimentStocks = pd.DataFrame(submission_statistics)

dfSentimentStocks.sort_values('comment_sentiment_average', axis = 0, ascending = True, inplace = True, na_position ='last') 

dfSentimentStocks.to_csv(f"../results/{search_range[0]}_SA.csv", index=False) 

0.49405
neg 0.0
neu 1.0
pos 0.0
compound 0.0
neg 0.023
neu 0.817
pos 0.16
compound 0.9881
0.34015
neg 0.0
neu 1.0
pos 0.0
compound 0.0
neg 0.03
neu 0.891
pos 0.079
compound 0.6803
-0.23259999999999997
neg 0.0
neu 0.878
pos 0.122
compound 0.2023
neg 0.067
neu 0.907
pos 0.025
compound -0.6675
-0.22685
neg 0.0
neu 1.0
pos 0.0
compound 0.0
neg 0.062
neu 0.883
pos 0.054
compound -0.4537
-0.23259999999999997
neg 0.0
neu 0.878
pos 0.122
compound 0.2023
neg 0.067
neu 0.907
pos 0.025
compound -0.6675
0.4789
neg 0.0
neu 1.0
pos 0.0
compound 0.0
neg 0.025
neu 0.868
pos 0.108
compound 0.9578
-0.3753
neg 0.0
neu 1.0
pos 0.0
compound 0.0
neg 0.084
neu 0.866
pos 0.051
compound -0.7506
-0.46155
neg 0.0
neu 1.0
pos 0.0
compound 0.0
neg 0.091
neu 0.882
pos 0.026
compound -0.9231
-0.32575
neg 0.0
neu 0.77
pos 0.23
compound 0.2716
neg 0.091
neu 0.882
pos 0.026
compound -0.9231
0.34015
neg 0.0
neu 1.0
pos 0.0
compound 0.0
neg 0.03
neu 0.891
pos 0.079
compound 0.6803
-0.23259999999999997
neg 0.0
neu 0.878
p