In [231]:
#A notebook that gets daily closing prices, calculates log returns, alpha, beta, and Sharpe Ratio
#TODO: Scrape earnings reports so we have some more graphs to work with
import os
from datetime import datetime
from itertools import islice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import twint
import yfinance as yf
from sklearn.linear_model import LinearRegression
%matplotlib inline 

In [232]:
def createDataFrame(stockList, startDate, endDate):
    """Download closing prices for several tickers into one wide frame.

    Each ticker in ``stockList`` is requested from yfinance on its own, with
    ``startDate`` / ``endDate`` passed through as ``start`` / ``end``, and the
    'Close' data of the result is stored in a column named after the ticker.

    Returns the DataFrame holding one column per ticker.
    """
    closes = pd.DataFrame(columns=stockList)
    for ticker in stockList:
        history = yf.download(ticker, start=startDate, end=endDate, progress=False)
        closes[ticker] = history['Close']
    return closes

def logReturns(stockDataFrame):
    """Turn a frame of prices into log returns, modifying it in place.

    Every column is overwritten with log(value) minus log(value of the
    previous row). Rows containing any NaN are then dropped; the first row,
    which has no predecessor, is always among them.

    The frame passed in is itself changed and is also the object returned, so
    the caller's original prices are gone afterwards unless a copy was kept.
    """
    for ticker in stockDataFrame.columns:
        logPrices = np.log(stockDataFrame[ticker])
        # diff() subtracts the previous row, i.e. log(p_t) - log(p_{t-1}).
        stockDataFrame[ticker] = logPrices.diff()
    stockDataFrame.dropna(inplace=True)
    return stockDataFrame

#We use rolling alpha, beta in this case. Comparing with the SP500 for linear regression
def marketAlphaBeta(logReturnDF, benchmarkDF, lagWindow=30, benchmarkColumn='^GSPC'):
    """Rolling alpha and beta of each stock against a benchmark column.

    For every row from position ``lagWindow`` onwards, a linear regression of
    the stock's returns on the benchmark's returns is fitted over that row and
    the ``lagWindow`` rows before it (``lagWindow + 1`` observations). The
    slope is stored as beta and the intercept as alpha for that row.

    Parameters
    ----------
    logReturnDF : DataFrame
        One column of returns per stock.
    benchmarkDF : DataFrame
        Must contain ``benchmarkColumn``.
    lagWindow : int, default 30
        Number of rows preceding the current one that enter each regression.
    benchmarkColumn : str, default '^GSPC'
        Column of ``benchmarkDF`` used as the regressor.

    Returns
    -------
    (alphaDataFrame, betaDataFrame)
        Float frames with the stock columns, containing only the rows for
        which a full window was available. Both are empty when there are not
        more than ``lagWindow`` usable rows.
    """
    # Pair stock and benchmark observations by index label, not by position:
    # rows of logReturnDF whose label is missing from the benchmark are left
    # out, so a date present in only one frame cannot shift the two series
    # against each other.
    onBenchmark = logReturnDF.index.isin(benchmarkDF.index)
    returns = logReturnDF.loc[onBenchmark]
    market = (
        benchmarkDF[benchmarkColumn]
        .reindex(returns.index)
        .to_numpy(dtype=float)
        .reshape(-1, 1)
    )
    stockReturns = returns.to_numpy(dtype=float)

    obs, stockCount = stockReturns.shape
    # Results are collected in plain arrays and wrapped in frames at the end.
    # Writing through frame[stock][pos] = value is chained assignment, which
    # pandas does not guarantee to reach the frame.
    alphas = np.full((obs, stockCount), np.nan)
    betas = np.full((obs, stockCount), np.nan)
    for i in range(obs - lagWindow):
        stop = i + lagWindow + 1  # exclusive end; the result belongs to row stop - 1
        for col in range(stockCount):
            regressor = LinearRegression()
            regressor.fit(market[i:stop], stockReturns[i:stop, col])
            betas[stop - 1, col] = regressor.coef_[0]
            alphas[stop - 1, col] = regressor.intercept_

    alphaDataFrame = pd.DataFrame(alphas, index=returns.index, columns=returns.columns)
    betaDataFrame = pd.DataFrame(betas, index=returns.index, columns=returns.columns)
    # The first lagWindow rows never receive a value and are removed here.
    return alphaDataFrame.dropna(), betaDataFrame.dropna()

#We use rolling Sharpe ratio in this case. We use 10 year Treasury Note (^TNX) yield as "risk-free" rate
def rollingSharpeRatio(logReturnDF, logBenchmark, lagWindow=60, riskFreeColumn='^TNX'):
    """Rolling Sharpe-style ratio of each stock against a risk-free column.

    For every row from position ``lagWindow`` onwards, the ratio is computed
    over that row and the ``lagWindow`` rows before it (``lagWindow + 1``
    observations) as

        (mean(stock) - mean(risk-free column)) / std(stock)

    using the pandas defaults for ``mean`` and ``std``.

    Parameters
    ----------
    logReturnDF : DataFrame
        One column of returns per stock.
    logBenchmark : DataFrame
        Must contain ``riskFreeColumn``.
    lagWindow : int, default 60
        Number of rows preceding the current one that enter each window.
    riskFreeColumn : str, default '^TNX'
        Column of ``logBenchmark`` subtracted as the risk-free series.

    Returns
    -------
    DataFrame
        Float frame with the stock columns, containing only the rows for which
        a full window was available; empty when there are not more than
        ``lagWindow`` usable rows.
    """
    # NOTE(review): whatever values riskFreeColumn holds are used as-is. If the
    # caller passes log changes of a yield rather than a per-period rate, the
    # result is not a conventional Sharpe ratio -- confirm this is intended.

    # Pair observations by index label, not by position, so that a date missing
    # from either frame cannot shift the two series against each other.
    onBenchmark = logReturnDF.index.isin(logBenchmark.index)
    returns = logReturnDF.loc[onBenchmark]
    riskFree = logBenchmark[riskFreeColumn].reindex(returns.index)

    obs, stockCount = returns.shape
    # Collected in an array and wrapped at the end; frame[stock][pos] = value
    # is chained assignment, which pandas does not guarantee to reach the frame.
    ratios = np.full((obs, stockCount), np.nan)
    for i in range(obs - lagWindow):
        stop = i + lagWindow + 1  # exclusive end; the result belongs to row stop - 1
        riskFreeMean = riskFree.iloc[i:stop].mean()  # same for every stock in this window
        for col in range(stockCount):
            window = returns.iloc[i:stop, col]
            ratios[stop - 1, col] = (window.mean() - riskFreeMean) / window.std()

    sharpeDataFrame = pd.DataFrame(ratios, index=returns.index, columns=returns.columns)
    # The first lagWindow rows never receive a value and are removed here.
    return sharpeDataFrame.dropna()

In [233]:
# Inputs for the whole analysis: edit these, then re-run the cells below.

# Tickers whose prices, returns and tweets are collected.
memeStocks = [
    'AAPL', 'GOOG', 'TSLA',
    'KO', 'OXY', 'BAC',
]

# Benchmark series downloaded alongside the stocks.
benchmarks = [
    '^GSPC',
    '^TNX',
]

# Date range used for the price downloads and for the tweet search.
startDate = '2022-3-24'
endDate = '2023-3-24'

In [234]:
#Run once variables are filled out

# Stock prices are downloaded once and kept intact in newFrame. logReturns
# overwrites the frame it receives, so it is handed a copy here instead of
# downloading the same prices a second time at the end of the cell.
newFrame = createDataFrame(memeStocks, startDate, endDate)
newFrame.to_csv("stockPrices.csv")
benchmarkFrame = createDataFrame(benchmarks, startDate, endDate)
benchmarkFrame.to_csv("benchmarkPrices.csv")

logDataFrame = logReturns(newFrame.copy())
logDataFrame.to_csv("logReturnsStock.csv")
# benchmarkFrame is deliberately NOT copied: logReturns converts it in place,
# so from here on benchmarkFrame and logBenchmark are the same frame of log
# returns. The later merge cell reads benchmarkFrame and therefore puts
# benchmark returns (not prices) into the ^GSPC / ^TNX columns.
logBenchmark = logReturns(benchmarkFrame)
logBenchmark.to_csv("logReturnsBenchmark.csv")

alphaFrame, betaFrame = marketAlphaBeta(logDataFrame, logBenchmark)
alphaFrame.to_csv("alphas.csv")
betaFrame.to_csv("betas.csv")
sharpeDataFrame = rollingSharpeRatio(logDataFrame, logBenchmark)
sharpeDataFrame.to_csv("sharpeRatios.csv")

In [235]:
# Build the long-format table: one row per (Date, Ticker) carrying the stock
# value, its log return, and the benchmark columns.

# Wide table of stock columns plus benchmark columns, joined on the index.
wideFrame = pd.merge(newFrame, benchmarkFrame, left_index=True, right_index=True)
wideFrame['Date'] = wideFrame.index

# Everything that is not a stock column (benchmarks and Date) is carried along
# as an identifier while the stock columns are stacked into Ticker / Price.
idColumns = [column for column in wideFrame.columns.values if column not in memeStocks]
df = pd.melt(
    wideFrame,
    id_vars=idColumns,
    value_vars=memeStocks,
    var_name='Ticker',
    value_name='Price',
)

# Same reshaping for the log returns. Note that this adds a Date column to
# logDataFrame itself.
logDataFrame['Date'] = logDataFrame.index
log_melted = pd.melt(
    logDataFrame,
    id_vars=['Date'],
    value_vars=memeStocks,
    var_name='Ticker',
    value_name='Price',
)

# Both sides call their value column 'Price'; the suffixes tell them apart.
df = pd.merge(
    df,
    log_melted,
    on=['Ticker', 'Date'],
    how='inner',
    suffixes=['_Stock', '_Log_Return'],
)

In [236]:
import snscrape.modules.twitter as sntwitter
def scrapeForDate(ticker, dates, maxTweets=60):
    """Collect tweets mentioning ``$ticker`` for each date in ``dates``.

    For every date one search is issued with ``since:`` set to the previous
    day and ``until:`` set to the date itself, and at most ``maxTweets``
    results are kept per search.

    Parameters
    ----------
    ticker : str
        Symbol searched for as a cashtag (``$ticker``).
    dates : DatetimeIndex
        Dates to search; must support ``shift(freq=...)``.
    maxTweets : int, default 60
        Upper bound on tweets kept per date.

    Returns
    -------
    DataFrame
        Columns 'Datetime', 'Tweet Id', 'Text'. The per-date frames are stacked
        without resetting the index, so index values repeat across dates.
        Empty (with those columns) when ``dates`` is empty.
    """
    columns = ['Datetime', 'Tweet Id', 'Text']
    since = processDateRange(dates.shift(freq='-1D'))
    until = processDateRange(dates)

    dailyFrames = []
    for dayStart, dayEnd in zip(since, until):
        query_str = f'${ticker} lang:en since:{dayStart} until:{dayEnd}'
        results = sntwitter.TwitterSearchScraper(query_str).get_items()
        # islice stops after maxTweets items without requesting one more.
        rows = [[tweet.date, tweet.id, tweet.content] for tweet in islice(results, maxTweets)]
        dailyFrames.append(pd.DataFrame(rows, columns=columns))

    if not dailyFrames:
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(dailyFrames)

def processDateRange(date_range):
    """Format each timestamp of ``date_range`` as a 'YYYY-MM-DD' string.

    ``date_range`` must provide ``strftime`` (e.g. a pandas DatetimeIndex).
    Returns a plain list of strings in the same order; any time-of-day part is
    discarded.
    """
    # Formatting straight to the target layout avoids the former round trip
    # through a full timestamp string and datetime.strptime.
    return date_range.strftime('%Y-%m-%d').tolist()

In [238]:
import warnings

# Scrape tweets for every ticker; dfs ends up with one frame per entry of
# memeStocks, in the same order.
dfs = []
date_range = pd.date_range(start=startDate, end=endDate)

# Every warning raised while scraping is silenced inside this block.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for ticker in memeStocks:
        # Append directly rather than binding the result to `df`: that name
        # already holds the merged price/return table, which the final cell
        # merges the sentiment scores into.
        dfs.append(scrapeForDate(ticker, date_range))

In [None]:
# Tag each scraped frame with its ticker and save it as
# Data/Tweets_new/<TICKER>_Tweets.csv.
os.makedirs('Data/Tweets_new', exist_ok=True)  # to_csv does not create missing folders
# The loop variable is not called `df`, so the merged price/return table
# stored under that name survives this cell.
for ticker, tickerTweets in zip(memeStocks, dfs):
    tickerTweets['Ticker'] = ticker
    tickerTweets.to_csv(f'Data/Tweets_new/{ticker}_Tweets.csv')

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
# Load the saved tweet files and attach a VADER compound score to each tweet.

# List and open the SAME folder: the names were previously taken from
# Data/Tweets while the files were opened under Data/Tweets_new.
tweetDir = f'{os.getcwd()}/Data/Tweets_new'
files = [
    os.path.join(tweetDir, name)
    for name in sorted(os.listdir(tweetDir))  # sorted: listdir order is arbitrary
    if name.endswith('.csv')                  # skip anything that is not a tweet CSV
]
tweets = [pd.read_csv(file, index_col=0, engine='python') for file in files]

analyzer = SentimentIntensityAnalyzer()
for tickerTweets in tweets:
    # Rows with any missing field are removed before scoring.
    tickerTweets.dropna(inplace=True)
    tickerTweets['polarity_score'] = [
        analyzer.polarity_scores(text)['compound'] for text in tickerTweets['Text']
    ]

**Score Interpretation** <br>
positive sentiment: compound score >= 0.05 <br>
neutral sentiment: (compound score > -0.05) and (compound score < 0.05) <br>
negative sentiment: compound score <= -0.05


In [None]:
# Reduce every tweet timestamp to its calendar date and rename the column so
# it lines up with the Date column of the price table. This changes the frames
# in `tweets` in place.
for tickerTweets in tweets:
    tickerTweets['Datetime'] = pd.to_datetime(tickerTweets['Datetime']).dt.date
    tickerTweets.rename(columns={"Datetime": "Date"}, inplace=True)

# Average polarity per (Date, Ticker). The per-ticker pieces are gathered in a
# list and concatenated once instead of growing a frame inside the loop.
dailyScores = []
for tickerTweets in tweets:
    if tickerTweets.empty:
        # No tweets for this ticker: nothing to average and no Ticker value to read.
        continue
    dailyMean = tickerTweets.groupby('Date')['polarity_score'].mean().reset_index()
    # Each file holds a single ticker, so the first row's value labels them all.
    dailyMean['Ticker'] = tickerTweets['Ticker'].values[0]
    dailyScores.append(dailyMean)
tweet_df = pd.concat(dailyScores)
tweet_df['Date'] = pd.to_datetime(tweet_df['Date'])

# NOTE: df is replaced by the merged result, so re-running this cell without
# first rebuilding df would merge the scores in a second time.
df = pd.merge(df, tweet_df, on=['Date','Ticker'], how='inner')
df.to_csv('finalizedDataset.csv')

df.head(10)

Unnamed: 0,^GSPC,^TNX,Date,Ticker,Price_Stock,Price_Log_Return,polarity_score
0,0.005053,0.062507,2022-03-25,AAPL,174.720001,0.003727,0.140153
1,0.00712,-0.006037,2022-03-28,AAPL,175.600006,0.005024,0.199839
2,0.012182,-0.031579,2022-03-29,AAPL,178.960007,0.018954,0.220375
3,-0.006314,-0.017655,2022-03-30,AAPL,177.770004,-0.006672,0.183917
4,-0.015776,-0.013234,2022-03-31,AAPL,174.610001,-0.017936,0.213395
5,0.003404,0.021259,2022-04-01,AAPL,174.309998,-0.00172,0.185623
6,0.008058,0.014617,2022-04-04,AAPL,178.440002,0.023417,0.247065
7,-0.012631,0.057987,2022-04-05,AAPL,175.059998,-0.019124,0.205817
8,-0.009764,0.020523,2022-04-06,AAPL,171.830002,-0.018623,0.113833
9,0.004244,0.016347,2022-04-07,AAPL,172.139999,0.001802,0.247603
