In [1]:
import pandas as pd
import numpy as np

### Prepare data for further processing

In [3]:
# Load the S&P 500 constituent table and keep its ticker column for later lookups
stocks_info = pd.read_csv(filepath_or_buffer='../sources/s&p500.csv')
stocks_symbol = stocks_info.loc[:, 'Symbol']

In [2]:
# Restrict the listing-status table to the S&P 500 tickers (plus the two
# dash-style share-class tickers) and keep only the symbol and IPO date.
stocks_status = pd.read_csv('../sources/listing_status.csv')
stocks_status = stocks_status.loc[
    stocks_status['symbol'].isin(np.concatenate((stocks_symbol, ['BRK-B', 'BF-B']))),
    ['symbol', 'ipoDate'],
].reset_index(drop=True)
# S&P 500 tickers with no matching row in the listing-status file
# (presumably the delisted ones -- confirm against the file).
dif1 = np.setdiff1d(stocks_symbol, stocks_status['symbol'])
stocks_status

Unnamed: 0,symbol,ipoDate
0,A,1999-11-18
1,AAL,2005-09-27
2,AAP,2001-11-29
3,AAPL,1980-12-12
4,ABBV,2013-01-02
...,...,...
495,YUM,1997-09-17
496,ZBH,2001-07-25
497,ZBRA,1991-08-15
498,ZION,1990-03-26


### Extract data from Twitter using snscrape

In [None]:
import snscrape.modules.twitter as sntwitter

# Rows of [datetime, symbol, text, retweeted tweet], collected across all tickers
tweets_list = []

# For every ticker, search @jimcramer's tweets that mention its cashtag.
# (A stray `break` used to end this loop after the first ticker, so only one
# symbol was ever scraped; every symbol is now processed.)
for s in stocks_symbol:
    query = '${} (from:jimcramer) since:1990-01-01 until:2023-12-31'.format(s)
    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        # Flatten the text onto a single line so each tweet stays one CSV record
        content = tweet.content.replace('\n', '').replace('\r', '')
        tweets_list.append([tweet.date, s, content, tweet.retweetedTweet])

# Creating a dataframe from the tweets list above
tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Symbol', 'Text', 'Retweeted'])

In [86]:
# Persist the scraped tweets so later cells can start from the CSV
tweets_df.to_csv(path_or_buf='../sources/data.csv', index=False)

In [3]:
# Reload the saved tweets from disk
df = pd.read_csv(filepath_or_buffer='../sources/data.csv')

In [4]:
# Preview the first five tweets
df.head(n=5)

Unnamed: 0,Datetime,Symbol,Text,Retweeted
0,2023-01-25 03:05:45+00:00,MMM,To me the worst quarter of the season so far i...,
1,2021-11-30 14:22:02+00:00,MMM,"$MRNA, $SQ, $F, $MMM &amp; more… all covered i...",
2,2020-09-24 08:36:51+00:00,MMM,If you look at the https://t.co/NS3syOn64o you...,
3,2020-08-27 08:46:42+00:00,MMM,thank you to Mike Roman and the great folks at...,
4,2020-01-28 17:44:08+00:00,MMM,the stock needs to see that return.. as this i...,


### Get stock historical data from yfinance

In [6]:
from datetime import datetime, timedelta, date
import numpy as np
import yfinance as yf

# get date string after n days
def addDate(date, day):
    """Return the 'YYYY-MM-DD' string that lies `day` days after `date`.

    `date` is a 'YYYY-MM-DD' string; `day` is the number of days to add
    (negative values move backwards).
    """
    start = datetime.strptime(date, '%Y-%m-%d').date()
    return (start + timedelta(days=day)).isoformat()

# calculate return from change of price
def getReturn(old_price, new_price):
    """Return the fractional change from `old_price` to `new_price`."""
    price_change = new_price - old_price
    return price_change / old_price

# get stock price given the date
def yfSearch(row, days):
    """Return the closing price of the row's stock `days` days after its tweet.

    Parameters
    ----------
    row : mapping with a 'Symbol' entry (ticker) and a 'Datetime' entry
        (string whose first 10 characters are 'YYYY-MM-DD').
    days : int
        Calendar-day offset from the tweet date.

    Returns
    -------
    The first 'Close' value yfinance returns for a window starting at the
    target date, or np.nan when the symbol has no row in `stocks_status`,
    the tweet is not later than the recorded IPO date, or the look-up
    window reaches today.
    """
    # yfinance is queried with dash-style share-class tickers; these are the
    # same two dash tickers that the stocks_status filter whitelists.
    yahoo_aliases = {'BRK.B': 'BRK-B', 'BF.B': 'BF-B'}
    symbol = yahoo_aliases.get(row['Symbol'], row['Symbol'])
    tweet_day = row['Datetime'][:10]

    ipo_dates = stocks_status.loc[stocks_status['symbol'] == symbol, 'ipoDate'].values
    # No listing record for this ticker, or a tweet dated on/before the IPO:
    # there is no price to look up (previously a missing record raised IndexError).
    if len(ipo_dates) == 0 or ipo_dates[0] >= tweet_day:
        return np.nan

    history = yf.Tickers(symbol).tickers[symbol].history
    today_str = date.today().strftime("%Y-%m-%d")
    start = addDate(tweet_day, days)
    adj = 1
    while True:
        end = addDate(tweet_day, days + adj)
        # return nan if the request date is later than today
        if end >= today_str:
            return np.nan
        try:
            return history(start=start, end=end, raise_errors=True)['Close'].values[0]
        except Exception:
            # No data in the window (e.g. a non-trading day): widen it by one day.
            # Catching Exception rather than using a bare except keeps
            # KeyboardInterrupt able to stop a long-running apply.
            adj += 1

### Return current, 30 days, 90 days, 180 days price

In [None]:
# Closing price on the tweet date (offset 0 days)
df['price'] = df.apply(yfSearch, axis=1, args=(0,))

In [None]:
# Closing price 30 days after the tweet
df['30_days_price'] = df.apply(yfSearch, axis=1, args=(30,))

In [None]:
# Closing price 90 days after the tweet
df['90_days_price'] = df.apply(yfSearch, axis=1, args=(90,))

In [None]:
# Closing price 180 days after the tweet
df['180_days_price'] = df.apply(yfSearch, axis=1, args=(180,))

In [None]:
# Write the tweets together with the price columns back to the same CSV
df.to_csv(path_or_buf='../sources/data.csv', index=False)

In [None]:
# Reload the saved frame from disk
df = pd.read_csv(filepath_or_buffer='../sources/data.csv')

In [19]:
# Display the frame with its price columns (the rendered output is truncated to the first and last rows)
df

Unnamed: 0,Datetime,Symbol,Text,Retweeted,price,30_days_price,90_days_price,180_days_price
0,2023-01-25 03:05:45+00:00,MMM,To me the worst quarter of the season so far i...,,111.454300,107.800003,,
1,2021-11-30 14:22:02+00:00,MMM,"$MRNA, $SQ, $F, $MMM &amp; more… all covered i...",,160.991837,168.187424,142.100510,144.154526
2,2020-09-24 08:36:51+00:00,MMM,If you look at the https://t.co/NS3syOn64o you...,,145.948441,151.170639,159.663681,174.247040
3,2020-08-27 08:46:42+00:00,MMM,thank you to Mike Roman and the great folks at...,,149.278290,147.076584,162.535919,162.996353
4,2020-01-28 17:44:08+00:00,MMM,the stock needs to see that return.. as this i...,,146.491455,134.052170,137.167786,147.174454
...,...,...,...,...,...,...,...,...
12228,2015-05-05 09:52:02+00:00,ZTS,"When you have $DIS, $CSCO and $EOG, $ZTS you a...",,43.352589,46.408531,46.009922,42.058086
12229,2015-05-05 07:49:28+00:00,ZTS,"Going into overdrive for $CSCO, $DIS, $ZTS, an...",,43.352577,46.408527,46.009914,42.058098
12230,2014-06-09 19:57:42+00:00,ZTS,Is $ATHN a $1000 stock? Is $ZTS the enemy of $...,,30.144522,30.568308,34.238846,41.699692
12231,2013-12-03 01:45:23+00:00,ZTS,"Must Buy, $ZTS RT @codybarbo: ""Americans spend...",,29.139360,30.329683,29.112482,29.118053
