# Step 2A: Prepare reddit data for BERT

## 1. Import required libraries

In [1]:
import pandas as pd
from datetime import datetime as dt
import os
import re
from get_all_tickers import get_tickers as gt

*Set display options for dataframe so we can scroll and see all columns*

In [2]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## 2. Read reddit data extracted in step 1A

*Reddit data was extracted day wise and saved so we loop and read all the saved files into a dataframe*

In [3]:
path = r"C:\\Users\\Karthik\\Desktop\\Dissertation\\Reddit\\Pickle_Files\\daily_pickle_files\\"
files = os.listdir(path)
# display(files)

In [4]:
comments_df = pd.concat([pd.read_pickle(f'{path}{f}') for f in files], axis=0)

*Check the list of columns in the data. We only required a few*

In [5]:
comments_df.columns

Index(['all_awardings', 'approved_at_utc', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'banned_at_utc', 'body', 'can_mod_post', 'collapsed',
       'collapsed_because_crowd_control', 'collapsed_reason', 'comment_type',
       'created_utc', 'distinguished', 'edited', 'gildings', 'id',
       'is_submitter', 'link_id', 'locked', 'no_follow', 'parent_id',
       'permalink', 'retrieved_on', 'score', 'send_replies', 'stickied',
       'subreddit', 'subreddit_id', 'top_awarded_type',
       'total_awards_received', 'treatment_tags', 'author_cakeday', 'top',
       'media_metadata'],
      dtype='object')

*Check that data is available from both 2020 and 2021. This is just an additional check to see that data has been extracted properl across years*

In [6]:
comments_df.created_utc.apply(lambda x: dt.fromtimestamp(x).strftime('%y')).unique()

array(['21', '20'], dtype=object)

## 3. Clean/ filter the data

### 3.1 *Compute a column to differenetiate 'top' level comments. We only consider the top level comments in our study. Focussing on the entire comment tree is out of scope.*

In [7]:
comments_df['top'] = comments_df.apply(lambda x: 'top' if x['link_id'] == x['parent_id'] else 'low', axis=1)

### 3.2 *Pick only the required columns*

In [8]:
comments_df = comments_df[['id', 'created_utc', 'body', 'top', 'score', 'author']]

In [9]:
display(comments_df.shape)
display(comments_df.head())
display(comments_df.tail())

(2431243, 6)

Unnamed: 0,id,created_utc,body,top,score,author
0,ghp4kzz,1609498834,gay,top,1,nosalute
1,ghp4mto,1609498865,Daily discussion? About whattt,top,1,mtfkgentleman
2,ghp4myq,1609498866,PLTR 100 2022,top,1,helpmeinvestx
3,ghp4q59,1609498913,Twenty twenty won.,top,1,LeftHandPuppet
4,ghp4rb9,1609498930,"So Mark, how's your sex life?",low,1,Jerbsybear


Unnamed: 0,id,created_utc,body,top,score,author
2549,gsyzwi4,1617233696,Did you get out of these for a gain in January?,low,1,Teelanoob
2550,gun7m3t,1618517434,aged like fine wine,low,2,adjacent-analyst
2551,h2949aq,1624049611,This aged well 😉,low,2,DeadDharma
2552,h294qjw,1624049830,😆 touché. I got in on a little of the action. But def could have made a ton more.,low,1,tplee
2553,h294ykh,1624049934,[removed],low,1,[deleted]


### 3.3 *Filter only top level comments and remove records where the comments were removed/deleted*

In [10]:
comments_df = comments_df.query('top=="top"')
comments_df = comments_df.query('body!="[removed]"')
comments_df = comments_df.query('body!="[deleted]"')
comments_df.shape

(948183, 6)

### 3.4 *Extract year, month and date into separate columns as this data will be used further downstream to join stock technical data*

In [11]:
comments_df['year'] = comments_df['created_utc'].apply(lambda x: dt.fromtimestamp(x).strftime('%y'))
comments_df['month'] = comments_df['created_utc'].apply(lambda x: dt.fromtimestamp(x).strftime('%b'))
comments_df['day'] = comments_df['created_utc'].apply(lambda x: dt.fromtimestamp(x).strftime('%d'))
display(comments_df.shape)
comments_df.head()

(948183, 9)

Unnamed: 0,id,created_utc,body,top,score,author,year,month,day
0,ghp4kzz,1609498834,gay,top,1,nosalute,21,Jan,1
1,ghp4mto,1609498865,Daily discussion? About whattt,top,1,mtfkgentleman,21,Jan,1
2,ghp4myq,1609498866,PLTR 100 2022,top,1,helpmeinvestx,21,Jan,1
3,ghp4q59,1609498913,Twenty twenty won.,top,1,LeftHandPuppet,21,Jan,1
5,ghp4rkd,1609498935,I think wsb went full retard,top,1,everlastingdeath,21,Jan,1


In [12]:
comments_df.year.unique()

array(['21', '20'], dtype=object)

### 3.5 *Clean the comments*
- Remove urls
- Remove parsing errors
- Remove additional white spaces
- Remove newline characters

In [13]:
def text_preprocessing(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'&amp;', '&', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\!\[gif\]\(.+\)', '', text)
    text = re.sub(r'\(\/message.*\/r\/wallstreetbets\)', '', text)
    text = re.sub(r'[\[\]]', '', text)
    return text

In [14]:
comments_df['original_body'] = comments_df['body']
comments_df['body'] = comments_df.body.apply(lambda x: text_preprocessing(x))
comments_df.head()

Unnamed: 0,id,created_utc,body,top,score,author,year,month,day,original_body
0,ghp4kzz,1609498834,gay,top,1,nosalute,21,Jan,1,gay
1,ghp4mto,1609498865,Daily discussion? About whattt,top,1,mtfkgentleman,21,Jan,1,Daily discussion? About whattt
2,ghp4myq,1609498866,PLTR 100 2022,top,1,helpmeinvestx,21,Jan,1,PLTR 100 2022
3,ghp4q59,1609498913,Twenty twenty won.,top,1,LeftHandPuppet,21,Jan,1,Twenty twenty won.
5,ghp4rkd,1609498935,I think wsb went full retard,top,1,everlastingdeath,21,Jan,1,I think wsb went full retard


In [15]:
# comments_df[comments_df['body'].str.contains("\[")]

## 4. Get the ticker symbols in each tweet

### 4.1 *Get the list of tickers mentioned in each comment*

In [None]:
comments_df.reset_index(drop=True, inplace=True)

In [17]:
ticker_df = pd.read_csv("C:\\Users\\Karthik\\Desktop\\Dissertation\\Tickers\\tickers.csv")
ticker_df

ticker_list = ticker_df['Tickers'].to_list()
# print(ticker_list)

In [18]:
comments_df['ticker']=""
#display(comments_df.head())


for index, row in comments_df.iterrows():
    temp_list=[]
    for word in row['body'].split():
        #if word in ticker_list or word.replace('$', '') in ticker_list:
        if re.sub("[$,.'?!&*:;]","",word) in ticker_list:
            temp_list.append(re.sub("[$,.'?!&*:;]","",word))
        
        if len(temp_list) != 0:
            comments_df.at[index, 'ticker'] = list(dict.fromkeys(temp_list))

comments_df.head(15)

Unnamed: 0,id,created_utc,body,top,score,author,year,month,day,original_body,ticker
0,ghp4kzz,1609498834,gay,top,1,nosalute,21,Jan,1,gay,
1,ghp4mto,1609498865,Daily discussion? About whattt,top,1,mtfkgentleman,21,Jan,1,Daily discussion? About whattt,
2,ghp4myq,1609498866,PLTR 100 2022,top,1,helpmeinvestx,21,Jan,1,PLTR 100 2022,[PLTR]
3,ghp4q59,1609498913,Twenty twenty won.,top,1,LeftHandPuppet,21,Jan,1,Twenty twenty won.,
4,ghp4rkd,1609498935,I think wsb went full retard,top,1,everlastingdeath,21,Jan,1,I think wsb went full retard,
5,ghp4x02,1609499011,I’m not even drunk anymore how am I supposed to fall asleep..,top,1,Sirjackwagon,21,Jan,1,I’m not even drunk anymore how am I supposed to fall asleep..,
6,ghp4y9c,1609499029,I miss shkreli 😞,top,1,Treecrust,21,Jan,1,I miss shkreli 😞,
7,ghp54fb,1609499126,Just called Cathie an Uber home,top,1,spliffykillah,21,Jan,1,Just called Cathie an Uber home,
8,ghp55p7,1609499147,I just posted probably the most autistic rap ever to grace this sub. Hope it helps yall fall asleep(,top,1,YoungGucci66,21,Jan,1,[I just posted probably the most autistic rap ever to grace this sub. Hope it helps yall fall asleep](https://www.reddit.com/r/wallstreetbets/comments/ko9e63/wallstreetbets_anthem_elon_raps_to_pump_tsla_to/),
9,ghp5c79,1609499249,"My current investment thesis: Buy whatever is shorted the most. Shorting works when shit is based on fundamentals. When shits decoupled, shorting just leads to cycles of highs and lows that draw in other investors and increase the price gradually. So I'm in on GME, Macys, Bed Bath & Beyond",top,1,thenomadicmonad,21,Jan,1,"My current investment thesis: \n\n\nBuy whatever is shorted the most. Shorting works when shit is based on fundamentals. When shits decoupled, shorting just leads to cycles of highs and lows that draw in other investors and increase the price gradually. \n\n\nSo I'm in on GME, Macys, Bed Bath &amp; Beyond",[GME]


### 4.2 *Pick only comments which talk about a single ticker*

In [19]:
comments_df = comments_df[comments_df['ticker'].str.len()==1]
display(comments_df.shape)
display(comments_df.head(15))

(328505, 11)

Unnamed: 0,id,created_utc,body,top,score,author,year,month,day,original_body,ticker
2,ghp4myq,1609498866,PLTR 100 2022,top,1,helpmeinvestx,21,Jan,1,PLTR 100 2022,[PLTR]
9,ghp5c79,1609499249,"My current investment thesis: Buy whatever is shorted the most. Shorting works when shit is based on fundamentals. When shits decoupled, shorting just leads to cycles of highs and lows that draw in other investors and increase the price gradually. So I'm in on GME, Macys, Bed Bath & Beyond",top,1,thenomadicmonad,21,Jan,1,"My current investment thesis: \n\n\nBuy whatever is shorted the most. Shorting works when shit is based on fundamentals. When shits decoupled, shorting just leads to cycles of highs and lows that draw in other investors and increase the price gradually. \n\n\nSo I'm in on GME, Macys, Bed Bath &amp; Beyond",[GME]
10,ghp5d43,1609499263,Next two weeks are going to be super interesting for PSTH. Hope it’s Chic-Fil-A I want some chimkins 🚀🐓🚀,top,1,Magabeef,21,Jan,1,Next two weeks are going to be super interesting for PSTH. Hope it’s Chic-Fil-A I want some chimkins 🚀🐓🚀,[PSTH]
22,ghp69t1,1609499753,Fuck I hate my family. Goddamn ignorant dipshits. Telling a fucking contractor/engineer/teacher I don’t know anything. Add that to my hip aching in an odd way I’ve never experienced before and 2021 is already off to a specfuckingtacular start. Calls on AAPL,top,1,Battle-Santa,21,Jan,1,Fuck I hate my family. \n\nGoddamn ignorant dipshits. Telling a fucking contractor/engineer/teacher I don’t know anything.\n\nAdd that to my hip aching in an odd way I’ve never experienced before and 2021 is already off to a specfuckingtacular start.\n\nCalls on AAPL,[AAPL]
25,ghp6muz,1609499943,Where can I buy GME 100c?,top,1,Seicho,21,Jan,1,Where can I buy GME 100c?,[GME]
30,ghp72zs,1609500199,Non-troll post. I started 2 weeks ago and have lost $1.5k. Not that bothered as I'm still learning but obviously losses annoy you. Been FOMOing TSLA. 2 hours before close I saw it going up and down so thought I'd put a 12/31 695 put on it (I couldn't afford any of the calls). I think I immediately gained $200 after 4 minutes and sold. Did the same thing again 3 time and had pretty much made by all of my loses. Finally wanted to go big and went for a 685p with 1 hour left. This fucked me up and so did theta. Had a chance to breakeven with 30 minutes left but I didn't so ended up only making back $100 that day. My question is - with that kind of strategy (and obviously not being an idiot and going in 45mins before close) it seems good gains are possible. Same day expiring contracts are more affordable and I just have to set an automatic sell to take 20% profit as soon as that target hits. Is this something that I can repeat again without being stupid? Is TSLA the only thing this really works for because it's volatile? And I'm right in thinking this should only be done on Fridays because 0 day contracts are cheaper. Happy New Year,top,1,milezy,21,Jan,1,Non-troll post. I started 2 weeks ago and have lost $1.5k. Not that bothered as I'm still learning but obviously losses annoy you. \nBeen FOMOing TSLA. 2 hours before close I saw it going up and down so thought I'd put a 12/31 695 put on it (I couldn't afford any of the calls). I think I immediately gained $200 after 4 minutes and sold. Did the same thing again 3 time and had pretty much made by all of my loses. \nFinally wanted to go big and went for a 685p with 1 hour left. This fucked me up and so did theta. Had a chance to breakeven with 30 minutes left but I didn't so ended up only making back $100 that day. \nMy question is - with that kind of strategy (and obviously not being an idiot and going in 45mins before close) it seems good gains are possible. Same day expiring contracts are more affordable and I just have to set an automatic sell to take 20% profit as soon as that target hits. \nIs this something that I can repeat again without being stupid? Is TSLA the only thing this really works for because it's volatile? And I'm right in thinking this should only be done on Fridays because 0 day contracts are cheaper. \nHappy New Year,[TSLA]
34,ghp7g8l,1609500388,Anyone think BABA goes up from here?,top,1,ronstoppable7,21,Jan,1,Anyone think BABA goes up from here?,[BABA]
41,ghpp4zc,1609509996,Who out here with 1500 shares of GME with the average cost being 7.97?,top,1,dirtyshits,21,Jan,1,Who out here with 1500 shares of GME with the average cost being 7.97?,[GME]
45,ghppdr8,1609510108,Anyone know how PLTR is doing in premarket? I can't get it to update. I have 10k in weeklies that went in yesterday and I need it to 🚀 today,top,1,JackLocke366,21,Jan,1,Anyone know how PLTR is doing in premarket? I can't get it to update. I have 10k in weeklies that went in yesterday and I need it to 🚀 today,[PLTR]
46,ghppi7u,1609510168,"Monthly report: my only profitable trade was a GME short, everything else is red. Is this karma?",top,1,Significant_Ad_8532,21,Jan,1,"Monthly report: my only profitable trade was a GME short, everything else is red. Is this karma?",[GME]


*Convert the ticker data which is in list form to string form*

In [21]:
singular_comments_df_list = []

for index, row in comments_df.iterrows():
    for i in row['ticker']:
        singular_comments_df_list.append([row['body'],row['created_utc'],row['id'],row['top'],row['year'],row['month'],row['day'],i])

In [22]:
refined_df = pd.DataFrame(singular_comments_df_list, columns = ['body','created_utc','id','top','year','month','day','ticker'])
refined_df.head(10)

Unnamed: 0,body,created_utc,id,top,year,month,day,ticker
0,PLTR 100 2022,1609498866,ghp4myq,top,21,Jan,1,PLTR
1,"My current investment thesis: Buy whatever is shorted the most. Shorting works when shit is based on fundamentals. When shits decoupled, shorting just leads to cycles of highs and lows that draw in other investors and increase the price gradually. So I'm in on GME, Macys, Bed Bath & Beyond",1609499249,ghp5c79,top,21,Jan,1,GME
2,Next two weeks are going to be super interesting for PSTH. Hope it’s Chic-Fil-A I want some chimkins 🚀🐓🚀,1609499263,ghp5d43,top,21,Jan,1,PSTH
3,Fuck I hate my family. Goddamn ignorant dipshits. Telling a fucking contractor/engineer/teacher I don’t know anything. Add that to my hip aching in an odd way I’ve never experienced before and 2021 is already off to a specfuckingtacular start. Calls on AAPL,1609499753,ghp69t1,top,21,Jan,1,AAPL
4,Where can I buy GME 100c?,1609499943,ghp6muz,top,21,Jan,1,GME
5,Non-troll post. I started 2 weeks ago and have lost $1.5k. Not that bothered as I'm still learning but obviously losses annoy you. Been FOMOing TSLA. 2 hours before close I saw it going up and down so thought I'd put a 12/31 695 put on it (I couldn't afford any of the calls). I think I immediately gained $200 after 4 minutes and sold. Did the same thing again 3 time and had pretty much made by all of my loses. Finally wanted to go big and went for a 685p with 1 hour left. This fucked me up and so did theta. Had a chance to breakeven with 30 minutes left but I didn't so ended up only making back $100 that day. My question is - with that kind of strategy (and obviously not being an idiot and going in 45mins before close) it seems good gains are possible. Same day expiring contracts are more affordable and I just have to set an automatic sell to take 20% profit as soon as that target hits. Is this something that I can repeat again without being stupid? Is TSLA the only thing this really works for because it's volatile? And I'm right in thinking this should only be done on Fridays because 0 day contracts are cheaper. Happy New Year,1609500199,ghp72zs,top,21,Jan,1,TSLA
6,Anyone think BABA goes up from here?,1609500388,ghp7g8l,top,21,Jan,1,BABA
7,Who out here with 1500 shares of GME with the average cost being 7.97?,1609509996,ghpp4zc,top,21,Jan,1,GME
8,Anyone know how PLTR is doing in premarket? I can't get it to update. I have 10k in weeklies that went in yesterday and I need it to 🚀 today,1609510108,ghppdr8,top,21,Jan,1,PLTR
9,"Monthly report: my only profitable trade was a GME short, everything else is red. Is this karma?",1609510168,ghppi7u,top,21,Jan,1,GME


### 4.3 *Compute YearMonDay which is year+month+day. This will help to groupby and join to price data downstream*

In [23]:
refined_df['YearMonDay'] = refined_df['year']  + refined_df['month'] + refined_df['day'].astype(str)
refined_df.head(10)

Unnamed: 0,body,created_utc,id,top,year,month,day,ticker,YearMonDay
0,PLTR 100 2022,1609498866,ghp4myq,top,21,Jan,1,PLTR,21Jan01
1,"My current investment thesis: Buy whatever is shorted the most. Shorting works when shit is based on fundamentals. When shits decoupled, shorting just leads to cycles of highs and lows that draw in other investors and increase the price gradually. So I'm in on GME, Macys, Bed Bath & Beyond",1609499249,ghp5c79,top,21,Jan,1,GME,21Jan01
2,Next two weeks are going to be super interesting for PSTH. Hope it’s Chic-Fil-A I want some chimkins 🚀🐓🚀,1609499263,ghp5d43,top,21,Jan,1,PSTH,21Jan01
3,Fuck I hate my family. Goddamn ignorant dipshits. Telling a fucking contractor/engineer/teacher I don’t know anything. Add that to my hip aching in an odd way I’ve never experienced before and 2021 is already off to a specfuckingtacular start. Calls on AAPL,1609499753,ghp69t1,top,21,Jan,1,AAPL,21Jan01
4,Where can I buy GME 100c?,1609499943,ghp6muz,top,21,Jan,1,GME,21Jan01
5,Non-troll post. I started 2 weeks ago and have lost $1.5k. Not that bothered as I'm still learning but obviously losses annoy you. Been FOMOing TSLA. 2 hours before close I saw it going up and down so thought I'd put a 12/31 695 put on it (I couldn't afford any of the calls). I think I immediately gained $200 after 4 minutes and sold. Did the same thing again 3 time and had pretty much made by all of my loses. Finally wanted to go big and went for a 685p with 1 hour left. This fucked me up and so did theta. Had a chance to breakeven with 30 minutes left but I didn't so ended up only making back $100 that day. My question is - with that kind of strategy (and obviously not being an idiot and going in 45mins before close) it seems good gains are possible. Same day expiring contracts are more affordable and I just have to set an automatic sell to take 20% profit as soon as that target hits. Is this something that I can repeat again without being stupid? Is TSLA the only thing this really works for because it's volatile? And I'm right in thinking this should only be done on Fridays because 0 day contracts are cheaper. Happy New Year,1609500199,ghp72zs,top,21,Jan,1,TSLA,21Jan01
6,Anyone think BABA goes up from here?,1609500388,ghp7g8l,top,21,Jan,1,BABA,21Jan01
7,Who out here with 1500 shares of GME with the average cost being 7.97?,1609509996,ghpp4zc,top,21,Jan,1,GME,21Jan01
8,Anyone know how PLTR is doing in premarket? I can't get it to update. I have 10k in weeklies that went in yesterday and I need it to 🚀 today,1609510108,ghppdr8,top,21,Jan,1,PLTR,21Jan01
9,"Monthly report: my only profitable trade was a GME short, everything else is red. Is this karma?",1609510168,ghppi7u,top,21,Jan,1,GME,21Jan01


In [24]:
display(comments_df.shape)
display(refined_df.shape)

(328505, 11)

(328505, 9)

*Check the number of individual days for which we have data*

In [25]:
refined_df.YearMonDay.nunique()

303

## 5. Analyse the data to see which tickers are favourable for analysis

### 5.1 *List the tickers and the number of days where their comments are available in the extracted data*

In [26]:
tick_list = refined_df.groupby('ticker').YearMonDay.nunique().sort_values(ascending=False)
# tick_list.sort('mygroups', ascending=False)

In [27]:
tick_list.where(lambda x: x>=150).dropna()

ticker
TSLA    261.0
AAPL    258.0
AMD     256.0
RH      247.0
AMZN    247.0
DKNG    236.0
BABA    235.0
NIO     232.0
SPCE    229.0
MSFT    228.0
ATH     226.0
DD      226.0
BA      221.0
IS      221.0
SQ      220.0
FB      213.0
NVDA    212.0
GME     211.0
RKT     202.0
GO      198.0
YOU     198.0
UP      192.0
IT      192.0
ON      191.0
SP      186.0
PLTR    186.0
PTON    184.0
DIS     184.0
NKLA    183.0
AMC     180.0
AAL     178.0
PLUG    178.0
CRSR    173.0
CEO     172.0
CCL     171.0
PM      170.0
LMAO    170.0
SNAP    166.0
NET     165.0
ME      162.0
ZM      160.0
GE      158.0
AM      158.0
ARE     158.0
ALL     156.0
BB      154.0
WMT     153.0
NOK     153.0
NOW     151.0
Name: YearMonDay, dtype: float64

### 5.2 *Choose 5 tickers which have comments on most days and save them for further analysis*

In [28]:
req_tickers = ['AAPL', 'AMC', 'DKNG', 'TSLA', 'AMD', 'BABA']

In [30]:
for i in req_tickers:
    temp_df = refined_df.query(f'ticker=="{i}"')
    temp_df.reset_index(drop=True, inplace=True)
    temp_df.to_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Reddit\\consolidated_pickle_files\\reddit_{i}_df_for_BERT.pkl")