# Step 2B: Prepare twitter data for BERT

## 1. Import required libraries

In [1]:
import pandas as pd
from datetime import datetime as dt
import os
import re
from get_all_tickers import get_tickers as gt

*Set display options for dataframe so we can scroll and see all columns*

In [2]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## 2. Read the twitter data extracted from 1B

*Twitter data was extracted for 5 tickers and data was saved ticker wise. Read all data into a dataframe*

In [3]:
path = r"C:\\Users\\Karthik\\Desktop\\Dissertation\\Twitter\\tweets\\"
files = os.listdir(path)
display(files)

['text-query-tweets-AAPL-2021-09-30_2020-06-01.csv',
 'text-query-tweets-AMC-2021-09-30_2020-06-01.csv',
 'text-query-tweets-AMD-2021-09-30_2020-06-01.csv',
 'text-query-tweets-BABA-2021-09-30_2020-06-01.csv',
 'text-query-tweets-DKNG-2021-09-30_2020-06-01.csv',
 'text-query-tweets-TSLA-2021-09-30_2020-06-01.csv']

In [4]:
comments_df = pd.concat([pd.read_csv(f'{path}{f}') for f in files], axis=0)

*Check the list of columns in the data. We only required a few*

In [5]:
comments_df.columns

Index(['_type', 'url', 'date', 'content', 'renderedContent', 'id', 'user',
       'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
       'conversationId', 'lang', 'source', 'sourceUrl', 'sourceLabel',
       'outlinks', 'tcooutlinks', 'media', 'retweetedTweet', 'quotedTweet',
       'inReplyToTweetId', 'inReplyToUser', 'mentionedUsers', 'coordinates',
       'place', 'hashtags', 'cashtags'],
      dtype='object')

*Check that data is available from both 2020 and 2021. This is just an additional check to see that data has been extracted properl across years*

In [6]:
comments_df.head(1)

Unnamed: 0,_type,url,date,content,renderedContent,id,user,replyCount,retweetCount,likeCount,...,media,retweetedTweet,quotedTweet,inReplyToTweetId,inReplyToUser,mentionedUsers,coordinates,place,hashtags,cashtags
0,snscrape.modules.twitter.Tweet,https://twitter.com/iamfkr2/status/1443727338575958019,2021-09-30 23:59:55+00:00,"Returns over the last 10 Years:\n\nBitcoin $BTC: +994,608%\nTesla $TSLA: +15,200%\nNVIDIA $NVDA: +6,053%\nNetflix $NFLX: +2,337%\nAmazon $AMZN: +1,427%\nApple $AAPL: +1,112%\nS&amp;P 500 $SPY: +344%\nBonds $AGG: +35%\nGold $GLD: -6%\n\n@PeterSchiff literally picked the worst performing asset.","Returns over the last 10 Years:\n\nBitcoin $BTC: +994,608%\nTesla $TSLA: +15,200%\nNVIDIA $NVDA: +6,053%\nNetflix $NFLX: +2,337%\nAmazon $AMZN: +1,427%\nApple $AAPL: +1,112%\nS&amp;P 500 $SPY: +344%\nBonds $AGG: +35%\nGold $GLD: -6%\n\n@PeterSchiff literally picked the worst performing asset.",1443727338575958019,"{'_type': 'snscrape.modules.twitter.User', 'username': 'iamfkr2', 'id': 1058462404013707265, 'displayname': 'iamfkr', 'description': 'MD_At_Craftee4k|❤|Intagramer|🤙street_Food_Lover 😊photographer🔥Teacher 😇AiMiy♥Designer✌LegeNd 💯✌ Muslim l ♥Pakistani 🇵🇰 \nIG/FB : iamfkr', 'rawDescription': 'MD_At_Craftee4k|❤|Intagramer|🤙street_Food_Lover 😊photographer🔥Teacher 😇AiMiy♥Designer✌LegeNd 💯✌ Muslim l ♥Pakistani 🇵🇰 \nIG/FB : iamfkr', 'descriptionUrls': None, 'verified': False, 'created': '2018-11-02T20:54:37+00:00', 'followersCount': 8, 'friendsCount': 151, 'statusesCount': 24, 'favouritesCount': 152, 'listedCount': 1, 'mediaCount': 5, 'location': 'Karachi, Pakistan', 'protected': False, 'linkUrl': 'https://www.instagram.com/iamfkr/', 'linkTcourl': 'https://t.co/YAZZaXH1AL', 'profileImageUrl': 'https://pbs.twimg.com/profile_images/1303207258902364160/TBC4J40N_normal.jpg', 'profileBannerUrl': 'https://pbs.twimg.com/profile_banners/1058462404013707265/1541193251', 'label': None, 'url': 'https://twitter.com/iamfkr2'}",0,0,1,...,,,,,,"[{'_type': 'snscrape.modules.twitter.User', 'username': 'PeterSchiff', 'id': 56562803, 'displayname': 'Peter Schiff', 'description': None, 'rawDescription': None, 'descriptionUrls': None, 'verified': None, 'created': None, 'followersCount': None, 'friendsCount': None, 'statusesCount': None, 'favouritesCount': None, 'listedCount': None, 'mediaCount': None, 'location': None, 'protected': None, 'linkUrl': None, 'linkTcourl': None, 'profileImageUrl': None, 'profileBannerUrl': None, 'label': None, 'url': 'https://twitter.com/PeterSchiff'}]",,,,"['BTC', 'TSLA', 'NVDA', 'NFLX', 'AMZN', 'AAPL', 'SPY', 'AGG', 'GLD']"


## 3. Clean/ filter the data

### 3.1 *Pick only required columns. Filter records with english language and discard others.*

In [7]:
comments_df = comments_df[['date','content','likeCount','lang']]
comments_df = comments_df.query('lang=="en"')
comments_df.head()

Unnamed: 0,date,content,likeCount,lang
0,2021-09-30 23:59:55+00:00,"Returns over the last 10 Years:\n\nBitcoin $BTC: +994,608%\nTesla $TSLA: +15,200%\nNVIDIA $NVDA: +6,053%\nNetflix $NFLX: +2,337%\nAmazon $AMZN: +1,427%\nApple $AAPL: +1,112%\nS&amp;P 500 $SPY: +344%\nBonds $AGG: +35%\nGold $GLD: -6%\n\n@PeterSchiff literally picked the worst performing asset.",1,en
1,2021-09-30 23:56:28+00:00,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,0,en
2,2021-09-30 23:56:14+00:00,@StokesCorner @ATT told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,1,en
4,2021-09-30 23:54:13+00:00,"I need 1,000 shares of $AAPL",313,en
5,2021-09-30 23:54:01+00:00,Apple | $AAPL\n\nApple - Potential Head and Shoulders\n\nLong or short it with BTC on Trade8: https://t.co/7Y3lSIZqdd https://t.co/DJnjORexYx,0,en


### 3.2 *Clean the tweet content*
- Remove tags
- Remove urls
- Remove parsing errors
- Remove additional white spaces
- Remove newline characters

In [8]:
def text_preprocessing(text):
    text = re.sub(r'(@.*?)[\s]', ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'&amp;amp', '&', text)
    text = re.sub(r'\&amp;', '&', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [9]:
comments_df['cleaned_content'] = comments_df.content.apply(lambda x: text_preprocessing(x))
comments_df.head()

Unnamed: 0,date,content,likeCount,lang,cleaned_content
0,2021-09-30 23:59:55+00:00,"Returns over the last 10 Years:\n\nBitcoin $BTC: +994,608%\nTesla $TSLA: +15,200%\nNVIDIA $NVDA: +6,053%\nNetflix $NFLX: +2,337%\nAmazon $AMZN: +1,427%\nApple $AAPL: +1,112%\nS&amp;P 500 $SPY: +344%\nBonds $AGG: +35%\nGold $GLD: -6%\n\n@PeterSchiff literally picked the worst performing asset.",1,en,"Returns over the last 10 Years: Bitcoin $BTC: +994,608% Tesla $TSLA: +15,200% NVIDIA $NVDA: +6,053% Netflix $NFLX: +2,337% Amazon $AMZN: +1,427% Apple $AAPL: +1,112% S&P 500 $SPY: +344% Bonds $AGG: +35% Gold $GLD: -6% literally picked the worst performing asset."
1,2021-09-30 23:56:28+00:00,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,0,en,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu
2,2021-09-30 23:56:14+00:00,@StokesCorner @ATT told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,1,en,told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl
4,2021-09-30 23:54:13+00:00,"I need 1,000 shares of $AAPL",313,en,"I need 1,000 shares of $AAPL"
5,2021-09-30 23:54:01+00:00,Apple | $AAPL\n\nApple - Potential Head and Shoulders\n\nLong or short it with BTC on Trade8: https://t.co/7Y3lSIZqdd https://t.co/DJnjORexYx,0,en,Apple | $AAPL Apple - Potential Head and Shoulders Long or short it with BTC on Trade8:


*check for data issues after cleaning*

In [10]:
comments_df[comments_df['cleaned_content'].str.contains("&amp")]

Unnamed: 0,date,content,likeCount,lang,cleaned_content


### 3.3 *Populate month and day from the tweet date*

In [12]:
comments_df['year'] = comments_df['date'].apply(lambda x: dt.strptime(x[0:10],'%Y-%m-%d').strftime('%y'))
comments_df['month'] = comments_df['date'].apply(lambda x: dt.strptime(x[0:10],'%Y-%m-%d').strftime('%b'))
comments_df['day'] = comments_df['date'].apply(lambda x: dt.strptime(x[0:10],'%Y-%m-%d').strftime('%d'))

comments_df.head()

Unnamed: 0,date,content,likeCount,lang,cleaned_content,year,month,day
0,2021-09-30 23:59:55+00:00,"Returns over the last 10 Years:\n\nBitcoin $BTC: +994,608%\nTesla $TSLA: +15,200%\nNVIDIA $NVDA: +6,053%\nNetflix $NFLX: +2,337%\nAmazon $AMZN: +1,427%\nApple $AAPL: +1,112%\nS&amp;P 500 $SPY: +344%\nBonds $AGG: +35%\nGold $GLD: -6%\n\n@PeterSchiff literally picked the worst performing asset.",1,en,"Returns over the last 10 Years: Bitcoin $BTC: +994,608% Tesla $TSLA: +15,200% NVIDIA $NVDA: +6,053% Netflix $NFLX: +2,337% Amazon $AMZN: +1,427% Apple $AAPL: +1,112% S&P 500 $SPY: +344% Bonds $AGG: +35% Gold $GLD: -6% literally picked the worst performing asset.",21,Sep,30
1,2021-09-30 23:56:28+00:00,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,0,en,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,21,Sep,30
2,2021-09-30 23:56:14+00:00,@StokesCorner @ATT told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,1,en,told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,21,Sep,30
4,2021-09-30 23:54:13+00:00,"I need 1,000 shares of $AAPL",313,en,"I need 1,000 shares of $AAPL",21,Sep,30
5,2021-09-30 23:54:01+00:00,Apple | $AAPL\n\nApple - Potential Head and Shoulders\n\nLong or short it with BTC on Trade8: https://t.co/7Y3lSIZqdd https://t.co/DJnjORexYx,0,en,Apple | $AAPL Apple - Potential Head and Shoulders Long or short it with BTC on Trade8:,21,Sep,30


## 4. Get the ticker symbols in each tweet

### 4.1 *Get the list of tickers mentioned in each comment*

In [13]:
comments_df.reset_index(drop=True, inplace=True)
comments_df.head()

Unnamed: 0,date,content,likeCount,lang,cleaned_content,year,month,day
0,2021-09-30 23:59:55+00:00,"Returns over the last 10 Years:\n\nBitcoin $BTC: +994,608%\nTesla $TSLA: +15,200%\nNVIDIA $NVDA: +6,053%\nNetflix $NFLX: +2,337%\nAmazon $AMZN: +1,427%\nApple $AAPL: +1,112%\nS&amp;P 500 $SPY: +344%\nBonds $AGG: +35%\nGold $GLD: -6%\n\n@PeterSchiff literally picked the worst performing asset.",1,en,"Returns over the last 10 Years: Bitcoin $BTC: +994,608% Tesla $TSLA: +15,200% NVIDIA $NVDA: +6,053% Netflix $NFLX: +2,337% Amazon $AMZN: +1,427% Apple $AAPL: +1,112% S&P 500 $SPY: +344% Bonds $AGG: +35% Gold $GLD: -6% literally picked the worst performing asset.",21,Sep,30
1,2021-09-30 23:56:28+00:00,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,0,en,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,21,Sep,30
2,2021-09-30 23:56:14+00:00,@StokesCorner @ATT told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,1,en,told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,21,Sep,30
3,2021-09-30 23:54:13+00:00,"I need 1,000 shares of $AAPL",313,en,"I need 1,000 shares of $AAPL",21,Sep,30
4,2021-09-30 23:54:01+00:00,Apple | $AAPL\n\nApple - Potential Head and Shoulders\n\nLong or short it with BTC on Trade8: https://t.co/7Y3lSIZqdd https://t.co/DJnjORexYx,0,en,Apple | $AAPL Apple - Potential Head and Shoulders Long or short it with BTC on Trade8:,21,Sep,30


In [14]:
ticker_df = pd.read_csv("C:\\Users\\Karthik\\Desktop\\Dissertation\\Tickers\\tickers.csv")
ticker_df

ticker_list = ticker_df['Tickers'].to_list()
# print(ticker_list)

In [16]:
comments_df.shape

(1469208, 9)

In [17]:
comments_df.head()

Unnamed: 0,date,content,likeCount,lang,cleaned_content,year,month,day,ticker
0,2021-09-30 23:59:55+00:00,"Returns over the last 10 Years:\n\nBitcoin $BTC: +994,608%\nTesla $TSLA: +15,200%\nNVIDIA $NVDA: +6,053%\nNetflix $NFLX: +2,337%\nAmazon $AMZN: +1,427%\nApple $AAPL: +1,112%\nS&amp;P 500 $SPY: +344%\nBonds $AGG: +35%\nGold $GLD: -6%\n\n@PeterSchiff literally picked the worst performing asset.",1,en,"Returns over the last 10 Years: Bitcoin $BTC: +994,608% Tesla $TSLA: +15,200% NVIDIA $NVDA: +6,053% Netflix $NFLX: +2,337% Amazon $AMZN: +1,427% Apple $AAPL: +1,112% S&P 500 $SPY: +344% Bonds $AGG: +35% Gold $GLD: -6% literally picked the worst performing asset.",21,Sep,30,"[TSLA, NVDA, NFLX, AMZN, AAPL]"
1,2021-09-30 23:56:28+00:00,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,0,en,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,21,Sep,30,
2,2021-09-30 23:56:14+00:00,@StokesCorner @ATT told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,1,en,told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,21,Sep,30,
3,2021-09-30 23:54:13+00:00,"I need 1,000 shares of $AAPL",313,en,"I need 1,000 shares of $AAPL",21,Sep,30,[AAPL]
4,2021-09-30 23:54:01+00:00,Apple | $AAPL\n\nApple - Potential Head and Shoulders\n\nLong or short it with BTC on Trade8: https://t.co/7Y3lSIZqdd https://t.co/DJnjORexYx,0,en,Apple | $AAPL Apple - Potential Head and Shoulders Long or short it with BTC on Trade8:,21,Sep,30,[AAPL]


In [18]:
comments_df['ticker']=""
#display(comments_df.head())


for index, row in comments_df.iterrows():
    temp_list=[]
    for word in row['content'].split():
        #if word in ticker_list or word.replace('$', '') in ticker_list:
        if re.sub("[$,.'?!&*:;]","",word) in ticker_list:
            temp_list.append(re.sub("[$,.'?!&*:;]","",word))
        
        if len(temp_list) != 0:
            comments_df.at[index, 'ticker'] = list(dict.fromkeys(temp_list))

comments_df.head()

Unnamed: 0,date,content,likeCount,lang,cleaned_content,year,month,day,ticker
0,2021-09-30 23:59:55+00:00,"Returns over the last 10 Years:\n\nBitcoin $BTC: +994,608%\nTesla $TSLA: +15,200%\nNVIDIA $NVDA: +6,053%\nNetflix $NFLX: +2,337%\nAmazon $AMZN: +1,427%\nApple $AAPL: +1,112%\nS&amp;P 500 $SPY: +344%\nBonds $AGG: +35%\nGold $GLD: -6%\n\n@PeterSchiff literally picked the worst performing asset.",1,en,"Returns over the last 10 Years: Bitcoin $BTC: +994,608% Tesla $TSLA: +15,200% NVIDIA $NVDA: +6,053% Netflix $NFLX: +2,337% Amazon $AMZN: +1,427% Apple $AAPL: +1,112% S&P 500 $SPY: +344% Bonds $AGG: +35% Gold $GLD: -6% literally picked the worst performing asset.",21,Sep,30,"[TSLA, NVDA, NFLX, AMZN, AAPL]"
1,2021-09-30 23:56:28+00:00,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,0,en,You made or lost money trading in Sept. 21? $spy $dia $iwm $gld $uco $xly $xlp $xlu $xlv $xlb $xli $xlf $xle $xlk $jpm $gs $amd $aapl $tsla $pypl $twtr $sq $intc $wmt $pfe $jnj $ba $cat $ups $dis $nke $wba $mat $fslr $hog $ms $dhi $bby $hd $yum $ko $fb $hpq $sbux $vz $fxi $ezu,21,Sep,30,
2,2021-09-30 23:56:14+00:00,@StokesCorner @ATT told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,1,en,told my sister that they are back ordered on 13’s for next 2 months 🤷‍♂️ 😊 🚀 re $aapl,21,Sep,30,
3,2021-09-30 23:54:13+00:00,"I need 1,000 shares of $AAPL",313,en,"I need 1,000 shares of $AAPL",21,Sep,30,[AAPL]
4,2021-09-30 23:54:01+00:00,Apple | $AAPL\n\nApple - Potential Head and Shoulders\n\nLong or short it with BTC on Trade8: https://t.co/7Y3lSIZqdd https://t.co/DJnjORexYx,0,en,Apple | $AAPL Apple - Potential Head and Shoulders Long or short it with BTC on Trade8:,21,Sep,30,[AAPL]


### 4.2 *Pick only tweets which talk about a single ticker*

In [19]:
singular_comments_df_list = []

for index, row in comments_df.iterrows():
    #for i in row['ticker']:
    if len(row['ticker']) == 1:
        singular_comments_df_list.append([row['date'],row['content'],row['likeCount'],row['lang'],row['cleaned_content'],row['year'],row['month'],row['day'],row['ticker'][0]])

In [20]:
refined_df = pd.DataFrame(singular_comments_df_list, columns = ['date','content','likeCount','lang','cleaned_content','year','month','day','ticker'])
refined_df.head(100)

Unnamed: 0,date,content,likeCount,lang,cleaned_content,year,month,day,ticker
0,2021-09-30 23:54:13+00:00,"I need 1,000 shares of $AAPL",313,en,"I need 1,000 shares of $AAPL",21,Sep,30,AAPL
1,2021-09-30 23:54:01+00:00,Apple | $AAPL\n\nApple - Potential Head and Shoulders\n\nLong or short it with BTC on Trade8: https://t.co/7Y3lSIZqdd https://t.co/DJnjORexYx,0,en,Apple | $AAPL Apple - Potential Head and Shoulders Long or short it with BTC on Trade8:,21,Sep,30,AAPL
2,2021-09-30 23:53:37+00:00,"In iOS 15, Apple's Siri gets even less functional https://t.co/Q1Mlvy21cm $AAPL",0,en,"In iOS 15, Apple's Siri gets even less functional $AAPL",21,Sep,30,AAPL
3,2021-09-30 23:53:35+00:00,iPhone 13 Models Still Include EarPods in the Box in France https://t.co/wKZJwPBOja $AAPL https://t.co/jDObTshZGn,0,en,iPhone 13 Models Still Include EarPods in the Box in France $AAPL,21,Sep,30,AAPL
4,2021-09-30 23:53:33+00:00,Original Apple Watch Added to Apple's 'Vintage Products' List https://t.co/L1sCJXkOy2 $AAPL https://t.co/bxl2U2eq3K,0,en,Original Apple Watch Added to Apple's 'Vintage Products' List $AAPL,21,Sep,30,AAPL
...,...,...,...,...,...,...,...,...,...
95,2021-09-30 19:24:31+00:00,Swedish Sex-Toy Maker LELO Is Said to Explore London Listing $AAPL https://t.co/I9PbaeT4he,0,en,Swedish Sex-Toy Maker LELO Is Said to Explore London Listing $AAPL,21,Sep,30,AAPL
96,2021-09-30 19:22:50+00:00,#Americans die in back of paramedics as #immigrants block only route for first responders to get thru on SF Golden Gate Bridge.\n\n#stock #stocks #StockMarketindia #stockmarket #Ethereum #btc \n$mrma $pfe $azn $pltr $fb $f $aapl $tsla #gold #corm #commodities https://t.co/ge3LgLcb1h,0,en,#Americans die in back of paramedics as #immigrants block only route for first responders to get thru on SF Golden Gate Bridge. #stock #stocks #StockMarketindia #stockmarket #Ethereum #btc $mrma $pfe $azn $pltr $fb $f $aapl $tsla #gold #corm #commodities,21,Sep,30,SF
97,2021-09-30 19:18:02+00:00,$AAPL #APPLE @AppleNews - 15 min.: Has the basic bearish trend slowed or is it lacking in power? The short term does not yet offer any information that would allow us to anticipate a... https://t.co/qEaNwyfgom,0,en,$AAPL #APPLE - 15 min.: Has the basic bearish trend slowed or is it lacking in power? The short term does not yet offer any information that would allow us to anticipate a...,21,Sep,30,AAPL
98,2021-09-30 19:18:02+00:00,$AAPL #APPLE - 15 min.: Has the basic bearish trend slowed or is it lacking in power? The short term does not yet offer any information that would allow us to anticipate a... https://t.co/MXIZA79NaQ,0,en,$AAPL #APPLE - 15 min.: Has the basic bearish trend slowed or is it lacking in power? The short term does not yet offer any information that would allow us to anticipate a...,21,Sep,30,AAPL


### 4.3 *Compute MonDay which is year+month+day. This will help to groupby and join to price data in ensuing notebooks.*

In [21]:
refined_df['YearMonDay'] = refined_df['year'] +refined_df['month'] + refined_df['day'].astype(str)
refined_df.head(10)

Unnamed: 0,date,content,likeCount,lang,cleaned_content,year,month,day,ticker,YearMonDay
0,2021-09-30 23:54:13+00:00,"I need 1,000 shares of $AAPL",313,en,"I need 1,000 shares of $AAPL",21,Sep,30,AAPL,21Sep30
1,2021-09-30 23:54:01+00:00,Apple | $AAPL\n\nApple - Potential Head and Shoulders\n\nLong or short it with BTC on Trade8: https://t.co/7Y3lSIZqdd https://t.co/DJnjORexYx,0,en,Apple | $AAPL Apple - Potential Head and Shoulders Long or short it with BTC on Trade8:,21,Sep,30,AAPL,21Sep30
2,2021-09-30 23:53:37+00:00,"In iOS 15, Apple's Siri gets even less functional https://t.co/Q1Mlvy21cm $AAPL",0,en,"In iOS 15, Apple's Siri gets even less functional $AAPL",21,Sep,30,AAPL,21Sep30
3,2021-09-30 23:53:35+00:00,iPhone 13 Models Still Include EarPods in the Box in France https://t.co/wKZJwPBOja $AAPL https://t.co/jDObTshZGn,0,en,iPhone 13 Models Still Include EarPods in the Box in France $AAPL,21,Sep,30,AAPL,21Sep30
4,2021-09-30 23:53:33+00:00,Original Apple Watch Added to Apple's 'Vintage Products' List https://t.co/L1sCJXkOy2 $AAPL https://t.co/bxl2U2eq3K,0,en,Original Apple Watch Added to Apple's 'Vintage Products' List $AAPL,21,Sep,30,AAPL,21Sep30
5,2021-09-30 23:47:52+00:00,"Trading tips, how to play halts $BBIG $mara $riot $sos $amc $gme $wish $nuro $btc $spy $iwm $qqq $sndl $nakd $aapl $tsla $cei $rnaz $fami $sdc $doge $eth $dats $snoa $rnxt $tkat $rgc $alzn $sgoc $any $sprt $zivo $capr $palt $blin $zivo $aehr $mdp $vtvt $crvs $hlbz $fami $grom https://t.co/6WGGkl5oMm",1,en,"Trading tips, how to play halts $BBIG $mara $riot $sos $amc $gme $wish $nuro $btc $spy $iwm $qqq $sndl $nakd $aapl $tsla $cei $rnaz $fami $sdc $doge $eth $dats $snoa $rnxt $tkat $rgc $alzn $sgoc $any $sprt $zivo $capr $palt $blin $zivo $aehr $mdp $vtvt $crvs $hlbz $fami $grom",21,Sep,30,BBIG,21Sep30
6,2021-09-30 23:44:20+00:00,$AAPL intraday analysis 9/30 https://t.co/kGrILZVEL5,1,en,$AAPL intraday analysis 9/30,21,Sep,30,AAPL,21Sep30
7,2021-09-30 23:40:45+00:00,Example I’ll give is $AAPL … sure it’s a product that I think has a less elastic consumer. But the real issue stands from China cutting them off…any others that can surprise and maybe generate decent returns on the dreaded short side?,0,en,Example I’ll give is $AAPL … sure it’s a product that I think has a less elastic consumer. But the real issue stands from China cutting them off…any others that can surprise and maybe generate decent returns on the dreaded short side?,21,Sep,30,AAPL,21Sep30
8,2021-09-30 23:39:06+00:00,@DickDollar1 @WallStreetRy @thePike_1 I already own $AAPL,2,en,I already own $AAPL,21,Sep,30,AAPL,21Sep30
9,2021-09-30 23:38:00+00:00,$AAPL - Stocks Drop On September 30 – The Fun Has Merely Begun\n\n#Stocks #Stock,0,en,$AAPL - Stocks Drop On September 30 – The Fun Has Merely Begun #Stocks #Stock,21,Sep,30,AAPL,21Sep30


In [22]:
display(comments_df.shape)
display(refined_df.shape)

(1469208, 9)

(517011, 10)

## 5. Get the five tickers which we will use for our study

### 5.1 *List the tickers and the number of days where their comments are available in the extracted data*
*It can be seen that the five tickers we chose have data for most days*

In [24]:
tick_list = refined_df.groupby('ticker').YearMonDay.nunique().sort_values(ascending=False)
# tick_list.sort('mygroups', ascending=False)
tick_list.where(lambda x: x>=75).dropna()

ticker
BABA    487.0
AMC     487.0
DKNG    487.0
AAPL    487.0
TSLA    487.0
        ...  
MA       78.0
LTC      77.0
BLNK     77.0
SE       76.0
ARK      75.0
Name: YearMonDay, Length: 66, dtype: float64

### 5.2 *Choose 5 tickers which have comments on most days and save them for further analysis*

In [25]:
req_tickers = ['AMC', 'DKNG', 'TSLA', 'AMD', 'BABA']

In [26]:
for i in req_tickers:
    temp_df = refined_df.query(f'ticker=="{i}"')
    temp_df.reset_index(drop=True, inplace=True)
    temp_df.to_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Twitter\\consolidated_pickle_files\\twitter_{i}_df_for_BERT.pkl")