### Merge tweets and stock price dataframes
Load all cleaned tweets (which are saved in separate files) and all stock prices, then combine them into a single dataframe that can be grouped by date.

In [191]:
import os
import pandas as pd
# reset the display.max_colwidth option when running all cells
# (a later cell changes it, so start again from the pandas default)
pd.reset_option('display.max_colwidth')
pd.__version__

'0.25.3'

In [192]:
# read in all the tweets: every file found below `directory` is parsed as JSON
# and the resulting dataframes are stacked into a single one
tweets_dfs = []
directory = '../data/processed/final_strict/'
for subdir, dirs, files in os.walk(directory):
    for file in files:
        # os.walk also descends into subfolders; join with the folder currently
        # being visited so nested files resolve to their real path
        tweets_dfs.append(pd.read_json(os.path.join(subdir, file)))

tweets = pd.concat(tweets_dfs)

In [193]:
# show the concatenated tweets (the index still comes from the individual files)
tweets

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets
105,2018-01-01 23:33:24,[],Notts Gaussfest @Papplewick_PS Sat 17th March ...,Extreme Electronics,4,0,5
155,2018-01-01 23:18:30,[],*hits blunt* if you're drunk and put your tesl...,the finesse kid,3,0,4
168,2018-01-01 23:15:01,[],"“It is paradoxical, yet true, to say, that the...",Á̵̢̙͉̫ 𝐫 𝐝 𝐢 𝐧 ≤≥ 🌐,18,1,3
186,2018-01-01 23:06:29,[],"Hey Malcolm, the Tesla batteries are working b...",Chris Cullen,108,5,63
231,2018-01-01 22:55:53,[],"Since it keeps coming up, this is the main pro...",Benedict Evans,92,13,19
...,...,...,...,...,...,...,...
699014,2018-04-02 00:21:17,[],Journalists and @Tesla bears are gleefully boo...,Tim Culpan,28,0,12
699091,2018-04-02 00:13:49,"[AprilFoolsDay, flamethrowers]",.@elonmusk turns @Tesla's troubles into an #Ap...,Alan Boyle,13,3,3
699103,2018-04-02 00:12:06,[],Just got word from a real estate developer who...,David Tayar,20,6,7
699144,2018-04-02 00:08:39,[],"La muerte de Walter Huang, la primera de un co...",Última Hora,34,5,35


In [194]:
# uncomment lines below to generate a dataframe report
# CAUTION: TAKES ABOUT AN HOUR
# import pandas_profiling as profile
# profile = profile.ProfileReport(tweets, title='Daily Tweets Profiling Report', html={'style':{'full_width':True}})
# profile.to_file(output_file="merged_tweets_less_strict_df_report.html")

In [195]:
# the dataframe report revealed rows sharing the same tweet text;
# keep only the first occurrence of each text
tweets = tweets.drop_duplicates(subset=['text'], keep='first')
tweets.shape

(52734, 7)

In [196]:
# stock prices are only available per day, so reduce each timestamp to its date
# (alternative: pd.DatetimeIndex(tweets['timestamp']).normalize())
tweets = tweets.assign(timestamp=tweets['timestamp'].dt.date)
tweets.head(2)

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets
105,2018-01-01,[],Notts Gaussfest @Papplewick_PS Sat 17th March ...,Extreme Electronics,4,0,5
155,2018-01-01,[],*hits blunt* if you're drunk and put your tesl...,the finesse kid,3,0,4


### Load the stock market data and merge both dataframes

In [197]:
# load the cleaned stock data and inspect its index and column dtypes
stocks = pd.read_json('../data/processed/stock/stocks_cleaned.json')
stocks.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 106 entries, 2018-01-02 to 2018-06-04
Data columns (total 3 columns):
Open       106 non-null float64
Close      106 non-null float64
PriceUp    106 non-null bool
dtypes: bool(1), float64(2)
memory usage: 2.6 KB


In [198]:
# compare with the tweets dataframe: check its index type and column dtypes
# against the stocks dataframe before merging the two
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52734 entries, 105 to 699157
Data columns (total 7 columns):
timestamp    52734 non-null object
hashtags     52734 non-null object
text         52734 non-null object
username     52734 non-null object
likes        52734 non-null int64
replies      52734 non-null int64
retweets     52734 non-null int64
dtypes: int64(3), object(4)
memory usage: 3.2+ MB


In [199]:
# make the (datetime) timestamp the index of the tweets dataframe so it can be
# merged with the stock market dataframe on the index; the timestamp column is
# kept as well, and rows are ordered by date
tweets = (
    tweets
    .assign(timestamp=pd.to_datetime(tweets['timestamp']))
    .set_index('timestamp', drop=False)
    .sort_index()
)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52734 entries, 2018-01-01 to 2018-06-04
Data columns (total 7 columns):
timestamp    52734 non-null datetime64[ns]
hashtags     52734 non-null object
text         52734 non-null object
username     52734 non-null object
likes        52734 non-null int64
replies      52734 non-null int64
retweets     52734 non-null int64
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 3.2+ MB


In [200]:
# inner-join tweets and stock prices on their date index; tweets from days
# without a stock price are dropped by the inner join
data = tweets.merge(stocks, how='inner', left_index=True, right_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 42865 entries, 2018-01-02 to 2018-06-04
Data columns (total 10 columns):
timestamp    42865 non-null datetime64[ns]
hashtags     42865 non-null object
text         42865 non-null object
username     42865 non-null object
likes        42865 non-null int64
replies      42865 non-null int64
retweets     42865 non-null int64
Open         42865 non-null float64
Close        42865 non-null float64
PriceUp      42865 non-null bool
dtypes: bool(1), datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 3.3+ MB


In [201]:
# peek at the first merged rows (tweet columns followed by Open, Close, PriceUp)
data.head(3)

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
2018-01-02,2018-01-02,[],Tesla Synchronicity\n\n …,𝙳𝚛. 𝚀𝚞𝚒𝚐𝚕𝚎𝚢,3,0,2,312.0,320.53,True
2018-01-02,2018-01-02,[Muskwatchpic],"From SpaceX to Tesla, here are our biggest que...",Nerdist,37,5,10,312.0,320.53,True
2018-01-02,2018-01-02,"[Snapchat, Uber, Twitter, Facebook, Tesla, Goo...",Here's how old these companies will be turning...,Imran,53,7,41,312.0,320.53,True


In [202]:
# report how many tweets survived the merge
print("Num of data points is: ", len(data))

Num of data points is:  42865


### Observation:
According to Prusa et al. in "The Effect of Dataset Size on Training Tweet Sentiment Classifiers" (2015), 27,000 instances are the suggested number of tweets to use for sentiment analysis with tweets. Using more data did not significantly increase the performance of the four tested classifiers. Therefore we decide to filter the tweets further, expecting this to reduce noise in the data. 

We decide to retain only tweets that have at least 5 retweets, have many likes (more than 20), or started an intense discussion (more than 10 replies).

In [203]:
# keep a tweet if it meets at least one engagement criterion:
# at least 5 retweets, more than 20 likes, or more than 10 replies
enough_retweets = data['retweets'] >= 5
many_likes = data['likes'] > 20
intense_discussion = data['replies'] > 10
data = data[enough_retweets | many_likes | intense_discussion]
print("Num of data points is: ", len(data))

Num of data points is:  23519


### Group data by days 
... and investigate...

In [204]:
# bucket the tweets by day, using the values of the timestamp column as keys,
# and show the first tweet of every day
daily_data = data.groupby(data['timestamp'])
daily_data.first()

Unnamed: 0_level_0,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-02,[Muskwatchpic],"From SpaceX to Tesla, here are our biggest que...",Nerdist,37,5,10,312.00,320.53,True
2018-01-03,[Tesla],#Tesla just released record delivery numbers f...,InsideEVs Forum,11,1,5,321.00,317.25,False
2018-01-04,[],Tesla struggles with Model 3 production pic.t...,Automotive News,5,0,5,312.87,314.62,True
2018-01-05,[munilandhttps],Head of Puerto Rico electric utility says they...,Cate Long,17,4,9,316.62,316.58,False
2018-01-08,[],“Bırakın doğruları gelecek söylesin ve herkesi...,[n]Beyin,324,2,86,316.00,336.41,True
...,...,...,...,...,...,...,...,...,...
2018-05-29,[],You know Erin let us forget for a short period...,Darji,23,1,4,278.51,283.76,True
2018-05-30,[],Tesla Autopilot blamed for crash with parked p...,BBC News Technology,11,3,11,283.29,291.72,True
2018-05-31,[Tesla],Weekly #Tesla short update. $TSLA short intere...,Ihor Dusaniwsky,12,4,8,287.21,284.73,False
2018-06-01,[1u],Tesla and Elon Musk face tough questions from ...,Minnesota AFL-CIO,19,0,10,285.86,291.82,True


In [205]:
# count tweets per day and summarise the counts to see whether the tweets are
# reasonably evenly distributed over the days
tweets_per_day = daily_data['text'].count()
tweets_per_day.describe()

count     103.000000
mean      228.339806
std       167.414922
min        26.000000
25%       159.000000
50%       184.000000
75%       235.500000
max      1376.000000
Name: text, dtype: float64

### Observations 
During merging, we've lost 3 of 106 days where stock data is available due to missing relevant tweets on that day. Looking at the mean and std of tweet numbers per day, the tweets most probably were skipped during mining.

On average, there are 228 tweets per day, with a minimum of 26 tweets. The standard deviation is quite high too, but since we're so far only looking at individual tweets, this is absolutely ok. Even when we go for averaging the tweets of a single day, it should still be fine.

Another important investigation is how many tweets we have for days with "PriceUp"==True and False?

In [206]:
# count the tweets that fall on days with PriceUp == False and PriceUp == True
price_groups = data.groupby(data['PriceUp'])
# reindex pins the order to (False, True) and reports 0 for a direction that has
# no tweets, instead of relying on group sort order and on both groups existing
n_false, n_true = price_groups['text'].count().reindex([False, True], fill_value=0)
print("Number of tweets in both groups: {} and {}".format(n_true, n_false))
print("Percentage of tweets with PriceUp == True: {:.3f}".format(n_true/(n_true+n_false)))

Number of tweets in both groups: 11995 and 11524
Percentage of tweets with PriceUp == True: 0.510


In [207]:
# check how to access all tweets of a single day
# get groups' names
daily_data.groups.keys()
groups = [name for name, _ in daily_data]
groups[0]

Timestamp('2018-01-02 00:00:00')

In [208]:
# get all tweets from the first day
first_day_data = daily_data.get_group(groups[0])
first_day_data.head()

Unnamed: 0,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
2018-01-02,[Muskwatchpic],"From SpaceX to Tesla, here are our biggest que...",Nerdist,37,5,10,312.0,320.53,True
2018-01-02,"[Snapchat, Uber, Twitter, Facebook, Tesla, Goo...",Here's how old these companies will be turning...,Imran,53,7,41,312.0,320.53,True
2018-01-02,"[Model3, Autopilot2, pasatealoelectrico, Tesla]","Primera prueba del @Tesla #Model3 en la nieve,...",PasatealoElectrico,23,0,6,312.0,320.53,True
2018-01-02,[],Know the whirr sound a Tesla makes?\n\nThat's ...,Elon Musk News,8,0,5,312.0,320.53,True
2018-01-02,[],"In Norway, @Tesla finished Q4 with 3,753 Model...",Tesla Daily,28,0,6,312.0,320.53,True


### Generate Report and save the merged data

In [209]:
# prepare the full merged dataset for saving: replace the date index with a
# fresh integer index (the dates remain available in the timestamp column)
data = data.reset_index(drop=True)
print("Num of data points is: ", len(data))
# show complete cell contents instead of truncated tweet texts
# NOTE(review): pandas >= 1.0 deprecates -1 here in favour of None; this notebook
# runs on 0.25.3, so the value is left as is
pd.set_option('display.max_colwidth', -1)
data.head(3)

Num of data points is:  23519


Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
0,2018-01-02,[Muskwatchpic],"From SpaceX to Tesla, here are our biggest questions for Elon Musk in 2018: #Muskwatchpic.twitter.com/YQ1q9dHNbs",Nerdist,37,5,10,312.0,320.53,True
1,2018-01-02,"[Snapchat, Uber, Twitter, Facebook, Tesla, Google, NETFLIX, Amazon, Apple, Disney]",Here's how old these companies will be turning in 2018:\n\n#Snapchat: 7 years\n#Uber: 9 years\n#Twitter: 12 years\n#Facebook: 14 years\n#Tesla: 15 years\n#Google: 20 years\n#NETFLIX: 21 years\n#Amazon: 24 years\n#Apple: 42 years\nIntel: 50 years\nHP: 79 years\n#Disney: 95 years\nIBM: 107 years,Imran,53,7,41,312.0,320.53,True
2,2018-01-02,"[Model3, Autopilot2, pasatealoelectrico, Tesla]","Primera prueba del @Tesla #Model3 en la nieve, incluyendo el #Autopilot2 (Vídeo) \n#pasatealoelectrico #Tesla\n …",PasatealoElectrico,23,0,6,312.0,320.53,True


In [210]:
# persist the filtered, merged dataset as JSON; create the output folder first
# so the cell does not fail when the folder is missing
# NOTE(review): this path is relative to the working directory, unlike the
# '../data/processed/' inputs read above - confirm the location is intended
os.makedirs('processed_data', exist_ok=True)
data.to_json('processed_data/data_final_merged.json')

In [211]:
# print the generated dataframe report
# profile.to_notebook_iframe()