### Merge tweets and stock price dataframes
Load all cleaned tweets, which are saved in separate files, load all stock prices and combine them in single dataframe grouped by date.

In [142]:
import os
import pandas as pd
# reset colwitdth options when running all cells 
pd.reset_option('display.max_colwidth')
pd.__version__

'0.25.3'

In [143]:
# read in all the tweets
tweets_dfs = []
directory = '../data/processed/strict/'
for subdir, dirs, files in os.walk(directory):
    for file in files:
        tweets_dfs.append(pd.read_json(directory+file))
        
tweets = pd.concat(tweets_dfs)

In [144]:
tweets

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets
165,2018-01-01 23:15:29,[Tesla],“Precisely one of the most gratifying results ...,Á̵̢̙͉̫ 𝐫 𝐝 𝐢 𝐧 ≤≥ 🌐,16,1,1
489,2018-01-01 21:52:27,"[NewYearsDay, NewYearsEve2017, HappyNewYear, T...",It’s 2018. We have self driving cars but stil...,Nicholas E. Calhoun,2,1,1
527,2018-01-01 21:42:16,"[Tesla, TeslaMotors, ElectricVehicle, ElonMusk...",#Tesla Powerwall initiative in Vermont is grow...,EVANNEX for Tesla,33,1,8
802,2018-01-01 20:54:24,"[Ganador, Gemelos, Tesla, enhorabuena]",#Ganador #Gemelos #Tesla @Isilopez_101 #enhora...,MQE Más Que Eléctricos,5,1,2
1087,2018-01-01 20:00:00,[Tesla],#Tesla hat bislang nur Verluste angehäuft. Ist...,Frankfurter Allgemeine,6,3,2
...,...,...,...,...,...,...,...
697922,2018-04-02 02:27:33,"[Lithium, ASX, dow, finance, Cobalt, Tesla, tr...",$GPP CRTuresi #Lithium project rig expected t...,SeeThru👁,3,1,1
698520,2018-04-02 01:11:52,"[Spaceman, Tiangong, Tiangong1, stazionespazia...","Hey #Spaceman, did you see the #Tiangong? \n#T...",Massimo Guerrera,19,2,4
698544,2018-04-02 01:08:28,"[ElonMusk, Tesla]",#ElonMusk sends April Fool's Day tweets joking...,The Straits Times,20,1,8
698814,2018-04-02 00:36:37,"[electric, car, electriccar, ev, tesla, autono...",electric car via NodeXL \n@sufiy\n@amazingche...,NodeXL Pro,4,1,2


In [145]:
# uncomment lines below to generate a dataframe report
# import pandas_profiling as profile
# profile = profile.ProfileReport(tweets, title='Daily Tweets Profiling Report', html={'style':{'full_width':True}})
# profile.to_file(output_file="tweets_df_report.html")

In [146]:
# the dataframe report has shown that there are some rows with duplicate tweet texts... remove these
tweets.drop_duplicates(subset='text', keep='first', inplace=True)
tweets.shape

(6266, 7)

In [147]:
# remove the time information as we only have stock price data per day
# tweets['timestamp'] = pd.DatetimeIndex(tweets['timestamp']).normalize()
tweets['timestamp'] = tweets['timestamp'].dt.date
tweets.head(2)

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets
165,2018-01-01,[Tesla],“Precisely one of the most gratifying results ...,Á̵̢̙͉̫ 𝐫 𝐝 𝐢 𝐧 ≤≥ 🌐,16,1,1
489,2018-01-01,"[NewYearsDay, NewYearsEve2017, HappyNewYear, T...",It’s 2018. We have self driving cars but stil...,Nicholas E. Calhoun,2,1,1


### Load the stock market data and merge both dataframes

In [148]:
# load stock data
stocks = pd.read_json('../data/processed/stock/stocks_cleaned.json')
stocks.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 106 entries, 2018-01-02 to 2018-06-04
Data columns (total 3 columns):
Open       106 non-null float64
Close      106 non-null float64
PriceUp    106 non-null bool
dtypes: bool(1), float64(2)
memory usage: 2.6 KB


In [149]:
# compare with tweets df
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6266 entries, 165 to 699232
Data columns (total 7 columns):
timestamp    6266 non-null object
hashtags     6266 non-null object
text         6266 non-null object
username     6266 non-null object
likes        6266 non-null int64
replies      6266 non-null int64
retweets     6266 non-null int64
dtypes: int64(3), object(4)
memory usage: 391.6+ KB


In [150]:
# change tweets index to be the timestamp to be able to merge it with the stock market df
tweets['timestamp'] = pd.to_datetime(tweets['timestamp'])
tweets.set_index('timestamp', inplace=True)
tweets.sort_index(inplace=True)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6266 entries, 2018-01-01 to 2018-06-04
Data columns (total 7 columns):
timestamp    6266 non-null datetime64[ns]
hashtags     6266 non-null object
text         6266 non-null object
username     6266 non-null object
likes        6266 non-null int64
replies      6266 non-null int64
retweets     6266 non-null int64
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 391.6+ KB


In [151]:
# combine both datasets with an inner merge, which will remove days with missing stock prices
data = pd.merge(tweets, stocks, left_index=True, right_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4949 entries, 2018-01-02 to 2018-06-04
Data columns (total 10 columns):
timestamp    4949 non-null datetime64[ns]
hashtags     4949 non-null object
text         4949 non-null object
username     4949 non-null object
likes        4949 non-null int64
replies      4949 non-null int64
retweets     4949 non-null int64
Open         4949 non-null float64
Close        4949 non-null float64
PriceUp      4949 non-null bool
dtypes: bool(1), datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 391.5+ KB


In [152]:
data.head(2)

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
2018-01-02,2018-01-02,"[Tesla, ModelS]","In the past 2 years, I've driven 18,823 miles ...",Ben Sullins 💪,110,6,10,312.0,320.53,True
2018-01-02,2018-01-02,"[Tesla, P90D, Blog, Youtube]",Ya estamos en @louesfera probando un #Tesla #P...,Fco Javier,2,1,2,312.0,320.53,True


### Save the merged data

In [157]:
# save the full merged dataset
data.reset_index(inplace=True)
data.to_json('../data/processed/data_merged.json')
data.head(2)

Unnamed: 0,level_0,index,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
0,0,2018-01-02,2018-01-02,"[Tesla, ModelS]","In the past 2 years, I've driven 18,823 miles ...",Ben Sullins 💪,110,6,10,312.0,320.53,True
1,1,2018-01-02,2018-01-02,"[Tesla, P90D, Blog, Youtube]",Ya estamos en @louesfera probando un #Tesla #P...,Fco Javier,2,1,2,312.0,320.53,True


### 