In [1]:
import pandas as pd
import numpy as np

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

pd.set_option('display.max_rows', 10000)

In [2]:
from datetime import datetime
from functools import reduce
import re

def twitter_dataframe(account):
    tweets = pd.read_json("../data/raw/tweets/{}_historical.json".format(account))
    headline_arr = []
    date_arr = []
    
    for post in range(tweets["headline"].count()):
        headline_arr.append(tweets["headline"].iloc[post])
        date_arr.append(tweets["date"].iloc[post])
    tweets_df = pd.DataFrame({"Time": date_arr, "Post": headline_arr})
    
    tweets_df["Post"] = clean_tweets(tweets_df["Post"])
    tweets_df = generate_sentiment_score(tweets_df)
    tweets_df["Time"] = tweets_df["Time"].dt.strftime("%Y-%m-%d %H:%M:00")
    
    return tweets_df

def generate_sentiment_score(tweets):
    sid = SentimentIntensityAnalyzer()
    score = []
    for post in tweets["Post"]:
        score.append(sid.polarity_scores(post)["compound"])
    tweet_score = pd.DataFrame({"Twitter_Sentiment": score})
    tweets["Twitter_Sentiment"] = tweet_score
    return tweets

def clean_tweets(tweets):
    tweets = np.vectorize(remove_pattern) (tweets, "RT @[\w]*:")
    tweets = np.vectorize(remove_pattern) (tweets, "@[\w]*")
    tweets = np.vectorize(remove_pattern) (tweets, "https?://[A-Za-z0-9./]*")
    tweets = np.core.defchararray.replace(tweets, "[^a-zA-Z]", " ")
    tweets = np.core.defchararray.replace(tweets, "\n", " ")
    return tweets

def remove_pattern(input_text, pattern):
    r = re.findall(pattern, input_text)
    for i in r:
        input_text = re.sub(i, "", input_text)
    return input_text

def tweets_merge(tweet_list):
    if len(tweet_list) == 0:
        return
    elif len(tweet_list) == 1:
        return tweet_list[0]
    else:
        merged_tweets = reduce(lambda left, right : pd.merge(left, right, how="outer", on=["Time", "Post", "Twitter_Sentiment"]), tweet_list)     
        merged_tweets.sort_values(by=["Time"], inplace=True)
        merged_tweets = merged_tweets.reset_index(drop=True)
        return merged_tweets

forex_com = twitter_dataframe("forexcom")
ft_markets = twitter_dataframe("FTMarkets")
bloomberg = twitter_dataframe("markets")
reuters = twitter_dataframe("ReutersGMF")
wsj = twitter_dataframe("WSJmarkets")
fx_street_1 = twitter_dataframe("FXstreetNews")
fx_street_2 = twitter_dataframe("FXstreetNews2")
tweets = tweets_merge([forex_com, ft_markets, bloomberg, reuters, wsj, fx_street_1, fx_street_2])

In [3]:
tweets

Unnamed: 0,Time,Post,Twitter_Sentiment
0,2018-01-01 00:12:00,Here are some of the biggest winners and loser...,-0.0772
1,2018-01-01 11:04:00,How high-frequency trading hit a speed bump,0.0000
2,2018-01-01 13:19:00,Cryptocurrencies: debased coinages,0.0000
3,2018-01-01 21:00:00,Crude Oil Price Forecast 2018: Rally likely to...,-0.5719
4,2018-01-01 22:18:00,Option expiries for today's NY cut By #Curre...,-0.2732
...,...,...,...
182220,2020-12-30 22:57:00,NZD/USD consolidates at annual highs above 0.7...,0.0000
182221,2020-12-30 23:17:00,New investment by SoftBank allows construction...,-0.2960
182222,2020-12-30 23:18:00,USTR: Additional tariffs target products from ...,0.0000
182223,2020-12-30 23:25:00,USD/CAD Price Analysis: Bears’ shouldn’t ignor...,-0.4767


In [4]:
print(tweets["Twitter_Sentiment"].mean())
print(tweets["Twitter_Sentiment"].isin([0]).sum()/tweets["Twitter_Sentiment"].count())

0.002999844697489368
0.3703114281794485


In [5]:
count = 0
for post in tweets["Post"]:
    if "jpy" in post.lower():
        count += 1
print(count)

10638


In [11]:
def currency_sentiment(currencies_dict):
    country_df = pd.DataFrame()
    for currency in currencies_dict:
        for entity in currencies_dict[currency]["positive"]:
            tweet_lower = tweets["Post"].transform(lambda post : post.lower())
            currency_df = tweets[tweet_lower.str.contains(entity)]
            currency_df = currency_df[{"Time", "Twitter_Sentiment"}]
            currency_df = currency_df.rename(columns={"Twitter_Sentiment": currency.upper()})
            if country_df.empty:
                country_df = currency_df
            elif not currency.upper() in country_df.columns:
                country_df = country_df.merge(currency_df, how="outer", on="Time")
            else:
                country_df = country_df.merge(currency_df, how="outer", on=["Time", currency.upper()])
        for entity in currencies_dict[currency]["negative"]:
            tweet_lower = tweets["Post"].transform(lambda post : post.lower())
            currency_df = tweets[tweet_lower.str.contains(entity)]
            currency_df = currency_df[{"Time", "Twitter_Sentiment"}]
            currency_df["Twitter_Sentiment"] = currency_df["Twitter_Sentiment"].transform(lambda score : -score)
            currency_df = currency_df.rename(columns={"Twitter_Sentiment": currency.upper()})
            if country_df.empty:
                country_df = currency_df
            elif not currency.upper() in country_df.columns:
                country_df = country_df.merge(currency_df, how="outer", on="Time")
            else:
                country_df = country_df.merge(currency_df, how="outer", on=["Time", currency.upper()])
                
    time_frame = pd.date_range(start="2018-01-01 22:00:00", freq="1T", end="2020-12-31 21:59:00")
    time_frame = pd.DataFrame(time_frame, columns=["Time"])
    time_frame["Time"] = time_frame["Time"].dt.strftime("%Y-%m-%d %H:%M:%S")
    country_df = country_df.reset_index(drop=True)
    country_df = combine_dates(country_df)
    
    country_df = time_frame.merge(country_df, how="left", on="Time")
    country_df = country_df.sort_values(by='Time', ascending=True)
    
    for currency in currencies_dict:
        country_df[currency.upper()] = country_df[currency.upper()].rolling(1440, min_periods=1).mean()
    country_df = country_df.fillna(0)
    
    return country_df

def combine_dates(tweets):
    currencies = ["eur", "usd", "jpy", "cad", "gbp", "aud", "nzd", "chf"]
    length = 1
    for i in range(1, len(tweets.index)):
        current = tweets.at[i, "Time"]
        if current == tweets.at[i - length, "Time"] and i == len(tweets.index) - 1:
            for currency in currencies:
                tweets.at[i - length, currency.upper()] = tweets[currency.upper()].iloc[i - length: i].mean()
        elif current == tweets.at[i - length, "Time"]:
            length += 1
        elif length > 1:
            for currency in currencies:
                tweets.at[i - length, currency.upper()] = tweets[currency.upper()].iloc[i - length: i].mean()
            length = 1
    tweets.drop_duplicates(subset=["Time"], inplace=True)
    return tweets
            
currencies = {
              "usd": {"positive": ["usd/", "u.s.", "greenback", "buck", "barnie", "america", "united states"], "negative": ["/usd", "cable"]},
              "aud": {"positive": ["aud/", "gold", "aussie", "australia"], "negative": ["/aud"]}, 
              "gbp": {"positive": ["gbp/", "sterling", "pound", "u.k.", "united kingdom", "cable", "guppy"], "negative": ["/gbp"]},
              "nzd": {"positive": ["nzd/", "gold", "kiwi", "new zealand"], "negative": ["/nzd"]},
              "cad": {"positive": ["cad/", "oil", "loonie", "canada"], "negative": ["/cad"]},
              "chf": {"positive": ["chf/", "swiss"], "negative": ["/chf"]},
              "jpy": {"positive": ["jpy/", "asian", "japan"], "negative": ["/jpy", "guppy"]},
              "eur": {"positive": ["eur/", "fiber", "euro"], "negative": ["/eur"]}
             }
currency_sentiment = currency_sentiment(currencies)
currency_sentiment

Unnamed: 0,Time,USD,AUD,GBP,NZD,CAD,CHF,JPY,EUR
0,2018-01-01 22:00:00,0.0000,0.00000,0.0000,0.00000,0.0000,0.0,0.0,0.0
1,2018-01-01 22:01:00,0.0000,0.00000,0.0000,0.00000,0.0000,0.0,0.0,0.0
2,2018-01-01 22:02:00,0.0000,0.00000,0.0000,0.00000,0.0000,0.0,0.0,0.0
3,2018-01-01 22:03:00,0.0000,0.00000,0.0000,0.00000,0.0000,0.0,0.0,0.0
4,2018-01-01 22:04:00,0.0000,0.00000,0.0000,0.00000,0.0000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1576795,2020-12-31 21:55:00,-0.4091,0.20095,0.7506,0.20095,0.1589,0.0,0.0,0.0
1576796,2020-12-31 21:56:00,-0.4091,0.20095,0.7506,0.20095,0.1589,0.0,0.0,0.0
1576797,2020-12-31 21:57:00,-0.4091,0.20095,0.7506,0.20095,0.1589,0.0,0.0,0.0
1576798,2020-12-31 21:58:00,-0.4091,0.20095,0.7506,0.20095,0.1589,0.0,0.0,0.0


In [12]:
currency_sentiment[:10000]

Unnamed: 0,Time,USD,AUD,GBP,NZD,CAD,CHF,JPY,EUR
0,2018-01-01 22:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-01-01 22:01:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-01-01 22:02:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-01-01 22:03:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-01-01 22:04:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2018-01-01 22:05:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2018-01-01 22:06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2018-01-01 22:07:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2018-01-01 22:08:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2018-01-01 22:09:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
d = {'col1': [1, 2, 3, np.nan, 4, 5, 6, 7, 8, 9]}
df = pd.DataFrame(data=d)
df['col1'].rolling(2, min_periods=1).mean()

0    1.0
1    1.5
2    2.5
3    3.0
4    4.0
5    4.5
6    5.5
7    6.5
8    7.5
9    8.5
Name: col1, dtype: float64

In [13]:
currency_sentiment = currency_sentiment.reset_index(drop=True)
for i in range(1, len(currency_sentiment.index)):
    previous = currency_sentiment.at[i - 1, "Time"]
    current = currency_sentiment.at[i, "Time"]
    if previous == current:
        print(i, currency_sentiment.at[i, "Time"])

In [14]:
currency_sentiment.to_csv("../data/processed/tweets/tweets_sentiment.csv", index=False)