In [2]:
import os
import re
import pandas as pd
from time import time
import datetime
import numpy as np
import nltk
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
RANDOM_SEED = 7
DATA_DIR = "../../../data/"
INTERM_DIR = '../compiled_data'

twitter_data_path = os.path.join(DATA_DIR, 'tweets/tweets_data_final.csv')
bitcoin_data_path = os.path.join(DATA_DIR, 'bitcoin_price/archive/BTC-USD.csv')

nlp = spacy.load("en_core_web_sm")

In [4]:
df = pd.read_csv(twitter_data_path)
#pd.read_csv(bitcoin_data_path)


In [5]:
def load_twitter_data(data_path, nrows=None, cols=['text', 'conversation_id','created_at', 'retweet_count', 'reply_count', 'like_count','quote_count', 'is_reply_to_user', 'related_user_id',]):
    "Load twitter data, nrows None indicates all rows, otherwise specified integer of rows"
    data = pd.read_csv(data_path, nrows = nrows, delimiter=',', usecols=cols)
    data = data[data['text'] != '']
    data['created_at'] = pd.to_datetime(data['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
    return data

In [6]:
def load_bitcoin(data_path):
    asset = pd.read_csv(data_path)
    asset = asset.dropna()
    asset["log_ret"] = np.log(asset.prc).diff(1)
    return asset["log_ret"]

In [7]:
twitter_df = load_twitter_data(twitter_data_path, None)
#bitcoin_df = load_bitcoin(bitcoin_data_path, None)



In [8]:
twitter_df.groupby(twitter_df["created_at"].apply(lambda x: datetime.datetime.fromisoformat(x)).dt.day)[['text', 'is replies']].count().plot(kind="bar")

KeyError: "Columns not found: 'is replies'"

In [None]:
def rem_url_at(df):
    raw = []
    for sentence in tqdm(df['text']):
        sentence = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                          '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
        sentence = re.sub("(@[A-Za-z0-9_]+)", "", sentence)
        raw.append(sentence)
    df['text'] = raw
    return df

In [None]:
def tagging(df):
    pos,tag,dep,shape = [],[],[],[]
    for sentence in tqdm(df['text']):
        pos_tmp,tag_tmp,dep_tmp,shape_tmp = [],[],[],[]
        for token in nlp(sentence):
            pos_tmp.append(token.pos_)
            tag_tmp.append(token.tag_)
            dep_tmp.append(token.dep_)
            shape_tmp.append(token.shape_)
        pos.append(pos_tmp)
        tag.append(tag_tmp)
        dep.append(dep_tmp)
        shape.append(shape_tmp)
    df['Pos'] = pos
    df['Tag'] = tag
    df['Dep'] = dep
    df['Shape'] = shape
    return df

In [None]:
twitter_df = rem_url_at(twitter_df)
twitter_df = tagging(twitter_df)
twitter_df.replace('', np.nan, inplace=True)
twitter_df = twitter_df.dropna(how='any', axis=0)
twitter_df = twitter_df[twitter_df['text'].map(
    lambda d: len(d)) > 0]

100%|██████████| 3678/3678 [00:00<00:00, 252490.26it/s]
100%|██████████| 3678/3678 [00:45<00:00, 80.80it/s] 


In [9]:
twitter_df[:1]

Unnamed: 0,text,conversation_id,created_at,retweet_count,reply_count,like_count,quote_count,is_reply_to_user,related_user_id
0,If your long term conviction of bitcoin’s valu...,1525612785375264768,2022-05-14 23:03:27,378,196,2546,42,0,339061487


## Sentiment Analyser
- Flair
- Vader
- Blob

In [None]:
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from flair.models import TextClassifier
from flair.data import Sentence
classifier = TextClassifier.load('en-sentiment')

  from .autonotebook import tqdm as notebook_tqdm


2022-05-24 17:23:24,718 loading file /Users/ade/.flair/models/sentiment-en-mix-distillbert_4.pt


In [30]:
def sentiment_analyser(df):
    df_vader = []
    for sentence in tqdm(df['text'], position=0):
        df_vader.append(vader(sentence))
    sa_df = pd.DataFrame(df_vader)
    print(df.index, sa_df)
    sa_df.index = df.index
    df['neg_sa'] = sa_df['neg']
    df['pos_sa'] = sa_df['pos']
    df['neu_sa'] = sa_df['neu']
    df['compound_sa'] = sa_df['compound']
    return df

In [19]:
def vader(sentence):
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(sentence)
    return sentiment

In [None]:
def blob(sentence):
    sentiment = TextBlob(sentence).sentiment.polarity
    return sentiment

In [None]:
def fler(sentence):
    s = Sentence(sentence)
    classifier.predict(s)
    sentiment = str(s.labels[0])
    num = float(re.findall(r'\d+\.\d+', sentiment)[0])
    if sentiment.find('POSITIVE') == -1:
        num = num * -1
    return num

In [None]:
## Moved away from using multiple senttiment analysis models, too much computation time, Vader most adapted to social media
# def sentiment_analyser(df, v=True, f=False, b=False, r_vader=0.8, r_fler=0.1, r_blob=0.1):
#     df_fler, df_vader, df_blob = [], [], []
#     for sentence in tqdm(df['body'], position=0):
#         if v:
#             df_vader.append(vader(sentence))
#         if f:
#             df_fler.append(fler(sentence))
#         if b:
#             df_blob.append(blob(sentence))
#     if v: 
#         df['VADER'] = df_vader
#     if f: 
#         df['FLAIR'] = df_fler
#     if b:
#         df['BLOB'] = df_blob
#     if v and f and b:
#         df['compound'] = df['VADER']*r_vader + \
#             df['FLAIR']*r_fler + df['BLOB']*r_blob
#     elif v and f and not b:
#         df['compound'] = df['VADER']*(r_vader+r_blob) + df['FLAIR']*r_fler
#     elif v and b and not f:
#         df['compound'] = df['VADER']*(r_vader+r_fler) + df['BLOB']*r_blob
#     else:
#         df['compound'] = df['VADER']
#     return df

In [14]:
twitter_df['text'][1]

"We have helped nearly 1,000 people get a new job in the bitcoin and crypto industry over the last year. \n\nWe don't plan on stopping any time soon. \n\nThere are hundreds of open roles at the top companies in the industry. Come help build the future.\n\nAPPLY: https://t.co/EaWrk2lCb3"

In [12]:
from nltk.sentiment import SentimentIntensityAnalyzer


In [13]:
analyzer = SentimentIntensityAnalyzer()


In [20]:
sentiment = analyzer.polarity_scores(twitter_df['text'][1])

In [23]:
twitter_df.loc[1]

text                We have helped nearly 1,000 people get a new j...
conversation_id                                   1525490438093799424
created_at                                        2022-05-14 14:57:17
retweet_count                                                      50
reply_count                                                        87
like_count                                                        331
quote_count                                                         4
is_reply_to_user                                                    0
related_user_id                                             339061487
Name: 1, dtype: object

In [31]:
twitter_df = sentiment_analyser(twitter_df[:1])

100%|██████████| 279/279 [00:02<00:00, 117.18it/s]


Index(['text'], dtype='object')      neg  neu  pos  compound
0    0.0  0.0  0.0       0.0
1    0.0  0.0  0.0       0.0
2    0.0  0.0  0.0       0.0
3    0.0  0.0  0.0       0.0
4    0.0  0.0  0.0       0.0
..   ...  ...  ...       ...
274  0.0  0.0  0.0       0.0
275  0.0  0.0  0.0       0.0
276  0.0  0.0  0.0       0.0
277  0.0  0.0  0.0       0.0
278  0.0  0.0  0.0       0.0

[279 rows x 4 columns]


ValueError: Length mismatch: Expected axis has 279 elements, new values have 1 elements

In [29]:
twitter_df

text               We have helped nearly 1,000 people get a new j...
conversation_id                                  1525490438093799424
created_at                                       2022-05-14 14:57:17
retweet_count                                                     50
reply_count                                                       87
neg_sa             0      0.0
1      0.0
2      0.0
3      0.0
4 ...
pos_sa             0      0.0
1      0.0
2      0.0
3      0.0
4 ...
neu_sa             0      0.0
1      0.0
2      0.0
3      0.0
4 ...
compound_sa        0      0.0
1      0.0
2      0.0
3      0.0
4 ...
Name: 1, dtype: object

In [114]:
import multiprocessing

tmp = twitter_df[:10]

n_splits = 8
n_rows = len(tmp)
chunks = [[int(i * n_rows/n_splits), int((i+1) * n_rows/n_splits)]  for i in range(n_splits)]

# function to perform vader analysis on portion of the table
def vader_worker(row_range):    
    return sentiment_analyser(tmp.iloc[row_range[0]:row_range[1]])

p = multiprocessing.Pool(processes=n_splits) 
tmp = p.map(vader_worker, chunks)
p.close() 
del p

tmp = pd.concat(tmp)
#twitter_df.to_csv(INTERM_DIR+'/XXX/XX')

In [None]:
bitcoin_df['price'].plot(figsize=(20,10))

In [None]:
# minmax scale it
bitcoin_df['returns'] = (bitcoin_df['returns']-bitcoin_df['returns'].min()) / (bitcoin_df['returns'].max()-bitcoin_df['returns'].min())
bitcoin_df['returns'].plot(figsize=(20,10))

In [None]:
bull_lexicon = ['buy','call','forward','long','up','grow','rise','green','hold','carry','bull']
bear_lexicon = ['short','sell','down','drop','decrease','red','bear']

In [None]:
twitter_df['bull_count'] = twitter_df['text'].apply(lambda x: any(substring in x for substring in bull_lexicon))
twitter_df['bear_count'] = twitter_df['text'].apply(lambda x: any(substring in x for substring in bear_lexicon))

In [None]:
counts_per_day = twitter_df.reset_index().groupby('date')['index'].count()
counts_per_day.plot(figsize=(20,10))

In [None]:
twitter_df['score'] = twitter_df['like_count'].astype(int)
twitter_df['score'] = twitter_df[['score']].apply(lambda x: x.fillna(x.mean()))
twitter_df['abs_score'] = np.abs(twitter_df['score'])

#Clustering
twitter_df = pd.merge(twitter_df.drop('abs_score', axis=1), 
                                 twitter_df.groupby('date')['abs_score'].sum(), left_on='date', right_index=True)

exp_scalar = 1.1
# Normalize score by total (absolute) score for that day
twitter_df['score_w'] = twitter_df['score']**exp_scalar /twitter_df['abs_score']
twitter_df = twitter_df.drop(['abs_score'], axis=1)
twitter_df['compound_sa'] = twitter_df['compound_sa'] * twitter_df['score_w']
twitter_df['neg_sa'] = twitter_df['neg_sa'] * twitter_df['score_w']
twitter_df['pos_sa'] = twitter_df['pos_sa'] * twitter_df['score_w']
twitter_df['neu_sa'] = twitter_df['neu_sa'] * twitter_df['score_w']
twitter_df['bull_count'] = twitter_df['bull_count'] * twitter_df['score_w']
twitter_df['bear_count'] = twitter_df['bear_count'] * twitter_df['score_w']

In [None]:
twitter_df = twitter_df.groupby('date')[['score','compound_sa','neg_sa','pos_sa','neu_sa','bear_count','bull_count']].agg(
    {'compound_sa' : ['sum'], 
     'neg_sa' : ['sum'], 
     'neu_sa' : ['sum'], 
     'pos_sa' : ['sum'],
     'bear_count' : ['sum'],
     'bull_count' : ['sum'],
     'score' : ['mean']}).reset_index()

In [None]:
twitter_df.plot(x = 'date', y = 'compound_sa', figsize=(20,10))
plt.show()

In [None]:
twitter_df.plot(x = 'date', y = 'neg_sa', figsize=(20,10))
plt.show()

In [None]:
twitter_df.plot(x = 'date', y = 'pos_sa', figsize=(20,10))
plt.show()

In [None]:
twitter_df.plot(x = 'date', y = 'neu_sa', figsize=(20,10))
plt.show()

In [None]:
twitter_df.plot(x = 'date', y = 'bear_count', figsize=(20,10))
plt.show()

In [None]:
twitter_df.plot(x = 'date', y = 'bull_count', figsize=(20,10))
plt.show()

In [None]:
twitter_df.plot(x = 'date', y = 'score', figsize=(20,10))
plt.show()
twitter_df = twitter_df.drop('score', axis=1)

In [None]:
### Fast Fourier Transform on Sentiment Analyses

In [None]:
from scipy.fftpack import fft, ifft

def fourier(df, n_dimensions, col):
    for n in n_dimensions:
        n = round(n)
        tmp_ = fft(df[col].values)
        tmp_[n:-n] = 0
        df[str(n)+'fourier'+col] = np.abs(ifft(tmp_))
    return df

# how to choose the values of fourier?
l = len(twitter_df)
twitter_df = fourier(twitter_df, [100, 200], 'compound_sa_sum')
twitter_df = fourier(twitter_df, [100, 200], 'pos_sa_sum')
twitter_df = fourier(twitter_df, [100, 200], 'neg_sa_sum')
twitter_df = fourier(twitter_df, [100, 200], 'neu_sa_sum')
returns = fourier(twitter_df['returns'], [200, 400], 'price')

In [None]:
twitter_df[['compound_sa_sum','100fouriercompound_sa_sum','200fouriercompound_sa_sum']].plot(figsize=(20,10))

In [None]:
twitter_df[['pos_sa_sum','100fourierpos_sa_sum','200fourierpos_sa_sum']].plot(figsize=(20,10))

In [None]:
twitter_df[['neg_sa_sum','100fourierneg_sa_sum','200fourierneg_sa_sum']].plot(figsize=(20,10))

In [None]:
twitter_df[['neu_sa_sum','100fourierneu_sa_sum','200fourierneu_sa_sum']].plot(figsize=(20,10))

In [None]:
returns[['SPY_Open','200fourierSPY_Open','400fourierSPY_Open']].plot(figsize=(20,10))

In [None]:
twitter_df = twitter_df.drop(['compound_sa_sum','pos_sa_sum','neg_sa_sum','neu_sa_sum'], axis=1)

In [None]:
def merged_df(df, df_tickers):
    final_df = pd.merge(df, df_tickers, left_on='date_', right_on='Date')
    final_df = final_df.set_index('date_')
    return final_df

final_df = merged_df(twitter_df, returns)
#final_df.to_pickle(INTERM_DIR+f'/clean/Final_df_pkl')

In [None]:
final_df.head()

In [None]:
from numpy.lib.stride_tricks import sliding_window_view
y = bitcoin_df['prices']
# change between today and tomorrow is today's predictor
y = y.pct_change()[1:].apply(lambda x: 0 if x < 0 else 1).shift(-1)
y = y[y.index.isin(final_df.index)]
# the information throughout today is only known tomorrow ???
# investment decision is made before the open ??
X = final_df.shift(1).dropna().to_numpy()
window_size = 6
## No need to normalize, normalization happens due to feature engineering and bitcoin returns already normalized
# Create windows
# flip and flip back to make it a reverse window (t-2, t-3... t-window_size)
windowed_X = sliding_window_view(np.flip(X, axis=0), window_shape = window_size, axis=0)
windowed_X = np.flip(windowed_X, axis=0)
windowed_X = np.flip(windowed_X, axis=2)

y = y.iloc[window_size:].to_numpy()
print(windowed_X.shape)

### Splitting

In [None]:
import sklearn
# print(sklearn.__version__) # make sure > 0.24

X = windowed_X

trainsplit = sklearn.model_selection.TimeSeriesSplit(n_splits=2, gap = window_size, test_size = int(0.3 * X.shape[0]))

for train_index, rem_index in trainsplit.split(X):

    X_train, X_rem = X[train_index], X[rem_index]
    y_train, y_rem = y[train_index], y[rem_index]
    

valsplit = sklearn.model_selection.TimeSeriesSplit(n_splits=2, gap = window_size, test_size = int(0.33 * X_rem.shape[0]))
for val_index, test_index in valsplit.split(X_rem):

    X_val, X_test = X_rem[val_index], X_rem[test_index]
    y_val, y_test = y_rem[val_index], y_rem[test_index]
    
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

In [None]:
import pickle as pkl
with open(os.path.join(INTERM_DIR, 'train_data.pkl'), 'wb') as f:
    pkl.dump([X_train, y_train, X_val, y_val, X_test, y_test], f)

In [None]:
with open(os.path.join(INTERM_DIR, 'train_data.pkl'), 'rb') as f:
    X_train, y_train, X_val, y_val, X_test, y_test = pkl.load(f)