In [210]:
import os
import re
import pandas as pd
from time import time
from datetime import datetime, timedelta
import numpy as np
import nltk
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

RANDOM_SEED = 7
DATA_DIR = "../../../data/"
INTERM_DIR = '../../../data/compiled_data/'

twitter_data_path = os.path.join(DATA_DIR, 'final/tweets_data_final.csv')
replies_data_path = os.path.join(DATA_DIR, 'final/replies_data_final.csv')
bitcoin_data_path = os.path.join(DATA_DIR, 'final/bitcoin_halfhour_data.csv')
sentiment_data_path = os.path.join(DATA_DIR, 'final/sentiment_score.csv')

In [211]:
def load_twitter_data(data_path, nrows=None, cols=['text', 'conversation_id','created_at', 'retweet_count', 'reply_count', 'like_count','quote_count', 'is_reply_to_user', 'related_user_id',]):
    "Load twitter data, nrows None indicates all rows, otherwise specified integer of rows"
    data = pd.read_csv(data_path, nrows = nrows, delimiter=',', usecols=cols)
    data = data[data['text'] != '']
    data['created_at'] = pd.to_datetime(data['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
    return data

In [212]:
def load_bitcoin(data_path):
    asset = pd.read_csv(data_path)
    asset = asset.dropna()
    asset['time'] = pd.to_datetime(asset['time'])
    return asset

In [216]:
twitter_df = load_twitter_data(twitter_data_path)
replies_df = load_twitter_data(replies_data_path)
bitcoin_df = load_bitcoin(bitcoin_data_path)
sentiment = pd.read_csv(sentiment_data_path, nrows = None, delimiter=',')


In [219]:
total_df = pd.concat([twitter_df, replies_df], ignore_index=True)


In [220]:
def weighted_compound(df, drop_ratio=1, o_w=0.7):
    df['ratio'] = 0
    df['w_sa'] = 0
    df['ratio'] = (df[df['is_reply_to_user']==1].groupby(by="conversation_id")['like_count'].transform(lambda x: (x / x.sum())) * df['compound_sa'])
    for item in pd.unique(df['conversation_id']):
        o = df[(df['conversation_id']==item) & (df['is_reply_to_user']==0)]['compound_sa'].sum()
        r = df[(df['conversation_id']==item) & (df['is_reply_to_user']==1)]['ratio'].sum()
        pct = abs((o-r)/o)
        idx = df[(df['conversation_id']==item)].index
        if pct > drop_ratio:
            df.drop(idx)
            continue
        df.loc[idx, 'w_sa'] = (o*o_w) + (r*(1-o_w))
    df.drop(df[(df['is_reply_to_user']==1)].index)
    return df

In [221]:
sentiment = weighted_compound(sentiment, drop_ratio=0.5)

  pct = abs((o-r)/o)
  pct = abs((o-r)/o)


In [230]:
def clustering(df, delta='30T', rate=0.022):
    df['created_at'] = pd.to_datetime(df['created_at'])
    df.set_index('created_at', inplace=True)
    total_df['created_at'] = pd.to_datetime(total_df['created_at'])
    total_df.set_index('created_at', inplace=True)
    features = ['retweet_count', 'reply_count',
        'like_count', 'quote_count', 'w_sa']
    features_df=df.resample(delta)[features].mean()
    features_df['total_tweets'] = total_df.resample('30T').size()
    features_df = features_df.dropna(how='any', axis=0)
    features_df = features_df[abs(features_df['w_sa'].pct_change()) > rate]
    return features_df

In [231]:
features_df = clustering(sentiment)
features_df.head()

KeyError: 'created_at'

In [224]:
def returns_comput(features, bitcoin):
    t_0 = features.index
    t_1 = t_0 + timedelta(hours=2.0)
    bitcoin['time'] = bitcoin['time'].dt.strftime('%Y-%m-%d %H:%M:00')
    t_0 = t_0.strftime('%Y-%m-%d %H:%M:00')
    t_1 = t_1.strftime('%Y-%m-%d %H:%M:00')
    t_all = t_0.append(t_1)
    bitcoin = bitcoin[(bitcoin['time'].isin(t_all))][['time', 'close', 'volume']]
    bitcoin['time'] = pd.to_datetime(bitcoin['time'])
    bitcoin = bitcoin.set_index('time')
    bitcoin['returns'] = bitcoin.pct_change(periods=1, freq='120T')['close']
    bitcoin = bitcoin.dropna(how='any', axis=0)
    bitcoin.index = bitcoin.index - timedelta(hours=2.0)
    return bitcoin

In [225]:
bitcoin = returns_comput(features_df, bitcoin_df)

In [226]:
features_df = pd.merge(features_df, bitcoin, left_index=True, right_index=True)
features_df['binary'] = 1
features_df.loc[features_df.returns < 0, 'binary'] = 0

In [227]:
from numpy.lib.stride_tricks import sliding_window_view
y = features_df['binary']
df = features_df.drop(columns=['binary'])
X = df.to_numpy() #.shift(1).dropna().to_numpy()
window_size = 6
windowed_X = sliding_window_view(np.flip(X, axis=0), window_shape = window_size, axis=0)
windowed_X = np.flip(windowed_X, axis=0)
windowed_X = np.flip(windowed_X, axis=2)

y = y.iloc[window_size:].to_numpy()
print(windowed_X.shape)

(4998, 9, 6)


In [228]:
import sklearn
# print(sklearn.__version__) # make sure > 0.24
X = windowed_X
trainsplit = sklearn.model_selection.TimeSeriesSplit(n_splits=2, gap = window_size, test_size = int(0.3 * X.shape[0]))

for train_index, rem_index in trainsplit.split(X):
    print(rem_index)
    X_train, X_rem = X[train_index], X[rem_index-1]
    y_train, y_rem = y[train_index], y[rem_index-1]
    
valsplit = sklearn.model_selection.TimeSeriesSplit(n_splits=2, gap = window_size, test_size = int(0.33 * X_rem.shape[0]))
for val_index, test_index in valsplit.split(X_rem):
    X_val, X_test = X_rem[val_index], X_rem[test_index]
    y_val, y_test = y_rem[val_index], y_rem[test_index]
    
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

[2000 2001 2002 ... 3496 3497 3498]
[3499 3500 3501 ... 4995 4996 4997]
(3493, 9, 6)
(999, 9, 6)
(494, 9, 6)


In [229]:
import pickle as pkl
with open(os.path.join(INTERM_DIR, 'train_data.pkl'), 'wb') as f:
    pkl.dump([X_train, y_train, X_val, y_val, X_test, y_test], f)

In [236]:
with open(os.path.join(INTERM_DIR, 'train_data.pkl'), 'rb') as f:
    X_train, y_train, X_val, y_val, X_test, y_test = pkl.load(f)


Build our Model


In [250]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

look_back = 10

model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=100, batch_size=256, verbose=2)

trainPredict = model.predict(X_train)
testPredict = model.predict(X_test)

#We have to invert the predictions before calculating error to so that reports will be in same units as our original data

trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([y_train])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([y_test])

trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0]))
print('Test Score: %.2f RMSE' % (testScore))

#Train Score: 4.77 RMSE
#Test Score: 5.57 RMSE

# shift train predictions for plotting
trainPredictPlot = np.empty_like(features_df)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict) + look_back, :] = trainPredict
 

 # shift test predictions for plotting
testPredictPlot = np.empty_like(features_df)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(features_df) - 1, :] = testPredict
 

plt.plot(df['last'], label='Actual')
plt.plot(pd.DataFrame(trainPredictPlot, columns=["close"], index=df.index).close, label='Training')
plt.plot(pd.DataFrame(testPredictPlot, columns=["close"], index=df.index).close, label='Testing')
plt.legend(loc='best')
plt.show()



Epoch 1/100


ValueError: in user code:

    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.9/site-packages/keras/engine/input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 1, 10), found shape=(None, 9, 6)


In [None]:
##LSTM MODEL
from math import sqrt
from numpy import concatenate
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot

from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg
