In [210]:
import os
import re
import pandas as pd
from time import time
from datetime import datetime, timedelta
import numpy as np
import nltk
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

RANDOM_SEED = 7
DATA_DIR = "../../../data/"
INTERM_DIR = '../../../data/compiled_data/'

twitter_data_path = os.path.join(DATA_DIR, 'final/tweets_data_final.csv')
replies_data_path = os.path.join(DATA_DIR, 'final/replies_data_final.csv')
bitcoin_data_path = os.path.join(DATA_DIR, 'final/bitcoin_halfhour_data.csv')
sentiment_data_path = os.path.join(DATA_DIR, 'final/sentiment_score.csv')

In [211]:
def load_twitter_data(data_path, nrows=None, cols=['text', 'conversation_id','created_at', 'retweet_count', 'reply_count', 'like_count','quote_count', 'is_reply_to_user', 'related_user_id',]):
    "Load twitter data, nrows None indicates all rows, otherwise specified integer of rows"
    data = pd.read_csv(data_path, nrows = nrows, delimiter=',', usecols=cols)
    data = data[data['text'] != '']
    data['created_at'] = pd.to_datetime(data['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
    return data

In [212]:
def load_bitcoin(data_path):
    asset = pd.read_csv(data_path)
    asset = asset.dropna()
    asset['time'] = pd.to_datetime(asset['time'])
    return asset

In [216]:
twitter_df = load_twitter_data(twitter_data_path)
replies_df = load_twitter_data(replies_data_path)
bitcoin_df = load_bitcoin(bitcoin_data_path)
sentiment = pd.read_csv(sentiment_data_path, nrows = None, delimiter=',')


In [219]:
total_df = pd.concat([twitter_df, replies_df], ignore_index=True)


In [220]:
def weighted_compound(df, drop_ratio=1, o_w=0.7):
    df['ratio'] = 0
    df['w_sa'] = 0
    df['ratio'] = (df[df['is_reply_to_user']==1].groupby(by="conversation_id")['like_count'].transform(lambda x: (x / x.sum())) * df['compound_sa'])
    for item in pd.unique(df['conversation_id']):
        o = df[(df['conversation_id']==item) & (df['is_reply_to_user']==0)]['compound_sa'].sum()
        r = df[(df['conversation_id']==item) & (df['is_reply_to_user']==1)]['ratio'].sum()
        pct = abs((o-r)/o)
        idx = df[(df['conversation_id']==item)].index
        if pct > drop_ratio:
            df.drop(idx)
            continue
        df.loc[idx, 'w_sa'] = (o*o_w) + (r*(1-o_w))
    df.drop(df[(df['is_reply_to_user']==1)].index)
    return df

In [221]:
sentiment = weighted_compound(sentiment, drop_ratio=0.5)

  pct = abs((o-r)/o)
  pct = abs((o-r)/o)


In [199]:
def clustering(df, delta='30T', rate=0.022):
    df['created_at'] = pd.to_datetime(df['created_at'])
    df.set_index('created_at', inplace=True)
    total_df['created_at'] = pd.to_datetime(total_df['created_at'])
    total_df.set_index('created_at', inplace=True)
    features = ['retweet_count', 'reply_count',
        'like_count', 'quote_count', 'w_sa']
    features_df=df.resample(delta)[features].mean()
    features_df['total_tweets'] = total_df.resample('30T').size()
    features_df = features_df.dropna(how='any', axis=0)
    features_df = features_df[abs(features_df['w_sa'].pct_change()) > rate]
    return features_df

In [201]:
features_df = clustering(sentiment)
features_df.head()

KeyError: "Columns not found: 'w_sa'"

In [125]:
def returns_comput(features, bitcoin):
    t_0 = features.index
    t_1 = t_0 + timedelta(hours=2.0)
    bitcoin['time'] = bitcoin['time'].dt.strftime('%Y-%m-%d %H:%M:00')
    t_0 = t_0.strftime('%Y-%m-%d %H:%M:00')
    t_1 = t_1.strftime('%Y-%m-%d %H:%M:00')
    t_all = t_0.append(t_1)
    bitcoin = bitcoin[(bitcoin['time'].isin(t_all))][['time', 'close', 'volume']]
    bitcoin['time'] = pd.to_datetime(bitcoin['time'])
    bitcoin = bitcoin.set_index('time')
    bitcoin['returns'] = bitcoin.pct_change(periods=1, freq='120T')['close']
    bitcoin = bitcoin.dropna(how='any', axis=0)
    bitcoin.index = bitcoin.index - timedelta(hours=2.0)
    return bitcoin

In [126]:
bitcoin = returns_comput(features_df, bitcoin_df)

In [127]:
features_df = pd.merge(features_df, bitcoin, left_index=True, right_index=True)
features_df['binary'] = 1
features_df.loc[features_df.returns < 0, 'binary'] = 0

In [129]:
from numpy.lib.stride_tricks import sliding_window_view
y = features_df['binary']
df = features_df.drop(columns=['binary'])
X = df.to_numpy() #.shift(1).dropna().to_numpy()
window_size = 6
windowed_X = sliding_window_view(np.flip(X, axis=0), window_shape = window_size, axis=0)
windowed_X = np.flip(windowed_X, axis=0)
windowed_X = np.flip(windowed_X, axis=2)

y = y.iloc[window_size:].to_numpy()
print(windowed_X.shape)

ValueError: window shape cannot be larger than input array shape

In [None]:
import sklearn
# print(sklearn.__version__) # make sure > 0.24
X = windowed_X
trainsplit = sklearn.model_selection.TimeSeriesSplit(n_splits=2, gap = window_size, test_size = int(0.3 * X.shape[0]))

for train_index, rem_index in trainsplit.split(X):
    print(rem_index)
    X_train, X_rem = X[train_index], X[rem_index-1]
    y_train, y_rem = y[train_index], y[rem_index-1]
    
valsplit = sklearn.model_selection.TimeSeriesSplit(n_splits=2, gap = window_size, test_size = int(0.33 * X_rem.shape[0]))
for val_index, test_index in valsplit.split(X_rem):
    X_val, X_test = X_rem[val_index], X_rem[test_index]
    y_val, y_test = y_rem[val_index], y_rem[test_index]
    
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

[12 13 14 15 16 17 18]
[19 20 21 22 23 24 25]
(17, 9, 2)
(3, 9, 2)
(2, 9, 2)


In [None]:
import pickle as pkl
with open(os.path.join(INTERM_DIR, 'train_data.pkl'), 'wb') as f:
    pkl.dump([X_train, y_train, X_val, y_val, X_test, y_test], f)

In [None]:
with open(os.path.join(INTERM_DIR, 'train_data.pkl'), 'rb') as f:
    X_train, y_train, X_val, y_val, X_test, y_test = pkl.load(f)