In [1]:
import os
import re
import numpy as np 
from sklearn.metrics import accuracy_score

import transformers
from transformers import BertTokenizer, BertModel

import torch
from torch import cuda
from tqdm import tqdm

import pandas as pd

device = 'cuda' if cuda.is_available() else 'cpu'
device

  torch.utils._pytree._register_pytree_node(


'cuda'

### Config

In [2]:
# Model Config
MODEL = "bert-base-uncased"
MAX_LEN = 128 # This should be changed to max token length iirc 512
BATCH_SIZE = 32
EPOCHS = 10
NUM_OUT = 3
LEARNING_RATE = 2e-05

# Data Config
TIME_WINDOW = 60 * 5
TRADE_DATA_PATH = "trade_data"
MESSAGE_DATA_PATH = "discord_data"

### Data Loading and Preprocessing

In [3]:
# Load the trade data for every pair in the config list
trade_data_raw_files = []
for filename in os.listdir(TRADE_DATA_PATH):
    if filename.endswith('.csv'):
        trade_data_raw_files.append(os.path.join(TRADE_DATA_PATH, filename));
        
message_data_raw_files = []
for filename in os.listdir(MESSAGE_DATA_PATH):
    if filename.endswith('.csv'):
        message_data_raw_files.append(os.path.join(MESSAGE_DATA_PATH, filename));
        
(len(trade_data_raw_files), len(message_data_raw_files))

(658, 234)

In [4]:
def round_to_nearest_window(seconds):
    return np.floor(seconds / TIME_WINDOW) * TIME_WINDOW

def vwap_signals(df, sec_window):
    # Set the time column as the index, convert it during the set_index to avoid an extra step
    df.set_index(pd.to_datetime(df['time'], unit='s'), inplace=True)
    
    # Use a more optimized way of calculating VWAP directly in the resampling
    vwap_values = df.resample(f'{sec_window}s').apply(
        lambda x: np.dot(x['price'], x['volume']) / x['volume'].sum() if not x.empty else np.nan
    )
    
    # Instead of dropping NaN, fill them with 0
    vwap_values.fillna(0, inplace=True)
    
    # Calculate the changes in VWAP and map to signals efficiently
    vwap_changes = np.sign(vwap_values.diff()).fillna(0).astype(int)
    signals = vwap_changes.map({-1: 0, 0: 1, 1: 2})
    
    # Convert index to seconds from epoch
    signals.index = (signals.index - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
    
    # Determine the start and end timestamp if signals are not empty
    if not signals.empty:
        start_timestamp = signals.index[0]
        end_timestamp = signals.index[-1]
    else:
        start_timestamp = None
        end_timestamp = None
    
    return signals, start_timestamp, end_timestamp
    
def load_trade_data(filename):
    data = pd.read_csv(filename, names=["time", "price", "volume"])
    return vwap_signals(data, TIME_WINDOW)

In [5]:
def convert_raw_messages(df, start_time, end_time):
    # Drop columns not in ['Date', 'Content']
    df.drop(columns=[col for col in df.columns if col not in ['Date', 'Content']], inplace=True)

    # Convert 'Date' from UTC to seconds from epoch
    df['Date'] = pd.to_datetime(df['Date']).astype(int) / 1e9

    # Rename the columns to 'time' and 'content'
    df.rename(columns={'Date': 'time', 'Content': 'content'}, inplace=True)

    # Filter the DataFrame to only include rows within the specified time range
    df = df[(df['time'] >= start_time) & (df['time'] <= end_time)]
    
    return df

def load_messages(filename, start_time, end_time):
    data = pd.read_csv(filename)
    return convert_raw_messages(data, start_time, end_time)

#### Load the actual trade dataset and merge all the messages in each time group

In [6]:
trade_data, start_timestamp, end_timestamp = load_trade_data("trade_data/SOLUSD.csv")

In [7]:
message_data = None

for messages_file in message_data_raw_files:
    data = load_messages(messages_file, start_timestamp, end_timestamp)
    if message_data is None:
        message_data = data
    else:
        message_data = pd.concat([message_data, data], ignore_index=True)
message_data.dropna(subset=['content'], inplace=True)
message_data.reset_index(inplace=True, drop=True)

message_data = message_data.iloc[:10000] # ONLY FIRST 10000 FOR TESTING

message_data

  data = pd.read_csv(filename)


Unnamed: 0,time,content
0,1.700667e+09,This server's objective is to coordinate Solan...
1,1.675388e+09,Add the ✅ reaction to continue.\n\nAdditionall...
2,1.678841e+09,Hi
3,1.678843e+09,:hi:
4,1.678844e+09,Welcome leaf people!
...,...,...
9995,1.684080e+09,"oops, it was keccak256 not ed25519 that solang..."
9996,1.684080e+09,haha
9997,1.684080e+09,who do you think would be best to additionally...
9998,1.684080e+09,"In terms of desired usage, I'd just like to ai..."


In [8]:
class TradeMessageDataLoader(torch.utils.data.Dataset):
    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        (time, text) = self.text.loc[index]
        
        # BERT Encoder
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        # Target
        target = self.targets[round_to_nearest_window(time)]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long),
        }

### Model Setup

In [9]:
# X = np.array(["it's going up", "you ruined my day","this is a good investment", "wow, numbers go brrrrrrr", "lol","going up","going down","going flat","it's a bear","it's a bull","butterflies are cool","git good","why do I care","get rekt nerd", "you're so bad at this","stocks tanking","I'm not too confident this'll go up","big numbers", "it'll go up", "outlook good"])
# y = np.array([0,2,0,0,1,0,2,1,2,0,1,1,1,1,1,2,2,0,0,0])

In [10]:
class BERTClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(BERTClass, self).__init__()
                   
        self.l1 = BertModel.from_pretrained(MODEL)
#       self.l1 = RobertaModel.from_pretrained("FacebookAI/roberta-base")
#       self.pre_classifier = torch.nn.Linear(768, 256)
        self.classifier = torch.nn.Linear(768, NUM_OUT)
#       self.dropout = torch.nn.Dropout(0.5)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
#       pooler = self.pre_classifier(pooler)
#       pooler = torch.nn.Tanh()(pooler)
#       pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.softmax(output)
        return output

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss
    
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs)
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

In [12]:
tokenizer = BertTokenizer.from_pretrained(MODEL)

In [13]:
training_data = TradeMessageDataLoader(message_data, trade_data, tokenizer, MAX_LEN)
test_data = TradeMessageDataLoader(message_data, trade_data, tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

In [None]:
model = BERTClass(NUM_OUT)
model.to(device)    

optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')  
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    targets = targs
    print('accuracy on test set {}'.format(accuracy_score(guesses.indices, targets)))

100%|█████████████████████████| 313/313 [02:08<00:00,  2.44it/s]


Epoch: 0, Loss:  0.9440982937812805


100%|█████████████████████████| 313/313 [00:48<00:00,  6.47it/s]


accuracy on test set 0.5899


100%|█████████████████████████| 313/313 [02:11<00:00,  2.38it/s]


Epoch: 1, Loss:  1.0057735443115234


100%|█████████████████████████| 313/313 [00:50<00:00,  6.20it/s]


accuracy on test set 0.6341


100%|█████████████████████████| 313/313 [02:12<00:00,  2.36it/s]


Epoch: 2, Loss:  0.9532208442687988


100%|█████████████████████████| 313/313 [00:49<00:00,  6.35it/s]


accuracy on test set 0.8057


100%|█████████████████████████| 313/313 [02:09<00:00,  2.41it/s]


Epoch: 3, Loss:  0.7888864278793335


100%|█████████████████████████| 313/313 [00:48<00:00,  6.45it/s]


accuracy on test set 0.8747


100%|█████████████████████████| 313/313 [02:09<00:00,  2.41it/s]


Epoch: 4, Loss:  0.5955169796943665


100%|█████████████████████████| 313/313 [00:48<00:00,  6.41it/s]


accuracy on test set 0.9014


  3%|▋                          | 8/313 [00:02<02:02,  2.48it/s]