In [1]:
import os
import re
import numpy as np 
from sklearn.metrics import accuracy_score, confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import string

import transformers
from transformers import BertTokenizer, BertModel

import torch
from torch import cuda
from tqdm import tqdm

import pandas as pd
from collections import Counter

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Default to the CPU and switch to the GPU only when PyTorch reports one.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'
device

  torch.utils._pytree._register_pytree_node(
2024-04-26 03:17:04.547935: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 03:17:04.547970: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 03:17:04.549062: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-26 03:17:04.555295: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

# Static Config
These values carry assumptions that the code below depends on; changing them may require code changes in the functions that follow.

In [2]:
# Number of output classes; signal_mapping in vwap_signals emits the labels 0, 1 and 2.
NUM_OUT = 3

# Hugging Face checkpoint name used for both the model weights and the tokenizer.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# MODEL_LAYER = BertModel.from_pretrained(MODEL)
# Load the checkpoint with a sequence-classification head sized to NUM_OUT.
MODEL_LAYER = AutoModelForSequenceClassification.from_pretrained(
    MODEL, 
    num_labels=NUM_OUT,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    # NOTE(review): presumably allows the head to be re-initialised when NUM_OUT
    # differs from the checkpoint's label count — confirm against the transformers docs.
    ignore_mismatched_sizes=True
)
# Tokenizer matching the checkpoint above.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Utilities
Below are utilities used across the codebase, in one place.

In [3]:
def round_to_nearest_window(seconds):
    """Snap a time in seconds down to the start of its TIME_WINDOW-sized bucket.

    Despite the name, the value is floored (never rounded up). The result is a
    float because ``np.floor`` returns a floating-point value.
    """
    window_count = np.floor(seconds / TIME_WINDOW)
    return window_count * TIME_WINDOW

def convert_raw_messages(df, start_time, end_time, min_len=None):
    """Normalise a raw message export into a ``time`` / ``content`` frame.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Raw messages with at least the columns ``Date`` and ``Content``.
            Any other columns are discarded.
        start_time: Inclusive lower bound, in seconds since the Unix epoch.
        end_time: Inclusive upper bound, in seconds since the Unix epoch.
        min_len: Minimum number of characters a message must have to be kept.
            Defaults to the notebook-level ``MIN_MESSSAGE_LEN``.

    Returns:
        A DataFrame with ``time`` (float seconds since epoch) and ``content``
        columns, restricted to the requested time range and minimum length.
        Rows whose content is missing are dropped by the length filter.
    """
    if min_len is None:
        min_len = MIN_MESSSAGE_LEN

    # Select (rather than drop in place) so the caller's frame is not mutated.
    kept_columns = [col for col in df.columns if col in ('Date', 'Content')]
    messages = df[kept_columns].rename(columns={'Date': 'time', 'Content': 'content'})

    # Convert to seconds since epoch via timedelta arithmetic. Unlike
    # ``astype(int) / 1e9`` this does not depend on the platform's default
    # integer width or on the datetime resolution pandas happens to infer.
    # Naive timestamps are interpreted as UTC, matching the previous behaviour.
    epoch = pd.Timestamp('1970-01-01', tz='UTC')
    seconds = (pd.to_datetime(messages['time'], utc=True) - epoch) / pd.Timedelta(seconds=1)
    messages = messages.assign(time=seconds)

    in_range = messages['time'].between(start_time, end_time)
    # Comparisons against NaN lengths are False, so missing content is excluded.
    long_enough = messages['content'].str.len() >= min_len

    return messages[in_range & long_enough]

### Signal Process Functions

In [4]:
def vwap_signals(df, sec_window, delta_buffer=None):
    """Turn raw trades into a per-window up / flat / down signal based on VWAP.

    The input frame is not modified.

    Args:
        df: Trades with the columns ``time`` (seconds since epoch), ``price``
            and ``volume``.
        sec_window: Width of each resampling window, in seconds.
        delta_buffer: Relative VWAP change (as a fraction) at or below which a
            window is labelled flat. Defaults to the notebook-level
            ``DELTA_BUFFER``.

    Returns:
        A tuple ``(signals, start_timestamp, end_timestamp)`` where ``signals``
        is an integer Series indexed by window start (seconds since epoch) with
        values 0 (VWAP fell), 1 (change within the buffer) or 2 (VWAP rose),
        and the two timestamps are the first and last index values, or ``None``
        when there are no signals.

    NOTE(review): each window's label compares that window's VWAP with the
    previous window's, i.e. it describes the move *into* the window rather than
    the move after it — confirm this is the intended target.
    """
    if delta_buffer is None:
        delta_buffer = DELTA_BUFFER

    if df.empty:
        return pd.Series(dtype='int64'), None, None

    # Build a fresh, time-indexed frame instead of re-indexing the caller's data.
    timestamps = pd.DatetimeIndex(pd.to_datetime(df['time'], unit='s'), name='time')
    trades = pd.DataFrame(
        {
            'notional': (df['price'] * df['volume']).to_numpy(),
            'volume': df['volume'].to_numpy(),
        },
        index=timestamps,
    )

    # VWAP per window = sum(price * volume) / sum(volume).
    totals = trades.resample(f'{sec_window}s').sum()
    has_volume = totals['volume'] != 0
    # Windows with no trades (or zero total volume) have no defined VWAP.
    vwap_values = (totals['notional'] / totals['volume']).where(has_volume)

    # Carry the last known VWAP forward; leading gaps fall back to zero.
    vwap_values = vwap_values.ffill().fillna(0)

    # Relative change versus the previous window, guarded against a zero base.
    previous = vwap_values.shift(1)
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = (vwap_values.diff() / previous).to_numpy(dtype=float)
    percentage_changes = np.where(previous.to_numpy() != 0, ratio, 0.0)
    percentage_changes = np.nan_to_num(percentage_changes, nan=0.0, posinf=0.0, neginf=0.0)

    # 1 = within the buffer, 2 = up, 0 = down.
    labels = np.where(
        np.abs(percentage_changes) <= delta_buffer,
        1,
        np.where(percentage_changes > 0, 2, 0),
    )
    signals = pd.Series(labels, index=vwap_values.index, dtype='int64')

    # Convert index to seconds from epoch
    signals.index = (signals.index - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    start_timestamp = signals.index[0] if not signals.empty else None
    end_timestamp = signals.index[-1] if not signals.empty else None

    return signals, start_timestamp, end_timestamp

### Message Process Functions

In [5]:
def noop(df):
    """Identity message processor: hand back ``df`` itself, unchanged."""
    return df

def contains_keywords(df):
    """Keep only the rows whose ``content`` mentions one of the filter words.

    Matching is case-insensitive, ignores ASCII punctuation, and is a plain
    substring test, so e.g. "sol" also matches inside longer words.

    Args:
        df: DataFrame with a ``content`` column.

    Returns:
        The subset of ``df`` whose content contains at least one keyword.
        Rows whose content is not a string (for example missing values) are
        excluded rather than raising.
    """
    filterWords = ["sol", "solana", "pump", "dump", "pumping", "dumping", "bullish", "bearish"]
    # Build the punctuation-stripping table once instead of once per message.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def filterFn(text):
        if not isinstance(text, str):
            return False
        text = text.lower().translate(strip_punctuation)
        return any(word in text for word in filterWords)

    # Force a boolean mask so the selection is always row-wise, even when the
    # frame is empty and ``apply`` cannot infer a dtype from its results.
    mask = df['content'].apply(filterFn).astype(bool)
    return df[mask]

# Config

In [6]:
# Training hyperparameters.
BATCH_SIZE = 64        # batch size for both the training and the test DataLoader
EPOCHS = 1             # number of passes over the training set
LEARNING_RATE = 2e-05  # learning rate passed to the AdamW optimizer

# NOTE: the triple "S" in MESSSAGE is a typo, but the names are referenced elsewhere
# in the notebook, so they must be renamed everywhere at once if corrected.
MIN_MESSSAGE_LEN = 16  # messages with fewer characters are dropped in convert_raw_messages
MAX_MESSSAGE_LEN = 64  # tokenizer max_length (tokens, not characters) for padding/truncation
TIME_WINDOW = 60 * 30 # window width in seconds (30 minutes); previously tried: 60 * 2
DELTA_BUFFER = 0.001   # relative VWAP change at or below which a window is labelled flat (1)

# Directories scanned for input CSV files.
TRADE_DATA_PATH = "trade_data"
MESSAGE_DATA_PATH = "discord_data"

# Pluggable processing steps for trades and messages.
SIGNAL_PROCESS_FN = vwap_signals
MESSAGE_PROCESS_FN = noop

# Data Loading and Preprocessing

### Load all data directories

In [7]:
# os.listdir returns entries in arbitrary order, so sort them: the order of the
# message files determines the row order of the concatenated frame and therefore
# which rows the seeded train/test split selects.
trade_data_raw_files = [
    os.path.join(TRADE_DATA_PATH, filename)
    for filename in sorted(os.listdir(TRADE_DATA_PATH))
    if filename.endswith('.csv')
]

message_data_raw_files = [
    os.path.join(MESSAGE_DATA_PATH, filename)
    for filename in sorted(os.listdir(MESSAGE_DATA_PATH))
    if filename.endswith('.csv')
]

(len(trade_data_raw_files), len(message_data_raw_files))

(658, 3)

In [8]:
def load_trade_data(filename):
    """Read a header-less trade CSV and convert it into per-window signals.

    Args:
        filename: Path to a CSV whose columns are, in order, time, price and volume.

    Returns:
        Whatever the configured ``SIGNAL_PROCESS_FN`` returns for the trades and
        ``TIME_WINDOW`` — for ``vwap_signals`` that is
        ``(signals, start_timestamp, end_timestamp)``.
    """
    data = pd.read_csv(filename, names=["time", "price", "volume"])
    # Go through the configurable hook (as load_messages does with
    # MESSAGE_PROCESS_FN) instead of calling vwap_signals directly.
    return SIGNAL_PROCESS_FN(data, TIME_WINDOW)

# Build the path from the configured directory rather than repeating its name.
trade_data, start_timestamp, end_timestamp = load_trade_data(
    os.path.join(TRADE_DATA_PATH, "SOLUSD.csv")
)

trade_data.head()

time
1623943800    1
1623945600    0
1623947400    0
1623949200    0
1623951000    0
dtype: int64

In [9]:
def load_messages(filename, start_time, end_time):
    """Read one message CSV, normalise it, and apply the configured message filter.

    Args:
        filename: Path to the raw message CSV.
        start_time: Lower time bound forwarded to ``convert_raw_messages``.
        end_time: Upper time bound forwarded to ``convert_raw_messages``.

    Returns:
        The result of ``MESSAGE_PROCESS_FN`` applied to the converted messages.
    """
    raw_messages = pd.read_csv(filename)
    return MESSAGE_PROCESS_FN(convert_raw_messages(raw_messages, start_time, end_time))

# Load every message file first, then concatenate once (avoids re-copying the
# accumulated frame on each iteration).
message_frames = [
    load_messages(messages_file, start_timestamp, end_timestamp)
    for messages_file in message_data_raw_files
]

# Fail with a clear message instead of an AttributeError on None further down.
if not message_frames:
    raise FileNotFoundError(f"No message CSV files found in {MESSAGE_DATA_PATH!r}")

message_data = (
    pd.concat(message_frames, ignore_index=True)
    .dropna(subset=['content'])
    .reset_index(drop=True)
)

message_data.head()

Unnamed: 0,time,content
0,1624053000.0,"Anyone know good, safe farms on Polygon? I am ..."
1,1624054000.0,If you're looking for 'safe farm' in particula...
2,1624054000.0,+ Sushi's and Quick's not bad either
3,1624056000.0,dope yeah I hadn't looked at curve (idk why) b...
4,1624056000.0,This is my past 2 days in trading SMH .... hop...


In [10]:
class TradeMessageDataLoader(torch.utils.data.Dataset):
    """Dataset pairing each message with the trade signal of its time window.

    Despite the name this is a ``Dataset``; wrap it in a
    ``torch.utils.data.DataLoader`` to obtain batches.
    """

    def __init__(self, text, labels, tokenizer):
        """Store the inputs without copying them.

        Args:
            text: DataFrame with ``time`` and ``content`` columns, indexed 0..n-1
                (``__getitem__`` looks rows up with ``.loc[index]``).
            labels: Series mapping window start (integer seconds since epoch)
                to the class label.
            tokenizer: Tokenizer exposing ``encode_plus``.
        """
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels

    def __len__(self):
        """Return the number of messages."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize message ``index`` and look up the label for its time window.

        Returns:
            A dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``MAX_MESSSAGE_LEN``), ``targets`` (the label as a long
            tensor) and ``raw_targets`` (the label as stored in ``labels``).

        Raises:
            KeyError: If the message's window start is not present in ``labels``.
        """
        # Read the fields by name so the result does not depend on column order.
        row = self.text.loc[index]
        time = row['time']
        text = row['content']

        # Pad/truncate every message to a fixed MAX_MESSSAGE_LEN tokens.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_MESSSAGE_LEN,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        # round_to_nearest_window returns a float; the label index holds integer
        # seconds, so convert explicitly instead of relying on float-key lookup.
        window_start = int(round_to_nearest_window(time))
        target = self.targets.loc[window_start]
        target_tensor = torch.tensor(target, dtype=torch.long)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': target_tensor,
            'raw_targets': target
        }

In [11]:
# NOTE(review): this is a random row-level split. Messages from the same time
# window share one label, so the same window can appear in both the training and
# the test set — consider a time-based split if that leakage matters.
training_data_df, test_data_df = train_test_split(message_data, test_size=0.3, random_state=20)

# The dataset looks rows up by position-like labels, so re-index each split from 0.
# Reassign rather than mutate in place: the split results are slices of message_data.
training_data_df = training_data_df.reset_index(drop=True)
training_data = TradeMessageDataLoader(training_data_df, trade_data, TOKENIZER)

test_data_df = test_data_df.reset_index(drop=True)
test_data = TradeMessageDataLoader(test_data_df, trade_data, TOKENIZER)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not need shuffling; a fixed order keeps the collected
# predictions and targets reproducible from run to run.
test_params = {'batch_size': BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

print(f'Training Data Entries: {len(training_data)}')
print(f'Test Data Entries: {len(test_data)}')

Training Data Entries: 101896
Test Data Entries: 43670


### Model Setup

In [12]:
# class BERTClass(torch.nn.Module):
#     def __init__(self, NUM_OUT):
#         super(BERTClass, self).__init__()
                   
#         self.l1 = BertModel.from_pretrained(MODEL)
# #       self.l1 = RobertaModel.from_pretrained("FacebookAI/roberta-base")
# #       self.pre_classifier = torch.nn.Linear(768, 256)
#         self.classifier = torch.nn.Linear(768, NUM_OUT)
# #       self.dropout = torch.nn.Dropout(0.5)
#         self.softmax = torch.nn.Softmtput)
#         return output

# class CustomModel(torch.nn.Module):
#     def __init__(self):
#         super(CustomModel, self).__init__()
#         self.base = MODEL_LAYER
#         self.n1 = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.3)
#         self.classifier = torch.nn.Linear(768, NUM_OUT)
        
#         # Inference Only
#         self.softmax = torch.nn.Softmax(dim=1)
    
#     def forward(self, input_ids, attention_mask=None, token_type_ids=None):
#         outputs = self.base(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
#         pooled_output = outputs.pooler_output
#         n1 = self.n1(pooled_output)
#         pooled_output = self.dropout(n1)
#         logits = self.classifier(pooled_output)
#         return logits
    
#     # Apply softmax to final output
#     def predict(self, ids, mask, token_type_ids):
#         logits = self.forward(ids, mask, token_type_ids)
#         probabilities = self.softmax(logits)
#         return probabilities

# model = CustomModel()
# Use the pretrained classifier directly and move it to the selected device.
model = MODEL_LAYER
model.to(device)

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = EPOCHS * len(training_loader)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)

# Linear decay of the learning rate with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [13]:
def loss_fn(outputs, targets):
    """Return the cross-entropy loss between ``outputs`` (logits) and ``targets``.

    Uses ``torch.nn.CrossEntropyLoss`` with its default settings.
    """
    criterion = torch.nn.CrossEntropyLoss()
    return criterion(outputs, targets)

def train(model, training_loader, optimizer):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Model whose forward pass accepts ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` and returns an object with ``.loss``.
        training_loader: Iterable of batches shaped like the dicts produced by
            ``TradeMessageDataLoader``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        A 0-dim tensor holding the loss averaged over all batches of the epoch
        (NaN when the loader yields no batches), so ``.item()`` keeps working.

    Note:
        The learning-rate scheduler is the notebook-level ``scheduler`` and is
        stepped once per batch.
    """
    model.train()
    running_loss = torch.zeros((), device=device)
    num_batches = 0

    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        optimizer.zero_grad()

        # Keyword arguments so a different forward() parameter order cannot
        # silently swap the mask and the token type ids.
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            labels=targets,
        )
        loss = outputs.loss

        loss.backward()

        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Accumulate on-device; detaching avoids holding on to the graph.
        running_loss += loss.detach()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained on; report NaN rather than failing on an unbound loss.
        return torch.full((), float('nan'), device=device)
    return running_loss / num_batches
    
def validation(model, testing_loader):
    """Predict a class for every example in ``testing_loader``.

    Args:
        model: Model whose forward pass accepts ``input_ids``, ``attention_mask``
            and ``token_type_ids`` and returns an object with ``.logits``.
        testing_loader: Iterable of batches shaped like the dicts produced by
            ``TradeMessageDataLoader``.

    Returns:
        A tuple ``(fin_outputs, fin_targets)`` of equally long lists of numpy
        integer scalars: the predicted class (argmax of the logits) and the true
        class for each example, in loader order.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids,
            )

            logits = outputs.logits.detach().cpu().numpy()
            predictions = np.argmax(logits, axis=1).flatten()
            # Convert targets to numpy as well, so both lists hold the same kind
            # of element (numpy scalars still support ``.item()``) and can be
            # passed straight to scikit-learn.
            targets = torch.as_tensor(data['raw_targets']).cpu().numpy()

            fin_outputs.extend(predictions)
            fin_targets.extend(targets)
    return fin_outputs, fin_targets

In [None]:
for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

    outputs, targets = validation(model, testing_loader)
    # accuracy_score expects (y_true, y_pred); pass plain ints so the call does
    # not depend on whether the lists hold tensors or numpy scalars.
    test_accuracy = accuracy_score(
        [int(t) for t in targets],
        [int(o) for o in outputs],
    )
    print('accuracy on test set {}'.format(test_accuracy))

  7%|███████▊                                                                                                             | 107/1593 [00:43<10:13,  2.42it/s]

### Save and load model (only for inference)

In [None]:
# torch.save(model, "model_v2.pt")

In [None]:
# model = torch.load("model_v1.pt")
# model.to(device)

## Evaluation

In [None]:
# int() works for 0-dim tensors, numpy scalars and plain ints alike, whereas
# .item() is missing on plain Python ints.
targets_list = [int(t) for t in targets]
# Number of test examples per true class.
targets_counter = Counter(targets_list)
targets_counter

### Confusion Matrix

In [None]:
# Fix the label set so the matrix is always NUM_OUT x NUM_OUT, even if a class
# never occurs in the targets or the predictions.
class_labels = list(range(NUM_OUT))
cm = confusion_matrix(
    [int(t) for t in targets],
    [int(o) for o in outputs],
    labels=class_labels,
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot();

### Label Accuracy Rate
Given that the model predicts a label, the probability that the prediction is correct (per-class precision).

In [None]:
def calculate_accuracy_rate(label, predictions=None, truths=None):
    """Fraction of the predictions equal to ``label`` that are correct (precision).

    Args:
        label: Class to evaluate.
        predictions: Predicted classes. Defaults to the notebook-level ``outputs``.
        truths: True classes, aligned with ``predictions``. Defaults to the
            notebook-level ``targets_list``.

    Returns:
        A float in [0, 1], or the string ``"N/A"`` when ``label`` was never predicted.
    """
    if predictions is None:
        predictions = outputs
    if truths is None:
        truths = targets_list

    count = 0
    num_wrong = 0
    for output, target in zip(predictions, truths):
        if output != label:
            continue
        count += 1
        if output != target:
            num_wrong += 1

    if count == 0:
        return "N/A"
    return 1.0 - (num_wrong / count)

# Report the per-class rate for every class the model can output.
for label in range(NUM_OUT):
    rate = calculate_accuracy_rate(label)
    print(f"Label Accuracy Rate for {label}: {rate}")

### Label Accuracy Probability
Given that an example's true label is a particular class, the probability that the model predicts that class (per-class recall).

In [None]:
def calculate_error_rate_per_label(label, predictions=None, truths=None):
    """Fraction of the examples whose true class is ``label`` that are predicted correctly.

    Despite the name, this returns the share of *correct* predictions (recall),
    not the error rate.

    Args:
        label: Class to evaluate.
        predictions: Predicted classes. Defaults to the notebook-level ``outputs``.
        truths: True classes, aligned with ``predictions``. Defaults to the
            notebook-level ``targets_list``.

    Returns:
        A float in [0, 1], or the string ``"N/A"`` when no example has ``label``
        as its true class.
    """
    if predictions is None:
        predictions = outputs
    if truths is None:
        truths = targets_list

    count = 0
    num_wrong = 0
    for output, target in zip(predictions, truths):
        if target != label:
            continue
        count += 1
        if output != target:
            num_wrong += 1

    if count == 0:
        return "N/A"
    return 1 - (num_wrong / count)

# Report the per-class probability for every class the model can output.
for label in range(NUM_OUT):
    probability = calculate_error_rate_per_label(label)
    print(f"Label Accuracy Probability for {label}: {probability}")

### Most Common Baseline

In [None]:
# Accuracy of always predicting the most frequent true class: the largest
# per-class count (never below zero) divided by the number of test examples.
tmp = max([0] + [targets_counter[label] for label in range(NUM_OUT)])

print(f"Most Common Baseline: {tmp / len(targets_list)}")