Because of GPU memory limits on my laptop, this model was trained on Google Colab. Colab provides a T4 GPU free of charge if you want to run this notebook there. The exact same code was run on Colab and can be viewed here:  
https://colab.research.google.com/drive/1o0dudLqBLVJ4Eq0zEIlBCL5zWu9nP_bp#scrollTo=su3veNmRFwbn

In [None]:
import random

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from sklearn.metrics import f1_score, classification_report

import torch
from torch.utils.data import DataLoader


from transformers import (BertForSequenceClassification, 
                          BertTokenizerFast,
                          PreTrainedTokenizerFast,
                          DataCollatorWithPadding,
                          AdamW,
                          get_scheduler
                         )
import datasets

In [None]:
# A single seed shared by every random number generator the notebook uses
# (Python's `random`, NumPy and PyTorch).
SEED = 3141
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

In [None]:
# --- Modeling hyperparameters ---
BERT = "nlpaueb/sec-bert-base"  # checkpoint id handed to from_pretrained
BSZ = 16                        # batch size for both DataLoaders
MAX_LEN = 256                   # tokenizer pads/truncates to this length
LR = 3e-4                       # learning rate given to the optimizer
EPOCHS = 5                      # passes over the training split
WARMUP_STEPS = 1_000            # warm-up steps for the LR scheduler

# --- Filesystem locations ---
WEIGHT_DIR = "../weights/financial-sentiment/"  # checkpoint output directory

In [None]:
# Prefer the GPU whenever PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
    print('Using GPU!')
else:
    DEVICE = 'cpu'
    print('Using CPU!')

In [None]:
# Load the "small_lite" configuration of the SEC financial-reports dataset and
# immediately carve a seeded subsample out of its train split: 10% of the rows
# for training and 1% for evaluation (the original note puts this at roughly
# 20k / 2k rows).
data = datasets.load_dataset(
    "JanosAudran/financial-reports-sec",
    "small_lite",
    split="train",
).train_test_split(train_size=0.1, test_size=0.01, seed=SEED)

# Fast tokenizer loaded from the same checkpoint id as the model (BERT).
tokenizer = BertTokenizerFast.from_pretrained(BERT)

def tokenize_fx(item, tokenizer: PreTrainedTokenizerFast=tokenizer):
    """Tokenize the ``sentence`` field of ``item`` to a fixed length.

    Args:
        item: Mapping with a ``'sentence'`` entry (a batch of sentences when
            used with ``Dataset.map(..., batched=True)``).
        tokenizer: Tokenizer to apply; defaults to the module-level one.

    Returns:
        The tokenizer's encoding, padded and truncated to ``MAX_LEN`` tokens
        and returned as PyTorch tensors.
    """
    encode_options = dict(
        truncation='longest_first',
        padding='max_length',
        max_length=MAX_LEN,
        is_split_into_words=False,
        return_tensors='pt',
    )
    return tokenizer(text=item['sentence'], **encode_options)

def get_label(item, label='30d'):
    """Extract one label column from a batch as floats.

    Args:
        item: Mapping whose ``'labels'`` entry can be turned into a
            ``pandas.DataFrame`` (e.g. a list of per-row label dicts).
        label: Name of the label column to keep.

    Returns:
        A one-entry dict ``{label: pandas.Series}`` holding the float values.
    """
    every_label = pd.DataFrame(item['labels'])
    # Cast the whole frame first, then narrow down to the requested column.
    as_float = every_label.astype(float)
    selected = as_float[[label]]
    return selected.to_dict(orient='series')

# Tokenize, attach the '30d' target, drop every raw column that is no longer
# needed, and expose the target under the name 'labels'.
tokenized_data = (
    data
    .map(tokenize_fx, batched=True)
    .map(get_label, batched=True)
    .remove_columns([
        'cik', 'sentence', 'section', 'filingDate',
        'docID', 'sentenceID', 'sentenceCount', 'labels',
    ])
    .rename_column('30d', 'labels')
)
# Return torch tensors placed on DEVICE when rows are accessed.
tokenized_data.set_format('torch', device=DEVICE)

In [None]:
# Check imbalance
# Mean of the labels per split; a split is flagged when that mean is more
# than 0.1 away from 0.5.
prop_pos_train = tokenized_data['train']['labels'].mean()
prop_pos_valid = tokenized_data['test']['labels'].mean()

for _split_name, _prop_pos in (('train', prop_pos_train), ('valid', prop_pos_valid)):
    if abs(0.5 - _prop_pos) > 0.1:
        # Fixed: the train branch previously used `prop_pos_train,item()`
        # (comma instead of dot), which raised NameError when it was reached.
        print(f"Too much imbalance in {_split_name} set {_prop_pos.item()}")
    else:
        print(f"Prop {_split_name}: {_prop_pos.item()}")

In [None]:
# Both loaders use the same batch size; only the training loader shuffles.
train_loader = DataLoader(tokenized_data['train'], batch_size=BSZ, shuffle=True)
val_loader = DataLoader(tokenized_data['test'], batch_size=BSZ)

In [None]:
# Sanity-check the first batch of each loader against the expected shapes.
# The three token-level fields are [BSZ, MAX_LEN]; the labels are [BSZ].
exp_shapes = {
    field: torch.Size([BSZ, MAX_LEN])
    for field in ('input_ids', 'token_type_ids', 'attention_mask')
}
exp_shapes['labels'] = torch.Size([BSZ])

for split_name, split_loader in (('train', train_loader), ('val', val_loader)):
    print(f"Checking {split_name} loader...")
    # Grab only the first batch; `batch` deliberately stays bound afterwards.
    for batch in split_loader:
        break
    for field, tensor in batch.items():
        act_shape = torch.tensor(tensor.shape)
        exp_shape = torch.tensor(exp_shapes[field])
        shapes_match = torch.equal(act_shape, exp_shape)
        assert shapes_match, f'\tSize mismatch for {field}! Got {act_shape}, expected {exp_shape}'
    print("\tAll shapes correct!")

In [None]:
# Model loading and other prep
# num_labels=1 requests a single-output head on top of the BERT checkpoint.
model = BertForSequenceClassification.from_pretrained(
    BERT,
    num_labels=1
)

# Copy the batch left over from the loader check onto the CPU, since the model
# is only moved to DEVICE later, in the training cell.
t_batch = {k: v.to('cpu') for k, v in batch.items()}

# Check the shape of the output. This is only a smoke test, so run it without
# gradient tracking to avoid building (and holding on to) an autograd graph.
with torch.no_grad():
    out = model(**t_batch)

out_shape = torch.tensor(out.logits.shape)
assert torch.equal(
    out_shape,
    torch.tensor([BSZ, 1])
), f'Output shape incorrect! Got {out_shape}, expected [{BSZ},1]'
print("Output shape good!")

# Labels were part of the batch, so the model output also carries a loss.
assert isinstance(out.loss.detach().item(), float), "Loss is not float!"
print("Loss is float!")

In [None]:
# One optimizer update per training batch, repeated for every epoch.
num_training_steps = len(train_loader) * EPOCHS
optimizer = AdamW(model.parameters(), lr=LR)

# Cosine learning-rate schedule with WARMUP_STEPS warm-up steps.
scheduler_options = {
    'optimizer': optimizer,
    'num_warmup_steps': WARMUP_STEPS,
    'num_training_steps': num_training_steps,
}
scheduler = get_scheduler("cosine", **scheduler_options)
print(f"Num Training Steps: {num_training_steps}")

In [None]:
# Clear cuda memory
# Runs right before the training cell; presumably meant to release GPU memory
# cached by the earlier cells so training starts with as much free memory as
# possible (the notebook header mentions GPU memory problems).
torch.cuda.empty_cache()

In [None]:
# Model training
#
# Runs EPOCHS passes over train_loader, evaluates on val_loader after every
# pass, and saves the weights to WEIGHT_DIR whenever the validation loss
# improves on the best value seen so far.
train_losses = []   # one float per training batch
val_losses = []     # mean validation batch loss, one entry per epoch
val_scores = []     # validation F1 score, one entry per epoch
best_loss = np.inf

# num_training_steps already covers every epoch, so only the validation
# batches have to be multiplied by EPOCHS to get the true number of updates.
pbar = tqdm(total=num_training_steps + EPOCHS * len(val_loader))

model.to(DEVICE)

for epoch in range(EPOCHS):
    # Training portion
    model.train()
    pbar.set_description(f"Epoch {epoch+1}: Train")
    for batch in train_loader:
        optimizer.zero_grad()

        out = model(**batch)
        loss = out.loss
        loss.backward()
        # Keep a plain float: storing the tensor would keep a graph-attached
        # object alive for every single training step.
        train_losses.append(loss.item())

        optimizer.step()
        scheduler.step()
        pbar.update(1)

    # Validate portion
    model.eval()
    pbar.set_description(f"Epoch {epoch+1}: Validate")
    epoch_val_preds = []
    epoch_val_true = []
    epoch_val_loss = 0.0
    for batch in val_loader:
        with torch.no_grad():
            out = model(**batch)
        # The head has a single unbounded output, so plain rounding can yield
        # values outside {0, 1}; clamp so only the two classes reach the
        # report (assumes binary 0/1 labels - TODO confirm against dataset).
        preds = out.logits.round().clamp(0, 1).long().flatten().tolist()
        labels = batch['labels'].long().flatten().tolist()

        epoch_val_preds += preds
        # Fixed: this previously extended the undefined `epoch_val_labels`.
        epoch_val_true += labels
        epoch_val_loss += out.loss.item()
        pbar.update(1)

    # Report the mean batch loss so the value does not scale with the number
    # of validation batches; guard against an empty validation loader.
    epoch_val_loss /= max(len(val_loader), 1)
    val_losses.append(epoch_val_loss)
    val_scores.append(f1_score(epoch_val_true, epoch_val_preds))

    # Checkpoint model if it improves
    if epoch_val_loss < best_loss:
        # Save straight from the current device: calling model.cpu() here
        # moved the model off the GPU and broke the following epoch, whose
        # batches still live on DEVICE.
        model.save_pretrained(WEIGHT_DIR)
        best_loss = epoch_val_loss

    print("Val Loss: {:.6f}".format(epoch_val_loss))
    print(classification_report(epoch_val_true, epoch_val_preds))

pbar.close()