In [1]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, RandomSampler
import tqdm
from BOW_data_format import prepare_data

In [2]:
prepared = prepare_data()
train_data = prepared['train']
validation_data = prepared['validation']
test_data = prepared['test']
vocab = prepared['vocab']
train_dates = prepared['train_dates']
validation_dates = prepared['validation_dates']
test_dates = prepared['test_dates']

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


## Create Data Loaders

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
def bag_of_words_collate(batch):
    data = nn.utils.rnn.pad_sequence([torch.LongTensor(np.bincount(x[0], minlength=len(vocab))) for x in batch], batch_first=True)
    labels = torch.FloatTensor([x[1] for x in batch])
    return data.to(device), labels.squeeze().to(device)

In [4]:
BATCH_SIZE = 16
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=RandomSampler(train_data), collate_fn=bag_of_words_collate)
validation_iter = DataLoader(validation_data, sampler=RandomSampler(validation_data), batch_size=1, collate_fn=bag_of_words_collate)
test_iter = DataLoader(test_data, sampler=RandomSampler(test_data), batch_size=1, collate_fn=bag_of_words_collate)

## Make the Model

In [5]:
class BagOfWords(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dropout=0.1):
        super(BagOfWords, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(embedding_dim, 1)
        self.sig = nn.Sigmoid()
    def forward(self, x):
        out = self.embedding(x)
        out = out.mean(1)
        out = self.linear(out).squeeze()
        if not self.training:
            out = self.sig(out)
        return out

## Evaluation

In [6]:
def eval_model(model, data_iter):
    model.eval()
    predictions, all_labels= [], []
    for (data, labels) in data_iter:
        out = model(data)
        predictions.append(out.unsqueeze(0))
        all_labels.append(labels.unsqueeze(0))
    pred = torch.cat(predictions, axis=-1).squeeze().round()
    true = torch.cat(all_labels, axis=-1).squeeze().round()
    recalls = []
    precisions = []
    f1_scores = []
    for selected_class in range(2):
        tp = ((pred == selected_class) & (true == selected_class)).sum()
        fp = ((pred == selected_class) & (true != selected_class)).sum()
        fn = ((pred != selected_class) & (true == selected_class)).sum()
        recall = tp / (tp + fn) if tp + fn != 0 else 0
        precision = tp / (tp + fp) if tp + fp != 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall != 0 else 0
        recalls.append(recall)
        precisions.append(precision)
        f1_scores.append(f1)
    return pred, true, recalls, precisions, f1_scores

def eval_on_test_set(model):
    _, _, recalls, precisions, f1_scores = eval_model(model, test_iter)
    print(f"""Run on Test Data:
    Down: Recall: {recalls[0]}\tPrecision: {precisions[0]}\tF1: {f1_scores[0]}
    Up: Recall: {recalls[1]}\tPrecision: {precisions[1]}\tF1: {f1_scores[1]}
    AVERAGE: Recall: {sum(recalls) / 2}\tPrecision: {sum(precisions) / 2}\tF1: {sum(f1_scores) / 2}""")

def eval_summary(epoch):
        _, _, recalls, precisions, f1_scores = eval_model(model, validation_iter)
        print(f"""Epoch {epoch} Validation:
Down: Recall: {recalls[0]}\tPrecision: {precisions[0]}\tF1: {f1_scores[0]}
Up: Recall: {recalls[1]}\tPrecision: {precisions[1]}\tF1: {f1_scores[1]}
AVERAGE: Recall: {sum(recalls) / 2}\tPrecision: {sum(precisions) / 2}\tF1: {sum(f1_scores) / 2}""")

## Training Loop

In [7]:
def train_model(model, data_iter, epochs, optimizer, scheduler, loss_func):
    epoch_average_losses = []
    with tqdm.notebook.trange(epochs, desc='training', unit='epoch') as epoch_iter:
        model.train()
        sig = nn.Sigmoid()
        for epoch in epoch_iter:
            epoch_loss = 0
            epoch_samples = 0
            for i, (data, labels) in enumerate(data_iter, start=1):
                do_again = True
                count = 0
                while do_again:
                    count += 1
                    optimizer.zero_grad()
                    output = model(data)
                    wrong = (sig(output).round() != labels)
                    num_wrong = wrong.sum().item()
                    loss = loss_func(output, labels.squeeze())
                    epoch_loss += loss.item()
                    epoch_samples += len(labels)
                    loss.backward()
                    optimizer.step()
                    do_again = count < 10 and num_wrong > len(labels) / 8
                    if do_again:
                        data = data[wrong]
                        labels = labels[wrong]


                # loss.backward()
                # optimizer.step()
                epoch_iter.set_postfix(mean_epoch_loss=epoch_loss / i)

            avg_epoch_loss = epoch_loss / epoch_samples
            # scheduler.step(epoch_loss)
            scheduler.step(avg_epoch_loss)
            epoch_average_losses.append(avg_epoch_loss)

    return epoch_average_losses

In [8]:
from torch.optim import SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau
EPOCHS = 50

# Pre Training Stats

In [9]:
model = BagOfWords(len(vocab), 1000).to(device)
eval_on_test_set(model)
print('\n')
eval_summary(0)

Run on Test Data:
    Down: Recall: 0.0	Precision: 0	F1: 0
    Up: Recall: 1.0	Precision: 0.6176470518112183	F1: 0.7636363506317139
    AVERAGE: Recall: 0.5	Precision: 0.30882352590560913	F1: 0.38181817531585693


Epoch 0 Validation:
Down: Recall: 0.0	Precision: 0	F1: 0
Up: Recall: 1.0	Precision: 0.375	F1: 0.5454545617103577
AVERAGE: Recall: 0.5	Precision: 0.1875	F1: 0.27272728085517883


# Train Model

In [10]:
loss_func = nn.BCEWithLogitsLoss()
optimizer = SGD(model.parameters(), lr=1)
scheduler = ReduceLROnPlateau(optimizer, patience=3, threshold=0.1)
history = train_model(model, train_iter, EPOCHS, optimizer, scheduler, loss_func)

training:   0%|          | 0/50 [00:00<?, ?epoch/s]

In [11]:
import os
if not os.path.isdir('../Saved_Models'):
    os.mkdir('../Saved_Models')
torch.save(model, '../Saved_Models/bag_of_words.pt')
bag = torch.load('../Saved_Models/bag_of_words.pt')

# Post Training Stats

In [12]:
eval_summary(EPOCHS)
print('\n')
eval_on_test_set(bag)

Epoch 50 Validation:
Down: Recall: 0.75	Precision: 0.8333333134651184	F1: 0.7894737124443054
Up: Recall: 0.75	Precision: 0.6428571343421936	F1: 0.692307710647583
AVERAGE: Recall: 0.75	Precision: 0.738095223903656	F1: 0.7408907413482666


Run on Test Data:
    Down: Recall: 0.7692307829856873	Precision: 0.38461539149284363	F1: 0.5128205418586731
    Up: Recall: 0.2380952388048172	Precision: 0.625	F1: 0.3448275923728943
    AVERAGE: Recall: 0.5036630034446716	Precision: 0.504807710647583	F1: 0.4288240671157837


# Write results to CSV

In [13]:
def run_on_data(model, data_iter):
    model.eval()
    predictions, all_labels= [], []
    for (data, labels) in data_iter:
        out = model(data)
        predictions.append(out.unsqueeze(0))
        all_labels.append(labels.unsqueeze(0))
    pred = torch.cat(predictions, axis=-1).squeeze()
    true = torch.cat(all_labels, axis=-1).squeeze().round()
    return pred, true


In [14]:
train_pred, train_labels = run_on_data(model, train_iter,)
valid_pred, valid_labels = run_on_data(model, validation_iter)
test_pred, test_labels = run_on_data(model, test_iter)

In [15]:
import pandas as pd
train_results = pd.DataFrame({"Labels": train_labels.to('cpu').detach().numpy(), "Predictions": train_pred.to('cpu').detach().numpy(), "Date": train_dates})
valid_results = pd.DataFrame({"Labels": valid_labels.to('cpu').detach().numpy(), "Predictions": valid_pred.to('cpu').detach().numpy(), "Date": validation_dates})
test_results = pd.DataFrame({"Labels": test_labels.to('cpu').detach().numpy(), "Predictions": test_pred.to('cpu').detach().numpy(), "Date": test_dates})

train_results.to_csv('../Results/train/BOW.csv', index=False)
valid_results.to_csv('../Results/validation/BOW.csv', index=False)
test_results.to_csv('../Results/test/BOW.csv', index=False)