In [13]:
import numpy as np
import torch
import tqdm
import tqdm.notebook  # "import tqdm" alone does not load the notebook submodule that train_model uses
from torch import nn
from torch.utils.data import DataLoader, RandomSampler

from BOW_data_format import prepare_data

In [None]:
# prepare_data() returns a mapping; pull out the three dataset splits and the
# vocabulary under the names the rest of the notebook uses.
prepared = prepare_data()
train_data, validation_data, test_data, vocab = (
    prepared[key] for key in ('train', 'validation', 'test', 'vocab')
)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


## Create Data Loaders

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def bag_of_words_collate(batch):
    """Collate samples into a token-count matrix and a label tensor.

    Args:
        batch: Sequence of samples; ``sample[0]`` is an array of non-negative
            integers (counted with ``np.bincount``) and ``sample[1]`` is the
            label.

    Returns:
        ``(data, labels)`` on ``device``. ``data`` is a LongTensor with one
        row of counts per sample, at least ``len(vocab)`` columns wide and
        zero-padded to the longest row. ``labels`` is a FloatTensor with all
        size-1 dimensions squeezed away (0-dim for a single-sample batch).
    """
    vocab_size = len(vocab)
    count_vectors = []
    targets = []
    for sample in batch:
        counts = np.bincount(sample[0], minlength=vocab_size)
        count_vectors.append(torch.LongTensor(counts))
        targets.append(sample[1])

    data = nn.utils.rnn.pad_sequence(count_vectors, batch_first=True)
    labels = torch.FloatTensor(targets).squeeze()
    return data.to(device), labels.to(device)

In [None]:
# Training batch size; the validation and test loaders below always yield one
# sample per batch.
BATCH_SIZE = 1

train_iter = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_data),
    collate_fn=bag_of_words_collate,
)
validation_iter = DataLoader(
    validation_data,
    batch_size=1,
    sampler=RandomSampler(validation_data),
    collate_fn=bag_of_words_collate,
)
test_iter = DataLoader(
    test_data,
    batch_size=1,
    sampler=RandomSampler(test_data),
    collate_fn=bag_of_words_collate,
)

## Make the Model

In [None]:
class BagOfWords(nn.Module):
    """Binary bag-of-words classifier over per-document token counts.

    Each document is represented by the count-weighted mean of its word
    embeddings; that vector goes through dropout, a single linear unit and a
    sigmoid, so the model outputs probabilities (not logits).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each word embedding.
        dropout: Dropout probability applied to the pooled document vector
            (only active in training mode).
    """

    def __init__(self, vocab_size, embedding_dim, dropout=0.1):
        super(BagOfWords, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(embedding_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return the predicted probability for each document.

        Args:
            x: Integer tensor of shape ``(batch, vocab_size)`` where
                ``x[b, i]`` is how often token ``i`` occurs in document ``b``
                (the layout produced by ``bag_of_words_collate``).

        Returns:
            Tensor of probabilities with size-1 dimensions squeezed away:
            shape ``(batch,)``, or 0-dim when ``batch == 1``.

        Raises:
            ValueError: If ``x`` does not have exactly ``vocab_size`` columns.
        """
        table = self.embedding.weight
        if x.size(1) != table.size(0):
            raise ValueError(
                f"expected {table.size(0)} count columns, got {x.size(1)}"
            )
        # The columns of x are counts, not token ids, so they must weight the
        # embedding rows rather than index into the table.
        counts = x.to(table.dtype)
        # clamp keeps an all-zero (empty) document from dividing by zero.
        totals = counts.sum(dim=1, keepdim=True).clamp(min=1)
        pooled = (counts @ table) / totals
        out = self.linear(self.dropout(pooled))
        # Bare squeeze() keeps the output shape the rest of the notebook expects.
        return self.sig(out).squeeze()

## Evaluation

In [None]:
def eval_model(model, data_iter):
    """Evaluate a binary classifier and compute per-class recall, precision and F1.

    The model is put into eval mode (and left there) and run without gradient
    tracking. Its outputs are rounded to obtain the predicted class, so they
    are expected to be probabilities.

    Args:
        model: Module called as ``model(data)`` for every batch.
        data_iter: Iterable yielding ``(data, labels)`` batches of any batch
            size; outputs and labels may be 0-dim for single-sample batches.

    Returns:
        ``(pred, true, recalls, precisions, f1_scores)``. ``pred`` and ``true``
        are 1-D tensors covering every sample. Each list holds two Python
        floats: index 0 for class 0 and index 1 for class 1. A metric whose
        denominator is zero is reported as ``0.0``.
    """
    model.eval()
    predictions, all_labels = [], []
    # no_grad: otherwise every stored prediction keeps its autograd graph alive.
    with torch.no_grad():
        for data, labels in data_iter:
            # Flatten so 0-dim and (batch,) results concatenate the same way.
            predictions.append(model(data).reshape(-1))
            all_labels.append(labels.reshape(-1))
    pred = torch.cat(predictions).round()
    true = torch.cat(all_labels)

    recalls = []
    precisions = []
    f1_scores = []
    for selected_class in range(2):
        predicted_here = pred == selected_class
        actually_here = true == selected_class
        # Plain ints make the ratios below true divisions yielding floats.
        tp = int((predicted_here & actually_here).sum())
        fp = int((predicted_here & ~actually_here).sum())
        fn = int((~predicted_here & actually_here).sum())
        recall = tp / (tp + fn) if tp + fn != 0 else 0.0
        precision = tp / (tp + fp) if tp + fp != 0 else 0.0
        if precision + recall != 0:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0.0
        recalls.append(recall)
        precisions.append(precision)
        f1_scores.append(f1)
    return pred, true, recalls, precisions, f1_scores

def eval_on_test_set(model):
    """Print per-class and averaged recall, precision and F1 on the test set.

    Runs ``eval_model`` on the global ``test_iter`` and prints one line for
    class 0 ("Down"), one for class 1 ("Up") and one with the unweighted
    average of the two.

    Args:
        model: Model passed through to ``eval_model``.
    """
    _, _, recalls, precisions, f1_scores = eval_model(model, test_iter)

    rows = [
        ("Down", recalls[0], precisions[0], f1_scores[0]),
        ("Up", recalls[1], precisions[1], f1_scores[1]),
        ("AVERAGE", sum(recalls) / 2, sum(precisions) / 2, sum(f1_scores) / 2),
    ]
    report = ["Run on Test Data:"]
    for label, recall, precision, f1 in rows:
        report.append(f"    {label}: Recall: {recall}\tPrecision: {precision}\tF1: {f1}")
    print("\n".join(report))

def eval_summary(epoch):
    """Print per-class and averaged validation metrics for the global ``model``.

    Runs ``eval_model`` on the global ``model`` and ``validation_iter`` and
    prints one line for class 0 ("Down"), one for class 1 ("Up") and one with
    the unweighted average of the two.

    Args:
        epoch: Value shown in the report header; it does not influence the
            evaluation itself.
    """
    _, _, recalls, precisions, f1_scores = eval_model(model, validation_iter)

    report = [f"Epoch {epoch} Validation:"]
    for idx, label in enumerate(("Down", "Up")):
        report.append(
            f"{label}: Recall: {recalls[idx]}\tPrecision: {precisions[idx]}\tF1: {f1_scores[idx]}"
        )
    report.append(
        f"AVERAGE: Recall: {sum(recalls) / 2}\tPrecision: {sum(precisions) / 2}\tF1: {sum(f1_scores) / 2}"
    )
    print("\n".join(report))

## Training Loop

In [None]:

def train_model(model, data_iter, epochs, optimizer, scheduler, loss_func):
    """Train ``model`` for ``epochs`` passes over ``data_iter``.

    Args:
        model: Module to optimise; called as ``model(data)``.
        data_iter: Re-iterable yielding ``(data, labels)`` batches.
        epochs: Number of passes over ``data_iter``.
        optimizer: Optimizer holding ``model``'s parameters.
        scheduler: Scheduler whose ``step`` takes a metric (for example
            ``ReduceLROnPlateau``); stepped once per epoch with that epoch's
            mean batch loss.
        loss_func: Callable ``loss_func(output, labels)`` returning a scalar
            tensor.

    Returns:
        List with the mean batch loss of every epoch, in order.

    Raises:
        ValueError: If ``data_iter`` yields no batches.
    """
    epoch_average_losses = []
    with tqdm.notebook.trange(epochs, desc='training', unit='epoch') as epoch_iter:
        for _ in epoch_iter:
            # Re-assert training mode each epoch in case an evaluation helper
            # switched the model to eval mode in between.
            model.train()
            epoch_loss = 0.0
            num_batches = 0
            for num_batches, (data, labels) in enumerate(data_iter, start=1):
                optimizer.zero_grad()
                output = model(data)
                loss = loss_func(output, labels.squeeze())
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                epoch_iter.set_postfix(mean_epoch_loss=epoch_loss / num_batches)

            if num_batches == 0:
                raise ValueError("data_iter yielded no batches; cannot compute an epoch loss")

            avg_epoch_loss = epoch_loss / num_batches
            # Step on the same mean that is recorded in the history, so the
            # metric does not scale with the number of batches.
            scheduler.step(avg_epoch_loss)
            epoch_average_losses.append(avg_epoch_loss)

    return epoch_average_losses

In [None]:
from torch.optim import SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Number of passes over the training loader; forwarded to train_model as `epochs`.
EPOCHS = 50

# Pre-Training Stats

In [None]:
# Untrained baseline: build the classifier with a 1000-dimensional embedding
# and report test and validation metrics before any optimisation step.
model = BagOfWords(vocab_size=len(vocab), embedding_dim=1000)
model = model.to(device)
eval_on_test_set(model)
print('\n')
eval_summary(0)

# Train Model

In [None]:
# BagOfWords.forward already ends in a sigmoid, so its outputs are
# probabilities. BCEWithLogitsLoss would apply a second sigmoid on top of
# them; BCELoss is the criterion that takes probabilities directly.
loss_func = nn.BCELoss()
optimizer = SGD(model.parameters(), lr=0.5)
# Lower the learning rate once the epoch loss has stopped improving for
# more than 3 consecutive epochs.
scheduler = ReduceLROnPlateau(optimizer, patience=3)
history = train_model(model, train_iter, EPOCHS, optimizer, scheduler, loss_func)

In [None]:
# Persist only the learned parameters. Pickling the whole module ties the file
# to this notebook's class definition, and torch.load's weights_only=True
# default (PyTorch 2.6 and later) refuses to unpickle a custom module.
torch.save(model.state_dict(), 'bag_of_words.pt')

# Rebuild an identically shaped model and load the saved weights into it.
bag = BagOfWords(
    model.embedding.num_embeddings,
    model.embedding.embedding_dim,
    dropout=model.dropout.p,
).to(device)
bag.load_state_dict(torch.load('bag_of_words.pt', map_location=device))

# Post-Training Stats

In [None]:
# eval_summary evaluates the global `model` on the validation loader, while the
# test-set evaluation runs on `bag`, the copy reloaded from 'bag_of_words.pt'.
eval_summary(EPOCHS)
print('\n')
eval_on_test_set(bag)