In [1]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, RandomSampler
import tqdm
from BOW_data_format import prepare_data

In [2]:
prepared = prepare_data()
title_data = prepared['title_data']
train_data = prepared['train']
validation_data = prepared['validation']
test_data = prepared['test']
vocab = prepared['vocab']
PAD_ID = prepared['PAD_ID']

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
titles = nn.utils.rnn.pad_sequence([torch.LongTensor(title_data[i]) if i in title_data else torch.LongTensor([PAD_ID]) for i in range(max(title_data) + 1)], batch_first=True).contiguous()

## Create Data Loaders

In [4]:
def bag_of_words_collate(batch):
    batch_arr = np.array(batch).T
    data = titles[batch_arr[0]]
    labels = torch.FloatTensor(batch_arr[1]).to(device)
    return data.to(device), labels.squeeze()

In [50]:
BATCH_SIZE = 4
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=RandomSampler(train_data), collate_fn=bag_of_words_collate)
validation_iter = DataLoader(validation_data, sampler=RandomSampler(validation_data), batch_size=BATCH_SIZE, collate_fn=bag_of_words_collate, drop_last=False)
test_iter = DataLoader(test_data, sampler=RandomSampler(test_data), batch_size=BATCH_SIZE, collate_fn=bag_of_words_collate, drop_last=False)

## Make the Model

In [51]:
class BagOfWords(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dropout=0.4):
        super(BagOfWords, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(embedding_dim, 1)
        self.activation = nn.Sigmoid()
    def forward(self, x):
        out = self.logit(x)
        out = self.activation(out)
        return out
    def logit(self, x):
        out = self.embedding(x)
        out = self.dropout(out.mean(1))
        out = self.linear(out)
        return out


## Evaluation

In [105]:
def eval_model(model, data_iter):
    model.eval()
    predictions, all_labels= [], []
    for (data, labels) in data_iter:
        out = model(data)
        predictions.append(out.unsqueeze(0))
        all_labels.append(labels.unsqueeze(0))
    pred = torch.cat(predictions, axis=1).squeeze()
    pred = (pred - pred.mean() + 0.5).round().to('cpu')

    true = torch.cat(all_labels, axis=1).squeeze().to('cpu')
    recalls = []
    precisions = []
    f1_scores = []
    for selected_class in range(2):
        tp = ((pred == selected_class) & (true == selected_class)).sum()
        fp = ((pred == selected_class) & (true != selected_class)).sum()
        fn = ((pred != selected_class) & (true == selected_class)).sum()
        recall = tp / (tp + fn) if tp + fn != 0 else 0
        precision = tp / (tp + fp) if tp + fp != 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall != 0 else 0
        recalls.append(recall if recall == 0 else recall.item())
        precisions.append(precision if precision == 0 else precision.item())
        f1_scores.append(f1 if f1 == 0 else f1.item())

    return pred, true, recalls, precisions, f1_scores

def eval_on_test_set(model):
    _, _, recalls, precisions, f1_scores = eval_model(model, test_iter)
    print(f"""Run on Test Data:
    Down: Recall: {recalls[0]}\tPrecision: {precisions[0]}\tF1: {f1_scores[0]}
    Up: Recall: {recalls[1]}\tPrecision: {precisions[1]}\tF1: {f1_scores[1]}
    AVERAGE: Recall: {sum(recalls) / 2}\tPrecision: {sum(precisions) / 2}\tF1: {sum(f1_scores) / 2}""")

def eval_summary(epoch):
        _, _, recalls, precisions, f1_scores = eval_model(model, validation_iter)
        print(f"""Epoch {epoch} Validation:
Down: Recall: {recalls[0]}\tPrecision: {precisions[0]}\tF1: {f1_scores[0]}
Up: Recall: {recalls[1]}\tPrecision: {precisions[1]}\tF1: {f1_scores[1]}
AVERAGE: Recall: {sum(recalls) / 2}\tPrecision: {sum(precisions) / 2}\tF1: {sum(f1_scores) / 2}""")

## Training Loop

In [115]:
def train_model(model, data_iter, epochs, optimizer, scheduler, loss_func):
    epoch_average_losses = []
    with tqdm.notebook.trange(epochs, desc='training', unit='epoch') as epoch_iter:
        for epoch in epoch_iter:
            model.train()
            epoch_loss = 0
            epoch_samples = 0
            with tqdm.notebook.tqdm(data_iter, desc=f"epoch {epoch + 1}", unit='batch', total=len(data_iter)) as batch_iter:
                for i, (data, labels) in enumerate(batch_iter, start=1):
                    optimizer.zero_grad()
                    output = model.logit(data)
                    loss = loss_func(output.squeeze(), labels.squeeze())

                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.item()
                    epoch_samples += data.shape[0]
                    if i == len(batch_iter):
                        _, _, recalls, precisions, f1_scores = eval_model(model, validation_iter)
                        batch_iter.set_postfix(mean_epoch_loss=epoch_loss / i, Val_F1_0=f1_scores[0], Val_F1_1=f1_scores[1], Val_Prec_0=precisions[0], Val_Prec_1=precisions[1], Val_Recall_0=recalls[0], Val_Recall_1=recalls[1])
                    else:
                        batch_iter.set_postfix(mean_epoch_loss=epoch_loss / i)




            avg_epoch_loss = epoch_loss / epoch_samples
            scheduler.step(avg_epoch_loss)
            epoch_average_losses.append(avg_epoch_loss)

    return epoch_average_losses

In [116]:
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
EPOCHS = 10

# Pre Training Stats

In [124]:
torch.manual_seed(0)
model = BagOfWords(len(vocab), 300, 0.1).to(device)
eval_on_test_set(model)
print('\n')
eval_summary(0)

Run on Test Data:
    Down: Recall: 0.6730769276618958	Precision: 0.7291666865348816	F1: 0.699999988079071
    Up: Recall: 0.7400000095367432	Precision: 0.6851851940155029	F1: 0.7115384340286255
    AVERAGE: Recall: 0.7065384685993195	Precision: 0.7071759402751923	F1: 0.7057692110538483


Epoch 0 Validation:
Down: Recall: 0.36231884360313416	Precision: 0.5952380895614624	F1: 0.45045045018196106
Up: Recall: 0.37037035822868347	Precision: 0.18518517911434174	F1: 0.2469135820865631
AVERAGE: Recall: 0.3663446009159088	Precision: 0.39021163433790207	F1: 0.3486820161342621


# Train Model

In [125]:
loss_func = nn.BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, patience=3, threshold=0.01)
history = train_model(model, train_iter, EPOCHS, optimizer, scheduler, loss_func)

training:   0%|          | 0/10 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 5:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 6:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 7:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 8:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 9:   0%|          | 0/198 [00:00<?, ?batch/s]

epoch 10:   0%|          | 0/198 [00:00<?, ?batch/s]

In [126]:
import os
if not os.path.isdir('../Saved_Models'):
    os.mkdir('../Saved_Models')
torch.save(model, '../Saved_Models/bag_of_words.pt')
bag = torch.load('../Saved_Models/bag_of_words.pt')

# Post Training Stats

In [127]:
eval_summary(EPOCHS)
print('\n')
eval_on_test_set(bag)

Epoch 10 Validation:
Down: Recall: 0.36231884360313416	Precision: 0.5952380895614624	F1: 0.45045045018196106
Up: Recall: 0.37037035822868347	Precision: 0.18518517911434174	F1: 0.2469135820865631
AVERAGE: Recall: 0.3663446009159088	Precision: 0.39021163433790207	F1: 0.3486820161342621


Run on Test Data:
    Down: Recall: 0.6346153616905212	Precision: 0.6111111044883728	F1: 0.6226415038108826
    Up: Recall: 0.5799999833106995	Precision: 0.6041666865348816	F1: 0.5918367505073547
    AVERAGE: Recall: 0.6073076725006104	Precision: 0.6076388955116272	F1: 0.6072391271591187


# Write results to CSV

In [128]:
def run_on_data(model, data_iter):
    model.eval()
    predictions, all_labels= [], []
    for (data, labels) in data_iter:
        out = model(data)
        predictions.append(out.unsqueeze(0))
        all_labels.append(labels.unsqueeze(0))
    pred = torch.cat(predictions, axis=1).squeeze()
    pred -= pred.mean()
    pred += 0.5
    true = torch.cat(all_labels, axis=1).squeeze()
    return pred, true


In [129]:
train_pred, train_labels = run_on_data(model, train_iter)
valid_pred, valid_labels = run_on_data(model, validation_iter)
test_pred, test_labels = run_on_data(model, test_iter)

In [130]:
import pandas as pd
train_results = pd.DataFrame({"Labels": train_labels.to('cpu'), "Predictions": train_pred.to('cpu').detach().numpy()})
#, "Date": #train_data['Date']})
valid_results = pd.DataFrame({"Labels": valid_labels.to('cpu'), "Predictions": valid_pred.to('cpu').detach().numpy()})
#, "Date": validation_data['Date']})
test_results = pd.DataFrame({"Labels": test_labels.to('cpu'), "Predictions": test_pred.to('cpu').detach().numpy()})
#, "Date": test_data['Date']})

train_results.to_csv('../Results/train/BOW.csv', index=False)
valid_results.to_csv('../Results/validation/BOW.csv', index=False)
test_results.to_csv('../Results/test/BOW.csv', index=False)