In [1]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, RandomSampler
import tqdm
from BOW_data_format import prepare_data

In [2]:
prepared = prepare_data()
title_data = prepared['title_data']
train_data = prepared['train']
validation_data = prepared['validation']
test_data = prepared['test']
vocab = prepared['vocab']
PAD_ID = prepared['PAD_ID']

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
titles = nn.utils.rnn.pad_sequence([torch.LongTensor(title_data[i]) if i in title_data else torch.LongTensor([PAD_ID]) for i in range(max(title_data) + 1)], batch_first=True).contiguous()

## Create Data Loaders

In [4]:
def bag_of_words_collate(batch):
    batch_arr = np.array(batch).T
    data = titles[batch_arr[0]]
    labels = torch.FloatTensor(batch_arr[1]).to(device)
    return data.to(device), labels.squeeze()

In [5]:
BATCH_SIZE = 4
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=RandomSampler(train_data), collate_fn=bag_of_words_collate)
validation_iter = DataLoader(validation_data, sampler=RandomSampler(validation_data), batch_size=BATCH_SIZE, collate_fn=bag_of_words_collate, drop_last=False)
test_iter = DataLoader(test_data, sampler=RandomSampler(test_data), batch_size=BATCH_SIZE, collate_fn=bag_of_words_collate, drop_last=False)

## Make the Model

In [33]:
class BagOfWords(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dropout=0.4):
        super(BagOfWords, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(embedding_dim, 1)
        self.activation = nn.Sigmoid()
    def forward(self, x):
        out = self.logit(x)
        out = self.activation(out)
        return out
    def logit(self, x):
        out = self.embedding(x)
        out = self.dropout(out.mean(1))
        out = self.linear(out)
        return out


## Evaluation

In [34]:
def eval_model(model, data_iter):
    model.eval()
    predictions, all_labels= [], []
    for (data, labels) in data_iter:
        out = model(data)
        predictions.append(out.unsqueeze(0))
        all_labels.append(labels.unsqueeze(0))
    pred = torch.cat(predictions, axis=1).squeeze()
    pred = (pred - pred.mean() + 0.5).round().to('cpu')

    true = torch.cat(all_labels, axis=1).squeeze().to('cpu')
    recalls = []
    precisions = []
    f1_scores = []
    for selected_class in range(2):
        tp = ((pred == selected_class) & (true == selected_class)).sum()
        fp = ((pred == selected_class) & (true != selected_class)).sum()
        fn = ((pred != selected_class) & (true == selected_class)).sum()
        recall = tp / (tp + fn) if tp + fn != 0 else 0
        precision = tp / (tp + fp) if tp + fp != 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall != 0 else 0
        recalls.append(recall)
        precisions.append(precision)
        f1_scores.append(f1)

    return pred, true, recalls, precisions, f1_scores

def eval_on_test_set(model):
    _, _, recalls, precisions, f1_scores = eval_model(model, test_iter)
    print(f"""Run on Test Data:
    Down: Recall: {recalls[0]}\tPrecision: {precisions[0]}\tF1: {f1_scores[0]}
    Up: Recall: {recalls[1]}\tPrecision: {precisions[1]}\tF1: {f1_scores[1]}
    AVERAGE: Recall: {sum(recalls) / 2}\tPrecision: {sum(precisions) / 2}\tF1: {sum(f1_scores) / 2}""")

def eval_summary(epoch):
        _, _, recalls, precisions, f1_scores = eval_model(model, validation_iter)
        print(f"""Epoch {epoch} Validation:
Down: Recall: {recalls[0]}\tPrecision: {precisions[0]}\tF1: {f1_scores[0]}
Up: Recall: {recalls[1]}\tPrecision: {precisions[1]}\tF1: {f1_scores[1]}
AVERAGE: Recall: {sum(recalls) / 2}\tPrecision: {sum(precisions) / 2}\tF1: {sum(f1_scores) / 2}""")

## Training Loop

In [35]:
def train_model(model, data_iter, epochs, optimizer, scheduler, loss_func):
    epoch_average_losses = []
    with tqdm.notebook.trange(epochs, desc='training', unit='epoch') as epoch_iter:
        for epoch in epoch_iter:
            model.train()
            epoch_loss = 0
            epoch_samples = 0
            with tqdm.notebook.tqdm(data_iter, desc=f"epoch {epoch + 1}", unit='batch', total=len(data_iter)) as batch_iter:
                for i, (data, labels) in enumerate(batch_iter, start=1):
                    optimizer.zero_grad()
                    output = model.logit(data)
                    loss = loss_func(output.squeeze(), labels.squeeze())

                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.item()
                    epoch_samples += data.shape[0]
                    if i == len(batch_iter):
                        _, _, recalls, precisions, f1_scores = eval_model(model, validation_iter)
                        batch_iter.set_postfix(mean_epoch_loss=epoch_loss / i, Val_F1_0=f1_scores[0].item(), Val_F1_1=f1_scores[1].item(), Val_Prec_0=precisions[0].item(), Val_Prec_1=precisions[1].item(), Val_Recall_0=recalls[0].item(), Val_Recall_1=recalls[1].item())
                    else:
                        batch_iter.set_postfix(mean_epoch_loss=epoch_loss / i)




            avg_epoch_loss = epoch_loss / epoch_samples
            scheduler.step(avg_epoch_loss)
            epoch_average_losses.append(avg_epoch_loss)

    return epoch_average_losses

In [36]:
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
EPOCHS = 50

# Pre Training Stats

In [37]:
torch.manual_seed(0)
model = BagOfWords(len(vocab), 1200).to(device)
eval_on_test_set(model)
print('\n')
eval_summary(0)

Run on Test Data:
    Down: Recall: 0.5	Precision: 0.4333333373069763	F1: 0.4642857313156128
    Up: Recall: 0.3199999928474426	Precision: 0.380952388048172	F1: 0.3478260636329651
    AVERAGE: Recall: 0.4099999964237213	Precision: 0.40714287757873535	F1: 0.40605589747428894


Epoch 0 Validation:
Down: Recall: 0.4637681245803833	Precision: 0.761904776096344	F1: 0.5765765905380249
Up: Recall: 0.6296296119689941	Precision: 0.31481480598449707	F1: 0.4197530746459961
AVERAGE: Recall: 0.5466988682746887	Precision: 0.5383597612380981	F1: 0.4981648325920105


# Train Model

In [38]:
loss_func = nn.BCEWithLogitsLoss()
optimizer = SGD(model.parameters(), lr=0.01)
scheduler = ReduceLROnPlateau(optimizer, patience=3, threshold=0.01)
history = train_model(model, train_iter, EPOCHS, optimizer, scheduler, loss_func)

training:   0%|          | 0/50 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 5:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 6:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 7:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 8:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 9:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 10:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 11:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 12:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 13:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 14:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 15:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 16:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 17:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 18:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 19:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 20:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 21:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 22:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 23:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 24:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 25:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 26:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 27:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 28:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 29:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 30:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 31:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 32:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 33:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 34:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 35:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 36:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 37:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 38:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 39:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 40:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 41:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 42:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 43:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 44:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 45:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 46:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 47:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 48:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 49:   0%|          | 0/99 [00:00<?, ?batch/s]

epoch 50:   0%|          | 0/99 [00:00<?, ?batch/s]

In [39]:
import os
if not os.path.isdir('../Saved_Models'):
    os.mkdir('../Saved_Models')
torch.save(model, '../Saved_Models/bag_of_words.pt')
bag = torch.load('../Saved_Models/bag_of_words.pt')

# Post Training Stats

In [40]:
eval_summary(EPOCHS)
print('\n')
eval_on_test_set(bag)

Epoch 50 Validation:
Down: Recall: 0.5507246255874634	Precision: 0.7916666865348816	F1: 0.6495726704597473
Up: Recall: 0.6296296119689941	Precision: 0.3541666567325592	F1: 0.4533333480358124
AVERAGE: Recall: 0.5901771187782288	Precision: 0.5729166865348816	F1: 0.5514529943466187


Run on Test Data:
    Down: Recall: 0.5384615659713745	Precision: 0.5833333134651184	F1: 0.559999942779541
    Up: Recall: 0.6000000238418579	Precision: 0.5555555820465088	F1: 0.576923131942749
    AVERAGE: Recall: 0.5692307949066162	Precision: 0.5694444179534912	F1: 0.568461537361145


# Write results to CSV

In [41]:
def run_on_data(model, data_iter):
    model.eval()
    predictions, all_labels= [], []
    for (data, labels) in data_iter:
        out = model(data)
        predictions.append(out.unsqueeze(0))
        all_labels.append(labels.unsqueeze(0))
    pred = torch.cat(predictions, axis=1).squeeze()
    pred -= pred.mean()
    pred += 0.5
    true = torch.cat(all_labels, axis=1).squeeze()
    return pred, true


In [42]:
train_pred, train_labels = run_on_data(model, train_iter)
valid_pred, valid_labels = run_on_data(model, validation_iter)
test_pred, test_labels = run_on_data(model, test_iter)

In [43]:
import pandas as pd
train_results = pd.DataFrame({"Labels": train_labels.to('cpu'), "Predictions": train_pred.to('cpu').detach().numpy()})
#, "Date": #train_data['Date']})
valid_results = pd.DataFrame({"Labels": valid_labels.to('cpu'), "Predictions": valid_pred.to('cpu').detach().numpy()})
#, "Date": validation_data['Date']})
test_results = pd.DataFrame({"Labels": test_labels.to('cpu'), "Predictions": test_pred.to('cpu').detach().numpy()})
#, "Date": test_data['Date']})

train_results.to_csv('../Results/train/BOW.csv', index=False)
valid_results.to_csv('../Results/validation/BOW.csv', index=False)
test_results.to_csv('../Results/test/BOW.csv', index=False)