In [1]:
!pip install pytorch_pretrained_bert pytorch-nlp



In [2]:
# Downlod the dataset

! rm -rf data
! rm *.csv*
! wget https://storage.googleapis.com/akhilez/datasets/singularity_systems/test_bert.csv
! wget https://storage.googleapis.com/akhilez/datasets/singularity_systems/train_bert.csv
! mkdir -p data
! mv *.csv data/


rm: cannot remove '*.csv*': No such file or directory
--2020-08-01 03:28:34--  https://storage.googleapis.com/akhilez/datasets/singularity_systems/test_bert.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.216.128, 172.217.204.128, 172.217.203.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.216.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16310138 (16M) [text/csv]
Saving to: ‘test_bert.csv’


2020-08-01 03:28:34 (135 MB/s) - ‘test_bert.csv’ saved [16310138/16310138]

--2020-08-01 03:28:35--  https://storage.googleapis.com/akhilez/datasets/singularity_systems/train_bert.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.13.128, 172.217.204.128, 172.217.203.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.13.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23291221 (22M) [text/csv]
Saving to: ‘train_bert.csv’


2020-08-01

In [3]:
import torch
from pytorch_pretrained_bert import BertModel
import csv
import numpy as np
from torch import nn
import math
import matplotlib.pyplot as plt

In [4]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
class BatchGenerator:

    def __init__(self, data_path, batch_size):
        self.batch_size = batch_size
        self.data_path = data_path
        self.n_labels = 20

    def get_batch_gen(self, repeat=True):
        while True:
            with open(self.data_path) as data_file:
                reader = csv.reader(data_file)
                batch = []
                for row in reader:  # TODO: Can the next batch be fetched asynchronously? With asyncio?
                    batch.append(row)
                    if len(batch) >= self.batch_size:
                        batch = self._split_batch(batch)
                        yield batch
                        batch = []
                if 0 < len(batch) < self.batch_size:
                    yield self._split_batch(batch)
            if not repeat:
                break

    def _split_batch(self, batch):
        batch = np.array(batch, dtype=int)
        x = batch[:, :len(batch[0]) - self.n_labels]
        y = batch[:, len(batch[0]) - self.n_labels:]
        return x, y


In [6]:
class BertEmailClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertEmailClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.linear1 = nn.Linear(768, 500)
        self.linear2 = nn.Linear(500, 20)

    def forward(self, x, masks=None):
        _, x = self.bert(x, attention_mask=masks, output_all_encoded_layers=False)
        x = self.dropout1(x)
        x = self.dropout2(nn.functional.relu(self.linear1(x)))
        x = nn.functional.softmax(self.linear2(x), dim=1)
        return x

bert_clf = BertEmailClassifier().to(device)
optim = torch.optim.Adam(list(bert_clf.linear1.parameters()) + list(bert_clf.linear2.parameters()))


In [7]:
train_batch_size = 10
test_batch_size = 5
train_size = 11083
test_size = 7761
train_steps = train_size / train_batch_size
test_steps = test_size / test_batch_size

train_gen = BatchGenerator(data_path='data/train_bert.csv', batch_size=train_batch_size).get_batch_gen()
test_gen = BatchGenerator(data_path='data/test_bert.csv', batch_size=test_batch_size).get_batch_gen()

In [8]:
train_steps

1108.3

In [9]:
print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
torch.cuda.empty_cache()
print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')

441.201664M
441.201664M


In [10]:
class Metrics:
    def __init__(self):
        self.losses = []
        self.accuracies = []

        self._epoch_loss = 0
        self._epoch_accuracy = 0

        self.n_batches = 0
        self.n_epochs = 0

    def record_batch(self, loss, accuracy):
        self.n_batches += 1
        self._epoch_loss += loss
        self._epoch_accuracy += accuracy

    def record_epoch(self):
        self.losses.append(self._epoch_loss/self.n_batches)
        self.accuracies.append(self._epoch_accuracy/self.n_batches)

        self.n_epochs += 1
        self._epoch_loss = 0
        self._epoch_accuracy = 0
        self.n_batches = 0

    @staticmethod
    def find_accuracy(y_hat, y_real):
        max_args_equals = torch.argmax(y_hat, dim=1) == torch.argmax(y_real, dim=1)
        return sum([1 if value else 0 for value in max_args_equals]) / len(y_hat)


In [11]:
def record_test_metrics(metrics):
    with torch.no_grad():
        for batch_i in range(math.ceil(test_steps)):

            x_batch, y_batch = next(test_gen)
            x_batch = torch.tensor(x_batch).to(device)
            y_batch = torch.tensor(y_batch, dtype=torch.float32).to(device)
            x_batch_masks = torch.tensor([[float(i > 0) for i in ii] for ii in x_batch]).to(device)

            y_hat = bert_clf(x_batch, x_batch_masks)

            loss = nn.functional.binary_cross_entropy(y_hat, y_batch)

            metrics.record_batch(loss.item(), Metrics.find_accuracy(y_hat, y_batch))

            if batch_i % 200 == 0:
                print(loss.item())

        metrics.record_epoch()

In [12]:
def train(epochs, train_metrics, test_metrics):

    for epoch in range(epochs):

        for batch_i in range(math.ceil(train_steps)):

            x_batch, y_batch = next(train_gen)
            y_batch = torch.tensor(y_batch, dtype=torch.float32).to(device)
            x_batch_masks = torch.tensor([[float(i > 0) for i in ii] for ii in x_batch]).to(device)
            x_batch = torch.tensor(x_batch).to(device)

            optim.zero_grad()

            y_hat = bert_clf(x_batch, x_batch_masks)

            loss = nn.functional.binary_cross_entropy(y_hat, y_batch)

            loss.backward()
            optim.step()

            train_metrics.record_batch(loss.item(), Metrics.find_accuracy(y_hat, y_batch))

            if batch_i % 200 == 0:
                print(loss.item())

        train_metrics.record_epoch()
        record_test_metrics(test_metrics)

        print(f'Epoch: {epoch}, train_loss={train_metrics.losses[-1]}, train_accuracy={train_metrics.accuracies[-1]}, val_loss={test_metrics.losses[-1]}, val_accuracy={test_metrics.accuracies[-1]}')

train_metrics = Metrics()
test_metrics = Metrics()

train(5, train_metrics, test_metrics)

0.202034130692482
0.19044330716133118
0.19049082696437836
0.16164638102054596
0.17933432757854462
0.16792289912700653
0.2199738323688507
0.18984578549861908
0.15281912684440613
0.18067912757396698
0.1672685593366623
0.16764213144779205
0.195363387465477
0.20293845236301422
Epoch: 0, train_loss=0.18593045756073015, train_accuracy=0.12669071235347126, val_loss=0.1828169930349683, val_accuracy=0.1448808757244033
0.18299132585525513
0.1499571055173874
0.15501031279563904
0.14058807492256165
0.14288629591464996
0.14687983691692352
0.1924676150083542
0.1758795529603958
0.11508718132972717
0.150018572807312
0.14842583239078522
0.17336471378803253
0.15991012752056122
0.16224698722362518


KeyboardInterrupt: ignored

In [None]:
plt.plot(train_metrics.accuracies)
plt.plot(test_metrics.accuracies)
plt.xlabel("Epochs")
plt.ylabel('accuracy')
plt.legend(['accuracy', 'val_accuracy'])
plt.show()

In [None]:
plt.plot(train_metrics.losses)
plt.plot(test_metrics.losses)
plt.xlabel("Epochs")
plt.ylabel('loss')
plt.legend(['loss', 'val_loss'])
plt.show()

In [None]:
torch.save(bert_clf, 'bert_clf2.pt')

In [None]:
from google.colab import files
files.download('bert_clf2.pt')