In [1]:
import torchtext
import torch
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
from torchtext.vocab import Vectors
from tqdm import tqdm_notebook

In [3]:
# Field definitions: `text` also yields per-example lengths (include_lengths=True);
# `label` is a non-sequential (single value per example) field.
text = torchtext.data.Field(include_lengths=True)
label = torchtext.data.Field(sequential=False)

# SST train/val/test splits, dropping every example whose label is 'neutral'.
train, val, test = torchtext.datasets.SST.splits(
    text,
    label,
    filter_pred=lambda ex: ex.label != 'neutral',
)

# Both vocabularies are built from the training split only.
text.build_vocab(train)
label.build_vocab(train)

# Batches of 10 examples, single pass per iteration (repeat=False).
# NOTE(review): device=-1 presumably selects the CPU in this torchtext version -- confirm.
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test),
    batch_size=10,
    device=-1,
    repeat=False,
)

# Attach pretrained word vectors (downloaded from `url` if not cached) to the text vocabulary.
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
text.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

In [18]:
# Grab the first batch produced by the training iterator for manual inspection.
batch = next(iter(train_iter))

In [21]:
# Display the batch's text field. Because the field was created with
# include_lengths=True this is a tuple; its first element is the token-id
# matrix shown below (one column per example, batch_size=10).
batch.text

(Variable containing:
    132   3096     29   3112     14     14     65   3947   5485   8702
   1065     10     10   3073   2210     16     13    642   3114   8719
     57  15812   4634     10   7406      9     10      9      5     10
     48    348     12   3943     28  11715    125      7     46   1824
   4619    173      7      5   2568     31      8     18      6   2996
    245      9   7138   2963   1650     23    246     11     45      6
      9    145    142      3      5  12660     12     90    203    436
   1971     19     11     40   1531   1343   1332     13     32   2442
    977      7  13958    126   1680    136    288    667     15    843
     15   2613   6581    117      3      6    119     20    119      7
      7  11028    973    292   2680  10131     79      4     15   4167
  10680   2156   6581      3    358   2009     10    597     45   2251
     20      6      6   4574    148      3    266      8   5917      8
      4   2582   5094      8   4633   4475      8   334

In [15]:
# Display the token -> integer index mapping of the text vocabulary
# (built from the training split; '<unk>' is 0 and '<pad>' is 1 in the output below).
text.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'<unk>': 0,
             '<pad>': 1,
             '.': 2,
             ',': 3,
             'the': 4,
             'and': 5,
             'of': 6,
             'a': 7,
             'to': 8,
             'is': 9,
             "'s": 10,
             'that': 11,
             'in': 12,
             'it': 13,
             'The': 14,
             'as': 15,
             'film': 16,
             'with': 17,
             'movie': 18,
             'but': 19,
             'for': 20,
             'A': 21,
             'its': 22,
             'an': 23,
             'this': 24,
             'you': 25,
             "n't": 26,
             'be': 27,
             '...': 28,
             'It': 29,
             'on': 30,
             'by': 31,
             '--': 32,
             'has': 33,
             'are': 34,
             'about': 35,
             'more': 36,
             'not': 37,
             'than': 38,
             'at': 39,

In [None]:
class CBoW(nn.Module):
    """Continuous bag-of-words sentence classifier.

    Each sentence is represented by the mean of its token embeddings
    concatenated with its length; a single linear layer followed by
    log-softmax maps that representation to class log-probabilities.

    Labels are expected as vocabulary indices starting at 1 (index 0 being
    reserved, e.g. for '<unk>'): training subtracts 1 before computing the
    loss and `predict` adds 1 back.
    """

    def __init__(self, input_size, num_classes, batch_size, vectors=None):
        """Build the embedding table and the linear classifier.

        Args:
            input_size: embedding width; must equal the second dimension of
                the pretrained vectors.
            num_classes: number of output classes.
            batch_size: accepted for interface compatibility; not used.
            vectors: optional 2-D tensor (vocab_size x embed_dim) of
                pretrained embeddings. When omitted, the notebook-level
                `text.vocab.vectors` is used.

        Raises:
            ValueError: if `input_size` differs from the embedding width.
        """
        super(CBoW, self).__init__()
        if vectors is None:
            # Fall back to the vocabulary loaded earlier in the notebook.
            vectors = text.vocab.vectors
        vocab_size, embed_dim = vectors.size()
        if embed_dim != input_size:
            raise ValueError(
                'input_size ({}) must match the embedding width ({})'.format(input_size, embed_dim))
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.embeddings.weight.data.copy_(vectors)
        # +1 input feature for the sentence length appended in forward().
        self.linear = nn.Linear(input_size + 1, num_classes, bias=True)

    @staticmethod
    def _scalar(loss):
        """Return a 0-dim / 1-element loss as a Python number on old and new PyTorch."""
        return loss.item() if hasattr(loss, 'item') else loss.data[0]

    def forward(self, x):
        """Compute class log-probabilities.

        Args:
            x: `(tokens, lengths)` pair; `tokens` holds token ids with the
                sequence along dim 0 and the batch along dim 1, `lengths`
                holds one length per example.

        Returns:
            Tensor of shape (batch, num_classes) with log-softmax outputs.
        """
        tokens, lengths = x
        lengths = Variable(lengths.view(-1, 1).float())
        embedded = self.embeddings(tokens)
        # NOTE(review): the mean runs over every position along dim 0, so any
        # padding positions contribute to the average as well.
        average_embed = embedded.mean(0)
        concat = torch.cat([average_embed, lengths], dim=1)  # add lengths as a feature
        output = self.linear(concat)
        return torch.nn.functional.log_softmax(output, dim=1)

    def predict(self, x):
        """Return predicted label indices (argmax class + 1) for a `(tokens, lengths)` pair."""
        log_probs = self(x)
        return log_probs.max(1)[1] + 1

    def train(self, train_iter=True, val_iter=None, test_iter=None, num_epochs=1,
              learning_rate=1e-3, plot=True):
        """Fit the model with Adam and NLL loss, or switch the module mode.

        This method shares its name with `nn.Module.train(mode)`. To keep
        `model.eval()` and `model.train()` working, a boolean first argument
        (passed positionally or omitted) is forwarded to the base-class
        implementation and the module itself is returned.

        Args:
            train_iter: iterable of batches exposing `.text` and `.label`
                and supporting `len()`; or a bool (see above).
            val_iter: optional iterable used for per-epoch validation accuracy.
            test_iter: optional iterable passed to `test()` after every epoch.
            num_epochs: number of passes over `train_iter`.
            learning_rate: Adam learning rate.
            plot: when true, plot the per-epoch loss curve after training.

        Side effects:
            Prints per-epoch loss/accuracy, stores the loss history in
            `self.loss_vec`, and (through `test`) writes "predictions.txt".
        """
        if isinstance(train_iter, bool):
            return super(CBoW, self).train(train_iter)

        criterion = torch.nn.NLLLoss()
        # Optimize this instance's parameters rather than a notebook global.
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        loss_vec = []
        num_batches = max(len(train_iter), 1)  # avoid dividing by zero on an empty iterator

        for epoch in tqdm_notebook(range(1, num_epochs + 1)):
            epoch_loss = 0.0
            for batch in train_iter:
                optimizer.zero_grad()
                log_probs = self(batch.text)
                # Label indices start at 1; NLLLoss expects classes from 0.
                loss = criterion(log_probs, batch.label - 1)
                loss.backward()
                optimizer.step()
                epoch_loss += self._scalar(loss)

            loss_vec.append(epoch_loss / num_batches)
            acc = self.validate(val_iter) if val_iter is not None else None
            print('Epoch {} loss: {} | acc: {}'.format(epoch, loss_vec[-1], acc))
            if test_iter is not None:
                self.test(test_iter)

        if plot:
            plt.plot(range(len(loss_vec)), loss_vec)
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.show()
        print('\nModel trained.\n')
        self.loss_vec = loss_vec

    def test(self, test_iter):
        """Predict every batch of `test_iter`, print accuracy, and write predictions.

        Predictions (label indices, one per line) are written to
        "predictions.txt" in the current working directory.
        """
        upload, trues = [], []
        # Update: for kaggle the bucket iterator needs to have batch_size 10
        for batch in test_iter:
            preds = self.predict(batch.text)
            upload += preds.data.numpy().tolist()
            trues += batch.label.data.numpy().tolist()

        correct = sum(1 for pred, true in zip(upload, trues) if pred == true)
        accuracy = correct / len(trues) if trues else 0.0
        print('Test Accuracy:', accuracy)

        with open("predictions.txt", "w") as f:
            for u in upload:
                f.write(str(u) + "\n")

    def validate(self, val_iter):
        """Return the fraction of correctly predicted labels over `val_iter`.

        Returns 0.0 when `val_iter` yields no examples.
        """
        y_p, y_t = [], []
        for batch in val_iter:
            preds = self.predict(batch.text)
            y_p += preds.data.numpy().tolist()
            y_t += batch.label.data.numpy().tolist()
        correct = sum(1 for pred, true in zip(y_p, y_t) if pred == true)
        return correct / len(y_p) if y_p else 0.0

In [None]:
# Build the classifier. NOTE(review): input_size=300 is presumably the width of
# the pretrained wiki.simple vectors -- confirm against text.vocab.vectors.
model = CBoW(input_size = 300, num_classes = 2, batch_size = 10)
# Train for 15 epochs; only keyword arguments declared by CBoW.train are passed.
model.train(train_iter = train_iter, val_iter = val_iter, test_iter = test_iter, num_epochs = 15, learning_rate = 1e-4)
# Final evaluation on the test split (also writes predictions.txt).
model.test(test_iter)