In [1]:
import spacy
import numpy as np
from spacy.symbols import ORTH
from torchtext import data, datasets
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
print("Use CUDA:", USE_CUDA)

Use CUDA: True


# Hyperparameters

In [3]:
nhid = 200
embed_dim = 300
lr = 10
NUM_EPOCHS = 20
bptt_len = 60
batch_size = 32
save_path = 'model.pt'

# Read Data

In [4]:
train_file = 'train.txt'
dev_file = 'dev.txt'

In [5]:
lm_tok = spacy.load('en')
def spacy_tok(x):
    return [tok.text for tok in lm_tok.tokenizer(x)]

TEXT = data.ReversibleField(sequential=True, tokenize=spacy_tok,
                            lower=True, include_lengths=False)

In [6]:
train_dataset = datasets.LanguageModelingDataset(train_file, TEXT, newline_eos=True)
dev_dataset = datasets.LanguageModelingDataset(dev_file, TEXT, newline_eos=True)

In [7]:
vectors = "glove.840B.300d"
TEXT.build_vocab(train_dataset, dev_dataset, vectors=vectors)

In [8]:
# iterators
train_iter = data.BPTTIterator(train_dataset, batch_size=batch_size, bptt_len=bptt_len, repeat=False, shuffle=True)
dev_iter = data.BPTTIterator(dev_dataset, batch_size=batch_size, bptt_len=bptt_len, repeat=False)

In [9]:
len(TEXT.vocab)

26246

In [10]:
# Create embeddings
embedding = nn.Embedding(len(TEXT.vocab), embed_dim)
embedding.weight.data.copy_(TEXT.vocab.vectors)
embedding.weight.requires_grad = False
embedding = embedding.to(device)

# Define Model

In [11]:
class LM(nn.Module):
    def __init__(self, ntoken, ninp, nhid, embedding, dropout=0.5):
        super(LM, self).__init__()
        self.nhid = nhid
        self.encoder = embedding
        self.rnn = nn.LSTM(ninp, nhid, batch_first=True)
        self.decoder = nn.Linear(nhid, ntoken)
        self.embed_drop = nn.Dropout(dropout)
        self.output_drop = nn.Dropout(dropout)
#         self.embed_drop = LockedDropout(dropout)
#         self.output_drop = LockedDropout(dropout)

#         # # tie weights
#         self.decoder.weight = self.encoder.weight

    def forward(self, inputs, hidden=None):
        """

        :param inputs: (batch_size, max_len)
        :param hidden: ((1, batch_size, nhid), (1, batch_size, nhid))
        :return:
        """
        emb = self.embed_drop(self.encoder(inputs))
        if hidden:
            outputs, hidden = self.rnn(emb, hidden)
        else:
            outputs, hidden = self.rnn(emb)

        outputs = self.output_drop(outputs)
        decoded = self.decoder(outputs)
        return decoded, outputs, hidden

In [12]:
lm = LM(len(TEXT.vocab), embed_dim, nhid, embedding).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(lm.parameters(), lr=lr)

# Training and Evaluation

In [13]:
def train_epoch():
    losses = []
    for batch in train_iter:
        x, y = batch.text.transpose(0, 1).contiguous().to(device), \
                   batch.target.transpose(0, 1).contiguous().to(device)
        
        out, _, _ = lm(x)
        
        out = out.contiguous().view(-1, len(TEXT.vocab))
        y = y.view(-1)

        loss = criterion(out, y).to(device)
        losses.append(loss.item())

        # update model
        optimizer.zero_grad()
        loss.backward()
        # _ = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_clipping)
        optimizer.step()

    return np.exp(np.mean(losses))

In [14]:
def eval_epoch():
    losses = []
    for batch in dev_iter:
        x, y = batch.text.transpose(0, 1).contiguous().to(device), \
                   batch.target.transpose(0, 1).contiguous().to(device)
        
        with torch.no_grad():
            out, _, _ = lm(x)
        
        out = out.contiguous().view(-1, len(TEXT.vocab))
        y = y.view(-1)

        loss = criterion(out, y).to(device)
        losses.append(loss.item())
            
    return np.exp(np.mean(losses))

In [15]:
def train_model():
    train_losses = []
    dev_losses = []
    for epoch in range(NUM_EPOCHS):
        loss_train = train_epoch()
        loss_dev = eval_epoch()

        print('train perplexity: %.4f, dev perplexity: %.4f' % (loss_train, loss_dev))

        train_losses.append(loss_train)
        dev_losses.append(loss_dev)
        
        if loss_dev == min(dev_losses):
            torch.save(lm, save_path)

In [16]:
# train_model()

# Predict

In [44]:
lm = torch.load(save_path)
criterion = nn.CrossEntropyLoss(reduction='none').to(device)

In [57]:
def eval_epoch():
    pairs = []

    for i, batch in enumerate(dev_iter):
        x, y = batch.text.transpose(0, 1).contiguous().to(device), \
                   batch.target.transpose(0, 1).contiguous().to(device)
        
        with torch.no_grad():
            out, _, _ = lm(x)
        
        out = out.contiguous().view(-1, len(TEXT.vocab))
        y = y.view(-1)

        loss = criterion(out, y).to(device)
        loss = loss.view(batch_size, -1)
        loss = torch.mean(loss, dim=1)

        for ele in zip(x.cpu().numpy(), loss.cpu().numpy()):
            pairs.append(ele)

    pairs.sort(key=lambda x: x[1])
    return pairs

In [58]:
pairs = eval_epoch()

In [68]:
def indices_to_sent(indices):
    sent = [TEXT.vocab.itos[ele] for ele in indices]
    return ' '.join(sent)

In [71]:
# good sentences
for p in pairs[:10]:
    sent, score = p
    print('~' * 100)
    print(score)
    print(indices_to_sent(sent))

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2.4478726
far away <eos> do you see signs for the train <eos> they 're a little further away so i ca n't see them <eos> are there trees in the photo <eos> can you tell if this is a male or female <eos> does the dog have long hair or short <eos> what type of vehicle a car truck <eos> no
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2.472043
are they in a box <eos> there is n’t any grass <eos> do they have leaves in their mouth <eos> is there anything else in the scene besides the elephants <eos> can you tell if this is taken in winter time <eos> are there any beverages in scene <eos> i 'm not sure , i do n't see water <eos>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2.4836626
i think it 's black , but the photo is in black
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [72]:
# bad sentences
for p in pairs[-10:]:
    sent, score = p
    print('~' * 100)
    print(score)
    print(indices_to_sent(sent))

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5.798269
their jacket <eos> outdoor platform <eos> squash greens <eos> a laptop 3 books an older ipod a bunch of cords a bunch of pens a notebook pill bottles more cords converters , netflix envelopes <eos> yes they look like they are <eos> is suitcase sitting on grass or concrete <eos> is the room well organized <eos> shirt and old pants
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5.824169
the cats have their eyes open <eos> is she in the dirt <eos> has any pieces been taken out <eos> ' before you leave , have you logged off , pushed your chair under desk , tie dyed up , shut down if it is end of day , left room ready for next class <eos> it looks delicious <eos>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5.8883324
letters and numbers <eos> people are selling different thi