In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import string
import numpy as np
import tqdm
torch.manual_seed(1);

In [2]:
with open('train.txt', 'r') as _file:
    train_text = _file.read()
train_words = []
for line in train_text.split('\n'):
    for sentence in line.split('.'):
        words = [x.lower().strip(string.punctuation + string.whitespace) for x in sentence.split() if x != '—']
        train_words.extend(words)

word_to_idx = {}
idx_to_word = {}
idx = 0
word_to_idx = {word: i for i, word in enumerate(sorted(set(train_words)))}
idx_to_word = {i: word for word, i in word_to_idx.items()}
    
data_x = []
data_y = []
n = len(train_words)
window_size = 3
for i in range(0, n - window_size * 2, 1):
    rb = i + window_size
    words_x = train_words[i:rb]
    sample_x = np.array([word_to_idx[word] for word in words_x])
    words_y = train_words[rb:rb + window_size]
    sample_y = np.array([word_to_idx[word] for word in words_y])
    data_x.append(sample_x)
    data_y.append(sample_y)
data_x = np.array(data_x)
data_y = np.array(data_y)

In [3]:
class NextWordsModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(NextWordsModel, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.RNN(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, vocab_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


In [10]:

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

model = NextWordsModel(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx))
model.cuda()
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = torch.tensor(data_x[0], dtype=torch.long).cuda()
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-6.9421, -6.6660, -6.8141,  ..., -7.0963, -7.0520, -6.3555],
        [-7.2837, -6.7366, -6.7299,  ..., -6.4170, -6.8338, -7.0087],
        [-7.2510, -7.3694, -7.3524,  ..., -6.7010, -6.9471, -6.2590]],
       device='cuda:0')


In [12]:
prev_loss_mean = 0
epsilon = 0.0001
for epoch in tqdm.tqdm(range(400)): # again, normally you would NOT do 300 epochs, it is toy data
    losses = []
    for sentence, next_words in zip(data_x, data_y):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = torch.tensor(sentence, dtype=torch.long).cuda()
        targets = torch.tensor(next_words, dtype=torch.long).cuda()

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        losses.append(loss.cpu().detach().numpy())
        loss.backward()
        optimizer.step()
    cur_loss_mean = np.mean(losses)
    print(cur_loss_mean)
    if abs(cur_loss_mean - prev_loss_mean) < epsilon:
        break
    else:
        prev_loss_mean = cur_loss_mean

  0%|          | 1/400 [00:01<12:15,  1.84s/it]

0.87117445


  0%|          | 2/400 [00:03<12:02,  1.82s/it]

0.833862


  1%|          | 3/400 [00:05<11:51,  1.79s/it]

0.8062028


  1%|          | 4/400 [00:07<11:44,  1.78s/it]

0.78552604


  1%|▏         | 5/400 [00:08<11:40,  1.77s/it]

0.7715385


  2%|▏         | 6/400 [00:10<11:35,  1.77s/it]

0.7602391


  2%|▏         | 7/400 [00:12<11:32,  1.76s/it]

0.7503569


  2%|▏         | 7/400 [00:13<12:54,  1.97s/it]


KeyboardInterrupt: 

In [6]:
i = 10
for i in range(0, 100, 10):
    # See what the scores are after training
    with torch.no_grad():
        inputs = torch.tensor(data_x[i], dtype=torch.long).cuda()
        tag_scores = model(inputs)

        # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
        # for word i. The predicted tag is the maximum scoring tag.
        # Here, we can see the predicted sequence below is 0 1 2 0 1
        # since 0 is index of the maximum value of row 1,
        # 1 is the index of maximum value of row 2, etc.
        # Which is DET NOUN VERB DET NOUN, the correct sequence!
        print([idx_to_word[idx] for idx in data_x[i]])
        print([idx_to_word[idx] for idx in tag_scores.argmax(axis=1).cpu().numpy()])
        print([idx_to_word[idx] for idx in data_y[i]])
        print('-')

['не', 'позволяй', 'другим']
['не', 'твой', 'день']
['испортить', 'твой', 'день']
-
['тебе', 'радость', 'люби']
['ты', 'иногда', 'лучшее']
['себя', 'иногда', 'лучшее']
-
['это', 'не', 'думать']
['вы', 'я', 'не']
['не', 'удивляться', 'не']
-
['дыши', 'и', 'верь']
['что', 'все', 'получится']
['что', 'все', 'получится']
-
['учись', 'а', 'приобретай']
['опыт', 'не', 'читай']
['опыт', 'не', 'читай']
-
['меняйся', 'а', 'трансформируйся']
['не', 'просто', 'связывай']
['не', 'просто', 'связывай']
-
['а', 'докажи', 'не']
['несчастьях', 'а', 'поощряй']
['критикуй', 'а', 'поощряй']
-
['дай', 'не', 'просто']
['увидь', 'а', 'почувствуй']
['увидь', 'а', 'почувствуй']
-
['делай', 'не', 'достаточно']
['последний', 'а', 'слушай']
['услышать', 'а', 'слушай']
-
['покажи', 'преданность', 'вера']
['и', 'позитивное', 'отношение']
['и', 'позитивное', 'отношение']
-


In [38]:
tag_scores.shape

torch.Size([3, 895])