In [1]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word])
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids

In [2]:
corpus = Corpus(path='data')

In [3]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len]#.view(-1)
    return data, target


In [4]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size, n_layers, dropout):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, input_size)
        self.drop = nn.Dropout(dropout)
    
    def forward(self, inputs, hidden):

        #inputs = [batch size]

        encoded = self.drop(self.embedding(inputs))

        #encoded = [batch size, emb dim]

        encoded = encoded.unsqueeze(0)

        #encoded = [1, batch size, emb dim]

        output, hidden = self.gru(encoded, hidden)

        #output = [1, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]

        output = self.drop(output)

        output = self.fc(output)

        #output = [1, batch size, input size]

        output = output.view(-1, self.input_size)

        #output = [1*batch size, input size]

        return output, hidden
        
    def init_hidden(self, batch_size):

        return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))

In [5]:
n_characters  = len(corpus.dictionary)
emb_size = 200
hidden_size = 250
n_layers = 2
lr = 0.001
dropout = 0.5

model = RNN(n_characters, emb_size, hidden_size, n_layers, dropout)

In [6]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 21,627,468 trainable parameters


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [11]:
import torch.optim as optim


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


In [12]:
model=model.to(device)
criterion=criterion.to(device)

In [13]:
batch_size = 128
bptt = 10

train_loader = batchify(corpus.train, batch_size)

In [14]:
def train(model, iterator, criterion):
    clip = 0.25

    model.train()
    
    hidden = model.init_hidden(batch_size)
    hidden = hidden.to(device)
    
    # all_loss = []

    for batch, i in enumerate(range(0, iterator.size(0) - 1, bptt)):
        loss = 0
        data, targets = get_batch(iterator, i)

        seq_len_batched = data.shape[0]

        model.zero_grad()

        hidden = hidden.detach()
        

        for c in range(seq_len_batched):
            output, hidden = model(data[c], hidden)
            loss += criterion(output, targets[c])
      
        loss.backward()

        optimizer.step()
        
        total_loss = loss.item()/seq_len_batched
        
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss #/ log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(iterator) // bptt, lr,
                cur_loss, math.exp(cur_loss)))
            # total_loss = 0
            # print(generate(model, 'Wh', 100), '\n')

        # if batch % plot_every == 0:
        #     all_loss.append(total_loss)
        # return all_loss


In [15]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [14]:
N_EPOCHS = 50

best_valid_loss = float('inf')
counter = 0
patience = 2
log_interval = 100
plot_every = 10
all_loses = []
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train(model, train_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')


| epoch   0 |   100/  558 batches | lr 0.00 | loss  6.73 | ppl   836.72
| epoch   0 |   200/  558 batches | lr 0.00 | loss  6.69 | ppl   803.45
| epoch   0 |   300/  558 batches | lr 0.00 | loss  6.60 | ppl   735.15
| epoch   0 |   400/  558 batches | lr 0.00 | loss  6.61 | ppl   744.06
| epoch   0 |   500/  558 batches | lr 0.00 | loss  6.29 | ppl   541.52
Epoch: 01 | Epoch Time: 1m 22s
| epoch   1 |   100/  558 batches | lr 0.00 | loss  6.12 | ppl   453.20
| epoch   1 |   200/  558 batches | lr 0.00 | loss  6.17 | ppl   476.75
| epoch   1 |   300/  558 batches | lr 0.00 | loss  6.14 | ppl   465.73
| epoch   1 |   400/  558 batches | lr 0.00 | loss  6.14 | ppl   462.57
| epoch   1 |   500/  558 batches | lr 0.00 | loss  5.85 | ppl   346.31
Epoch: 02 | Epoch Time: 1m 22s
| epoch   2 |   100/  558 batches | lr 0.00 | loss  5.61 | ppl   272.84
| epoch   2 |   200/  558 batches | lr 0.00 | loss  5.77 | ppl   319.38
| epoch   2 |   300/  558 batches | lr 0.00 | loss  5.67 | ppl   290.70
| 