# Shakespeare Language Model

In [1]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import time

import shakespeare_data as sh

# Device string used throughout the notebook: the GPU when CUDA is available,
# otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE  # bare expression so the notebook displays the selected device

'cuda'

## Fixed length input

In [2]:
# Data - refer to shakespeare_data.py for details
# Load the raw corpus text and show its beginning and end.
corpus = sh.read_corpus()
print("First 203 characters...Last 50 characters")
print("{}...{}".format(corpus[:203], corpus[-50:]))
print("Total character count: {}".format(len(corpus)))
# chars is indexed by character id (chars[c] gives the character);
# charmap is indexed by character (charmap['\n'] gives the id).
chars, charmap = sh.get_charmap(corpus)
charcount = len(chars)  # vocabulary size, later passed to the models
print("Unique character count: {}\n".format(len(chars)))
# Encode the whole corpus as a 1-D array of character ids.
shakespeare_array = sh.map_corpus(corpus, charmap)
print("shakespeare_array.shape: {}\n".format(shakespeare_array.shape))
# Sanity check: the first 17 ids, the characters they stand for, and the
# text obtained by decoding them again.
small_example = shakespeare_array[:17]
print("First 17 characters as indices", small_example)
print("First 17 characters as characters:", [chars[c] for c in small_example])
print("First 17 character indices as text:\n", sh.to_text(small_example,chars))

First 203 characters...Last 50 characters
1609
 THE SONNETS
 by William Shakespeare
                      1
   From fairest creatures we desire increase,
   That thereby beauty's rose might never die,
   But as the riper should by time decease,
...,
   And new pervert a reconciled maid.'
 THE END

Total character count: 5551930
Unique character count: 84

shakespeare_array.shape: (5551930,)

First 17 characters as indices [12 17 11 20  0  1 45 33 30  1 44 40 39 39 30 45 44]
First 17 characters as characters: ['1', '6', '0', '9', '\n', ' ', 'T', 'H', 'E', ' ', 'S', 'O', 'N', 'N', 'E', 'T', 'S']
First 17 character indices as text:
 1609
 THE SONNETS


In [3]:
# Dataset class. Transform raw text into a set of sequences of fixed length, and extracts inputs and targets
class TextDataset(Dataset):
    """Fixed-length chunks of an encoded text for next-character prediction.

    The encoded text is cut into consecutive, non-overlapping chunks of
    ``seq_len`` items. Trailing items that do not fill a whole chunk are
    dropped. Each item of the dataset is an ``(inputs, targets)`` pair in
    which the targets are the inputs shifted one position to the left, so
    both have length ``seq_len - 1``.
    """

    def __init__(self, text, seq_len=200):
        # Largest prefix whose length is a multiple of seq_len.
        usable = (len(text) // seq_len) * seq_len
        self.data = torch.tensor(text[:usable]).view(-1, seq_len)

    def __len__(self):
        # One item per chunk (row of self.data).
        return self.data.size(0)

    def __getitem__(self, i):
        chunk = self.data[i]
        # The target at position t is the character at position t + 1.
        return chunk[:-1], chunk[1:]

# Collate function. Transform a list of sequences into a batch. Passed as an argument to the DataLoader.
# Returns data of the format seq_len x batch_size
def collate(seq_list):
    """Assemble a list of (input, target) sequence pairs into one batch.

    Each sequence becomes one column of the result, so both returned tensors
    are laid out as seq_len x batch_size.
    """
    # Stacking along dim 1 puts sequence k of the list into column k.
    inputs = torch.stack([pair[0] for pair in seq_list], dim=1)
    targets = torch.stack([pair[1] for pair in seq_list], dim=1)
    return inputs, targets


In [4]:
# Model
class CharLanguageModel(nn.Module):
    """Character-level language model: embedding -> LSTM -> linear scoring.

    Args:
        vocab_size: number of distinct character ids (V).
        embed_size: dimension of the character embeddings (E).
        hidden_size: dimension of the LSTM hidden state (H).
        nlayers: number of stacked LSTM layers.
    """

    def __init__(self,vocab_size,embed_size,hidden_size, nlayers):
        super(CharLanguageModel,self).__init__()
        self.vocab_size=vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.nlayers=nlayers
        self.embedding = nn.Embedding(vocab_size,embed_size) # Embedding layer
        self.rnn = nn.LSTM(input_size = embed_size,hidden_size=hidden_size,num_layers=nlayers) # Recurrent network
        # You can also try GRUs instead of LSTMs.

        self.scoring = nn.Linear(hidden_size,vocab_size) # Projection layer

    def forward(self,seq_batch):
        """Return next-character logits for a batch of sequences.

        Args:
            seq_batch: tensor of character ids laid out as L x N
                (sequence length x batch size).

        Returns:
            Logits of shape L x N x V.
        """
        embed = self.embedding(seq_batch) # L x N x E
        # Omitting the initial state makes the LSTM start from zeros.
        output_lstm, _ = self.rnn(embed) # L x N x H
        # nn.Linear acts on the last dimension, so no flattening is needed.
        return self.scoring(output_lstm) # L x N x V

    def generate(self,seq, n_words):
        """Greedily generate characters that continue ``seq``.

        Args:
            seq: 1-D tensor of character ids (length L) used as the prompt.
            n_words: number of characters to generate. At least one
                character is always produced, even when ``n_words < 1``.

        Returns:
            1-D tensor holding the generated character ids.
        """
        generated_words = []
        # Inference only: without no_grad every decoding step would extend an
        # autograd graph that is never used.
        with torch.no_grad():
            embed = self.embedding(seq).unsqueeze(1) # L x 1 x E
            output_lstm, hidden = self.rnn(embed) # L x 1 x H
            scores = self.scoring(output_lstm[-1]) # 1 x V, from the last time step
            _,current_word = torch.max(scores,dim=1) # shape (1,)
            generated_words.append(current_word)
            for step in range(n_words-1):
                # Feed the previous prediction back in, carrying the state over.
                embed = self.embedding(current_word).unsqueeze(0) # 1 x 1 x E
                output_lstm, hidden = self.rnn(embed,hidden) # 1 x 1 x H
                scores = self.scoring(output_lstm[0]) # 1 x V
                _,current_word = torch.max(scores,dim=1) # shape (1,)
                generated_words.append(current_word)
        return torch.cat(generated_words,dim=0)
        
        

In [10]:
# Print the shapes of the first four training batches, together with the
# shape of the flattened targets.
for batch_idx, (inputs, targets) in enumerate(train_loader):
    print(inputs.size())
    print(targets.size())
    tg = targets.view(-1)
    print(tg.size())

    if batch_idx >= 3:
        break

torch.Size([199, 64])
torch.Size([199, 64])
torch.Size([12736])
torch.Size([199, 64])
torch.Size([199, 64])
torch.Size([12736])
torch.Size([199, 64])
torch.Size([199, 64])
torch.Size([12736])
torch.Size([199, 64])
torch.Size([199, 64])
torch.Size([12736])


In [7]:
def train_epoch(model, optimizer, train_loader, val_loader):
    """Train ``model`` for one epoch, then evaluate it on ``val_loader``.

    Args:
        model: module mapping an L x N batch of character ids to L x N x V
            logits.
        optimizer: optimizer updating the parameters of ``model``.
        train_loader: sized iterable of ``(inputs, targets)`` batches, both
            laid out as L x N.
        val_loader: iterable of ``(inputs, targets)`` validation batches.

    Returns:
        The validation loss per character, i.e. the average over validation
        batches of the mean cross-entropy of each batch.

    Raises:
        ZeroDivisionError: if ``val_loader`` yields no batch.
    """
    # Default reduction is the mean, so every loss value is already a
    # per-character loss.
    criterion = nn.CrossEntropyLoss()
    # Move batches to wherever the model lives.
    device = next(model.parameters()).device
    was_training = model.training
    model.train()
    before = time.time()
    print("training", len(train_loader), "number of batches")
    for batch_idx, (inputs,targets) in enumerate(train_loader):
        if batch_idx == 0:
            first_time = time.time()
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model(inputs) # 3D
        loss = criterion(outputs.view(-1,outputs.size(2)),targets.view(-1)) # Loss of the flattened outputs
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx == 0:
            print("Time elapsed", time.time() - first_time)

        if batch_idx % 100 == 0 and batch_idx != 0:
            after = time.time()
            lpw = loss.item()  # already averaged per character
            print("Time: ", after - before)
            print("Loss per word: ", lpw)
            print("Perplexity: ", np.exp(lpw))
            before = after  # time the next 100 batches on their own

    # Validation: no parameter update, so skip gradient tracking and use
    # evaluation mode.
    model.eval()
    val_loss = 0
    batch_id=0
    with torch.no_grad():
        for inputs,targets in val_loader:
            batch_id+=1
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.view(-1,outputs.size(2)),targets.view(-1))
            val_loss+=loss.item()
    model.train(was_training)  # leave the model in the mode it came in
    val_lpw = val_loss / batch_id
    print("\nValidation loss per word:",val_lpw)
    print("Validation perplexity :",np.exp(val_lpw),"\n")
    return val_lpw
    

In [8]:
# Fixed-length model: embedding size 256, hidden size 256, 3 LSTM layers.
model = CharLanguageModel(charcount,256,256,3)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001, weight_decay=1e-6)
# The first `split` characters are used for training, the rest for validation.
split = 5000000
train_dataset = TextDataset(shakespeare_array[:split])
val_dataset = TextDataset(shakespeare_array[split:])
# `collate` turns the sampled sequences into seq_len x batch_size tensors.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn = collate)
# drop_last=True discards a final validation batch smaller than 64.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64, collate_fn = collate, drop_last=True)

In [0]:
# Train the fixed-length model for 3 epochs; each call also reports the
# validation loss at the end of the epoch.
for i in range(3):
    train_epoch(model, optimizer, train_loader, val_loader)

training 391 number of batches
Time elapsed 3.9039366245269775


In [0]:
def generate(model, seed,nwords):
    """Continue the text ``seed`` with characters generated by ``model``.

    Args:
        model: object exposing ``generate(seq, n_words)`` that takes a 1-D
            tensor of character ids and returns a tensor of generated ids.
        seed: prompt text; it is encoded with ``sh.map_corpus`` and ``charmap``.
        nwords: number of characters requested from ``model.generate``.

    Returns:
        The generated ids decoded with ``sh.to_text``. The seed itself is not
        included.
    """
    seq = sh.map_corpus(seed, charmap)
    seq = torch.tensor(seq).to(DEVICE)
    # Inference only: avoid recording an autograd graph over every
    # generation step.
    with torch.no_grad():
        out = model.generate(seq,nwords)
    return sh.to_text(out.cpu().numpy(),chars)

In [0]:
# Short check: ask for 8 characters that continue the prompt.
print(generate(model, "To be, or not to be, that is the q",8))

In [0]:
# Long sample: ask for 1000 characters that continue the prompt.
print(generate(model, "Richard ", 1000))

## Packed sequences

In [3]:
# Ids of the newline and space characters.
stop_character = charmap['\n']
space_character = charmap[" "]

# Cut the encoded corpus right after every newline, so that each piece is
# one line including its terminating newline.
line_ends = np.where(shakespeare_array == stop_character)[0] + 1
lines = np.split(shakespeare_array, line_ends)

# Keep only lines with real content. Subtracting the space id turns spaces
# into zeros, which np.trim_zeros strips from both ends. A line made only of
# spaces and its newline is left with a single element and is dropped. The
# kept lines are stored untrimmed.
shakespeare_lines = [
    s for s in lines
    if len(np.trim_zeros(s - space_character)) > 1
]

for i in range(10):
    print(shakespeare_lines[i])
print(len(shakespeare_lines))

[12 17 11 20  0]
[ 1 45 33 30  1 44 40 39 39 30 45 44  0]
[ 1 57 80  1 48 64 67 67 64 56 68  1 44 63 56 66 60 74 71 60 56 73 60  0]
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0]
[ 1  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60
 74  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0]
[ 1  1  1 45 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74
  1 73 70 74 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0]
[ 1  1  1 27 76 75  1 56 74  1 75 63 60  1 73 64 71 60 73  1 74 63 70 76
 67 59  1 57 80  1 75 64 68 60  1 59 60 58 60 56 74 60  8  0]
[ 1  1  1 33 64 74  1 75 60 69 59 60 73  1 63 60 64 73  1 68 64 62 63 75
  1 57 60 56 73  1 63 64 74  1 68 60 68 70 73 80 21  0]
[ 1  1  1 27 76 75  1 75 63 70 76  1 58 70 69 75 73 56 58 75 60 59  1 75
 70  1 75 63 64 69 60  1 70 78 69  1 57 73 64 62 63 75  1 60 80 60 74  8
  0]
[ 1  1  1 31 60 60 59  5 74 75  1 75 63 80  1 67 64 62 63 75  5 74  1 61
 67 56 68 60  1 78 64

In [58]:
import numpy as np

import torch
from torch import nn
from torch.nn.utils.rnn import *

# NOTE(review): this cell loads WSJ .npy files and is not used by any of the
# Shakespeare cells in this notebook; it looks like scratch work from another
# task. `pad_sequence` comes from the star import at the top of this cell.
# NOTE(review): the "train" paths point at the *dev* files - confirm this is
# intended.
path_train = 'wsj0_dev.npy'
path_train_lables = 'wsj0_dev_merged_labels.npy'
path_test = 'wsj0_test.npy'

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only
# load files from a trusted source.
train_data = np.load(path_train, allow_pickle = True, encoding='bytes')
print(train_data.shape)
print(train_data[0].shape)



train_labels = np.load(path_train_lables, allow_pickle = True, encoding= 'bytes')
test = np.load(path_test, allow_pickle = True, encoding='bytes')


# Per-element sizes and per-element tensors of the loaded data.
timeframe_length = []
train_data_list = []

for i in range(train_data.shape[0]):
    tlen = train_data[i].shape[0]
    # NOTE(review): stores rows * 40 rather than the number of rows; 40
    # matches the second dimension printed for train_data[0] - confirm that
    # an element count is what is wanted here.
    timeframe_length.append(tlen*40)
    train_data_list.append(torch.DoubleTensor(train_data[i]))

print("FLAT")
# Concatenate all elements along the first dimension into one 2-D tensor.
train_data_flat = torch.cat(train_data_list)
print(train_data_flat.size())
max_timeframe_length = max(timeframe_length)
timeframe_length = torch.LongTensor(timeframe_length)



# Length of every label sequence, plus the label sequences as tensors.
output_length = []
train_labels_flat = []
for i in range(train_labels.shape[0]):
    output_length.append(train_labels[i].shape[0])
    train_labels_flat.append(torch.LongTensor(train_labels[i]))

##train_labels_flat = torch.LongTensor(train_labels_flat)
output_length = torch.LongTensor(output_length)
max_output_length = max(output_length)  # a 0-dim tensor, since output_length is a tensor


##X1 = pad_sequence(train_data_flat)
# Pad the label sequences to a common length: one row per sequence.
# Despite its name, train_labels_flat is a padded 2-D tensor after this line.
train_labels_flat = pad_sequence(train_labels_flat,batch_first=True )

print(train_data_flat.size())

print(train_labels_flat.size())


(1106,)
(440, 40)
FLAT
torch.Size([693218, 40])
torch.Size([693218, 40])
torch.Size([1106, 193])


In [5]:
class LinesDataset(Dataset):
    """Dataset of variable-length encoded lines for next-character prediction.

    Every line is stored as a tensor. Item ``i`` is the pair
    ``(line[:-1], line[1:])``, i.e. the line without its last character and
    the same line shifted by one, both moved to ``DEVICE``.
    """

    def __init__(self, lines):
        self.lines = [torch.tensor(line) for line in lines]

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, i):
        current = self.lines[i]
        inputs, targets = current[:-1], current[1:]
        return inputs.to(DEVICE), targets.to(DEVICE)

# collate fn lets you control the return value of each batch
# for packed_seqs, you want to return your data sorted by length
def collate_lines(seq_list):
    """Order a batch of (input, target) pairs from longest to shortest input.

    Packed sequences expect this ordering. Pairs whose inputs have the same
    length keep their original relative order.

    Returns:
        Two lists, the reordered inputs and the matching targets.
    """
    inputs, targets = zip(*seq_list)
    # Positions of the batch items, longest input first (the sort is stable).
    order = sorted(
        range(len(inputs)),
        key=lambda idx: len(inputs[idx]),
        reverse=True,
    )
    return [inputs[k] for k in order], [targets[k] for k in order]

In [63]:
# Step-by-step walkthrough, on a single batch, of the packed-sequence
# pipeline: concatenate -> embed -> split per line -> pack -> LSTM -> unpack
# -> flatten -> score. Every intermediate shape is printed.
# The layers built here are throwaway, randomly initialised modules; nothing
# is trained.
vocab_size = 84  # same value as the "Unique character count" printed earlier
embed_size = 256

embedding = nn.Embedding(vocab_size,embed_size)
embedding.to(DEVICE)

split = 100000
train_dataset = LinesDataset(shakespeare_lines[:split])

print(train_dataset)


##train_data_flat
##train_labels_flat


# collate_lines returns the batch as two lists sorted by decreasing length.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn = collate_lines)

i = 0  # counts processed batches; the loop stops after the first one
for inputs,targets in train_loader:
    print(inputs[0].size())

    seq_list = inputs

    batch_size = len(seq_list)
    print("Batchsize")
    print(batch_size)

    lens = [len(s) for s in seq_list] # lens of all lines (already sorted)
    print("lens")
    print(lens)
    print(sum(lens))
    # bounds[k]:bounds[k+1] is the slice of line k inside the concatenation.
    bounds = [0]
    for l in lens:
        bounds.append(bounds[-1]+l) # bounds of all lines in the concatenated sequence. Indexing into the list to
        ##print(l)
        ##print(bounds[-1])  # see where the sequence occurs. Need this at line marked **

    print(len(seq_list))
    print(seq_list[0].size())
    # Embed all lines with a single call on the concatenated ids.
    seq_concat = torch.cat(seq_list) # concatenated sequence
    print("seq_concat")
    print(seq_concat.size())
    print(seq_concat[0:20])
    embed_concat = embedding(seq_concat) # concatenated embeddings

    print("Embedd output")
    print(embed_concat.size())
    print(embed_concat[0,:])

    # ** Cut the embeddings back into one tensor per line. The `i` of the
    # comprehension is local to it and does not touch the batch counter.
    embed_list = [embed_concat[bounds[i]:bounds[i+1]] for i in range(batch_size)]

    print("Embedd List")
    print(embed_concat.size())
    print(len(embed_list))
    print(embed_list[0].size())
    packed_input = rnn.pack_sequence(embed_list)


    # Positional access into the PackedSequence. Per the PyTorch docs the
    # fields are (data, batch_sizes, sorted_indices, unsorted_indices) -
    # confirm against the installed version.
    print("Packed Input")
    print(len(packed_input))

    print(packed_input[0].size())
    print(packed_input[1].size())
    print(packed_input[2])
    print(packed_input[3])


    hidden = None  # no initial state is supplied to the LSTM

    rnn_x = nn.LSTM(input_size = embed_size,hidden_size=256,num_layers=3)
    rnn_x.to(DEVICE)
    output_packed,hidden = rnn_x(packed_input,hidden)

    # The LSTM output is accessed by position in the same way as the input.
    print(len(output_packed))
    print(output_packed[0].size())
    print(output_packed[1].size())
    print(output_packed[2])
    print(output_packed[3])

    print(packed_input[0][0,:])
    print(packed_input[1][2])

    print(output_packed[0][0,:])
    print(output_packed[1][2])


    # Back to a padded tensor; the indexing below treats dim 0 as time and
    # dim 1 as the batch.
    output_padded, _ = rnn.pad_packed_sequence(output_packed)

    print("output pad packed")
    print(output_padded.size())

    # Drop the padding: keep the first lens[i] steps of line i and
    # concatenate them in batch order.
    output_flatten = torch.cat([output_padded[:lens[i],i] for i in range(batch_size)])

    print(output_flatten.size())

    scoring = nn.Linear(256,vocab_size)

    scoring.to(DEVICE)

    scores_flatten = scoring(output_flatten)

    print(scores_flatten.size())



    i = i + 1
    if i == 1:
        break


<__main__.LinesDataset object at 0x7fa1ec97f668>
torch.Size([70])
Batchsize
64
lens
[70, 70, 69, 68, 68, 67, 66, 65, 64, 62, 60, 58, 57, 56, 55, 55, 54, 51, 51, 51, 51, 51, 50, 50, 49, 49, 48, 48, 48, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45, 45, 45, 45, 44, 43, 42, 41, 41, 41, 41, 41, 40, 39, 34, 32, 32, 31, 30, 27, 27, 20, 17]
3068
64
torch.Size([70])
seq_concat
torch.Size([3068])
tensor([ 1,  1,  1,  1,  1, 26, 69, 59,  1, 74, 68, 70, 75, 60,  1, 63, 64, 68,
         8,  1], device='cuda:0')
Embedd output
torch.Size([3068, 256])
tensor([-2.0149e+00, -6.0361e-01, -1.1960e+00, -2.2865e+00, -4.4859e-01,
         7.6630e-01, -8.6979e-02,  2.3454e+00,  1.0890e+00, -3.9619e-01,
        -7.1854e-01, -1.1675e+00,  9.4324e-01, -9.8904e-01,  2.3972e-02,
         4.3202e-01,  1.1914e+00, -2.0167e+00,  1.3723e+00,  1.7466e-01,
         3.8195e-01,  1.5554e+00,  1.3900e+00, -7.5800e-01, -1.0323e+00,
        -1.6727e+00,  4.4193e-01, -5.5694e-01,  3.9919e-02, -9.7039e-02,
         1.745

In [None]:
# NOTE(review): near-verbatim copy of the packed-sequence walkthrough cell
# above, with fewer section labels printed. It has no execution count, so it
# was apparently never run in this form.
# NOTE(review): `train_dataset` is no longer created here (its definition is
# commented out below), so this cell only works if an earlier cell left
# `train_dataset` in the kernel.
vocab_size = 84
embed_size = 256

embedding = nn.Embedding(vocab_size,embed_size)
embedding.to(DEVICE)

##split = 100000
##train_dataset = LinesDataset(shakespeare_lines[:split])

##print(train_dataset)


# NOTE(review): the next two lines are bare expressions with no effect (only
# the last expression of a cell is displayed, and this cell ends with a loop).
train_data_flat
train_labels_flat


train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn = collate_lines)

i = 0  # counts processed batches; the loop stops after the first one
for inputs,targets in train_loader:
    print(inputs[0].size())

    seq_list = inputs

    batch_size = len(seq_list)
    print("Batchsize")
    print(batch_size)

    lens = [len(s) for s in seq_list] # lens of all lines (already sorted)
    print("lens")
    print(lens)
    print(sum(lens))
    # bounds[k]:bounds[k+1] is the slice of line k inside the concatenation.
    bounds = [0]
    for l in lens:
        bounds.append(bounds[-1]+l) # bounds of all lines in the concatenated sequence. Indexing into the list to
        ##print(l)
        ##print(bounds[-1])  # see where the sequence occurs. Need this at line marked **

    print(len(seq_list))
    print(seq_list[0].size())
    seq_concat = torch.cat(seq_list) # concatenated sequence
    print(seq_concat.size())
    print(seq_concat[0:20])
    embed_concat = embedding(seq_concat) # concatenated embeddings
    print(embed_concat.size())
    print(embed_concat[0,:])

    # ** Cut the embeddings back into one tensor per line.
    embed_list = [embed_concat[bounds[i]:bounds[i+1]] for i in range(batch_size)]
    print(len(embed_list))
    print(embed_list[0].size())
    packed_input = rnn.pack_sequence(embed_list)
    print(len(packed_input))

    # Positional access into the PackedSequence fields.
    print(packed_input[0].size())
    print(packed_input[1].size())
    print(packed_input[2])
    print(packed_input[3])


    hidden = None  # no initial state is supplied to the LSTM

    # A fresh, randomly initialised LSTM is created for this inspection.
    rnn_x = nn.LSTM(input_size = embed_size,hidden_size=256,num_layers=3)
    rnn_x.to(DEVICE)
    output_packed,hidden = rnn_x(packed_input,hidden)

    print(len(output_packed))
    print(output_packed[0].size())
    print(output_packed[1].size())
    print(output_packed[2])
    print(output_packed[3])

    print(packed_input[0][0,:])
    print(packed_input[1][2])

    print(output_packed[0][0,:])
    print(output_packed[1][2])


    # Back to a padded tensor, then strip the padding line by line.
    output_padded, _ = rnn.pad_packed_sequence(output_packed)
    print(output_padded.size())

    output_flatten = torch.cat([output_padded[:lens[i],i] for i in range(batch_size)])

    print(output_flatten.size())

    scoring = nn.Linear(256,vocab_size)

    scoring.to(DEVICE)

    scores_flatten = scoring(output_flatten)

    print(scores_flatten.size())



    i = i + 1
    if i == 1:
        break


In [64]:
# Model that takes packed sequences in training
class PackedLanguageModel(nn.Module):
    """Character-level language model trained on variable-length lines.

    Same architecture as the fixed-length model (embedding -> LSTM -> linear
    scoring), but ``forward`` takes a list of sequences of different lengths
    and runs the LSTM on a packed sequence.

    Args:
        vocab_size: number of distinct character ids (V).
        embed_size: dimension of the character embeddings (E).
        hidden_size: dimension of the LSTM hidden state (H).
        nlayers: number of stacked LSTM layers.
        stop: id of the end-of-line character; generation stops once it is
            produced.
    """

    def __init__(self,vocab_size,embed_size,hidden_size, nlayers, stop):
        super(PackedLanguageModel,self).__init__()
        self.vocab_size=vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.nlayers=nlayers
        self.embedding = nn.Embedding(vocab_size,embed_size)
        self.rnn = nn.LSTM(input_size = embed_size,hidden_size=hidden_size,num_layers=nlayers) # nlayers layers, batch_first = False
        self.scoring = nn.Linear(hidden_size,vocab_size)
        self.stop = stop # stop line character (\n)

    def forward(self,seq_list): # list
        """Return the logits of every character of every sequence.

        Args:
            seq_list: list of 1-D tensors of character ids, sorted from
                longest to shortest (required by ``rnn.pack_sequence``).

        Returns:
            Tensor of shape ``sum(lens) x V``: the logits of the first
            sequence, followed by those of the second one, and so on - the
            same order as ``torch.cat`` of the matching targets.
        """
        batch_size = len(seq_list)
        lens = [len(s) for s in seq_list] # lens of all lines (already sorted)
        bounds = [0]
        for l in lens:
            bounds.append(bounds[-1]+l) # bounds of all lines in the concatenated sequence. Indexing into the list to
                                        # see where the sequence occurs. Need this at line marked **
        seq_concat = torch.cat(seq_list) # concatenated sequence
        embed_concat = self.embedding(seq_concat) # concatenated embeddings
        embed_list = [embed_concat[bounds[i]:bounds[i+1]] for i in range(batch_size)] # embeddings per line **
        packed_input = rnn.pack_sequence(embed_list) # packed version

        # alternatively, you could use rnn.pad_sequence, followed by rnn.pack_padded_sequence

        # Omitting the initial state makes the LSTM start from zeros.
        output_packed, _ = self.rnn(packed_input)
        output_padded, _ = rnn.pad_packed_sequence(output_packed) # unpacked output (padded). Also gives you the lengths
        # Drop the padding: keep the first lens[i] steps of sequence i.
        output_flatten = torch.cat([output_padded[:lens[i],i] for i in range(batch_size)]) # concatenated output
        return self.scoring(output_flatten) # concatenated logits

    def generate(self,seq, n_words):
        """Greedily generate characters that continue ``seq``.

        Generation ends after ``n_words`` characters or as soon as the stop
        character has been produced, whichever comes first. The stop
        character is included in the result.

        Args:
            seq: 1-D tensor of character ids (length L) used as the prompt.
            n_words: maximum number of characters to generate. At least one
                character is always produced.

        Returns:
            1-D tensor holding the generated character ids.
        """
        generated_words = []
        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            embed = self.embedding(seq).unsqueeze(1) # L x 1 x E
            # The LSTM returns (output, state). Indexing that tuple with [0]
            # before unpacking would try to unpack the output tensor itself.
            output_lstm, hidden = self.rnn(embed) # L x 1 x H
            scores = self.scoring(output_lstm[-1]) # 1 x V, from the last time step
            _,current_word = torch.max(scores,dim=1) # shape (1,)
            generated_words.append(current_word)
            for step in range(n_words-1):
                # Checked before every further step, so a stop character
                # produced as the very first character also ends generation.
                if current_word[0].item()==self.stop: # If end of line
                    break
                embed = self.embedding(current_word).unsqueeze(0) # 1 x 1 x E
                output_lstm, hidden = self.rnn(embed,hidden) # 1 x 1 x H
                scores = self.scoring(output_lstm[0]) # 1 x V
                _,current_word = torch.max(scores,dim=1) # shape (1,)
                generated_words.append(current_word)
        return torch.cat(generated_words,dim=0)

In [65]:
def train_epoch_packed(model, optimizer, train_loader, val_loader):
    """Train a packed-sequence model for one epoch, then validate it.

    Args:
        model: module mapping a list of 1-D id tensors to concatenated
            logits of shape ``sum(lens) x V``.
        optimizer: optimizer updating the parameters of ``model``.
        train_loader: sized iterable of ``(inputs, targets)`` batches, each a
            list of tensors that is already sorted and on the model's device.
        val_loader: iterable of validation batches in the same format.

    Returns:
        The validation loss per character: the summed cross-entropy divided
        by the total number of validation characters.

    Raises:
        ZeroDivisionError: if ``val_loader`` yields no character.
    """
    # Sum instead of averaging, to take into account the different lengths.
    # The criterion has no parameters or buffers, so it needs no device move.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    was_training = model.training
    model.train()
    batch_id=0
    before = time.time()
    print("Training", len(train_loader), "number of batches")
    for inputs,targets in train_loader: # lists, presorted, preloaded on GPU
        batch_id+=1
        outputs = model(inputs)
        loss = criterion(outputs,torch.cat(targets)) # criterion of the concatenated output
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch_id % 100 == 0:
            after = time.time()
            nwords = sum(len(l) for l in inputs)
            lpw = loss.item() / nwords  # per-character loss of this batch
            print("Time elapsed: ", after - before)
            print("At batch",batch_id)
            print("Training loss per word:",lpw)
            print("Training perplexity :",np.exp(lpw))
            before = after

    # Validation: no parameter update, so skip gradient tracking and use
    # evaluation mode.
    model.eval()
    val_loss = 0
    nwords = 0
    with torch.no_grad():
        for inputs,targets in val_loader:
            nwords += sum(len(l) for l in inputs)
            outputs = model(inputs)
            loss = criterion(outputs,torch.cat(targets))
            val_loss+=loss.item()
    model.train(was_training)  # leave the model in the mode it came in
    val_lpw = val_loss / nwords
    print("\nValidation loss per word:",val_lpw)
    print("Validation perplexity :",np.exp(val_lpw),"\n")
    return val_lpw

In [66]:
# Packed-sequence model: embedding size 256, hidden size 256, 3 LSTM layers,
# stopping generation at the newline character.
model = PackedLanguageModel(charcount,256,256,3, stop=stop_character)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001, weight_decay=1e-6)
# The first `split` lines are used for training, the rest for validation.
split = 100000
train_dataset = LinesDataset(shakespeare_lines[:split])
val_dataset = LinesDataset(shakespeare_lines[split:])
# collate_lines returns each batch as lists sorted by decreasing length.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn = collate_lines)
# drop_last=True discards a final validation batch smaller than 64.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64, collate_fn = collate_lines, drop_last=True)

In [67]:
# Train the packed-sequence model for up to 20 epochs; each call also reports
# the validation loss at the end of the epoch. The saved output below shows
# this run was interrupted by hand.
for i in range(20):
    train_epoch_packed(model, optimizer, train_loader, val_loader)

Training 1563 number of batches
Time elapsed:  4.3084876537323
At batch 100
Training loss per word: 2.8123067679753064
Training perplexity : 16.64827767237601
Time elapsed:  4.242072105407715
At batch 200
Training loss per word: 2.221636252428489
Training perplexity : 9.222408715591532


KeyboardInterrupt: 

In [0]:
# Persist the trained model to disk.
# NOTE(review): passing the module itself pickles the whole object, so
# loading it back needs the same class definition to be available; saving
# model.state_dict() is the commonly recommended alternative - confirm which
# one is wanted.
torch.save(model, "trained_model.pt")

In [0]:
# Ask for up to 20 characters that continue the prompt.
print(generate(model, "To be, or not to be, that is the q",20))

In [0]:
# Ask for up to 1000 characters that continue the prompt.
print(generate(model, "Richard ", 1000))

In [0]:
# Same request with a different prompt.
print(generate(model, "Hello", 1000))

### Reminders

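- By default, all recurrent modules (`nn.RNN`, `nn.GRU`, `nn.LSTM`) use `batch_first=False`, i.e. they expect inputs laid out as seq_len x batch_size x features.
- To use packed sequences, your inputs first need to be sorted in descending order of length (longest to shortest), unless you pass `enforce_sorted=False` to `pack_sequence` / `pack_padded_sequence`.
- Without packing (or padding), all the inputs of a batch need to have the same length.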