In [2]:
import numpy as np
import pandas as pd
import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the bare file name to get a usable path.
        print(os.path.join(dirname, filename))
import torch
import torch.nn as nn
from collections import Counter
import warnings
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Hyperparameters and runtime configuration read as globals by the training cell.
# Number of sequences per training batch.
batch_size = 16
# Number of words in each training sequence.
seq_size = 32
# Width of each word-embedding vector.
embedding_size = 64
# Width of the LSTM hidden and cell states.
lstm_size = 64
# Maximum gradient norm passed to gradient clipping during training.
gradients_norm = 5
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Read the whole CSV as one text blob. The encoding is pinned so decoding does
# not depend on the platform's default locale (UTF-8 matches the Linux default).
with open('Shakespeare_data.csv', 'r', encoding='utf-8') as f:
    doc = f.read()

# Data preparation

Function to convert the whole dataset into a list of words. Leading and trailing quote and backslash characters are stripped from each line.

In [5]:

def doc2words(doc):
    """Split raw text into a flat list of whitespace-separated words.

    Every line first loses any leading/trailing backslash and double-quote
    characters; the cleaned lines are then joined with single spaces and
    split on whitespace.

    Args:
        doc: the full document as one string, lines separated by '\\n'.

    Returns:
        A list of word strings in document order.
    """
    stripped_lines = []
    for raw_line in doc.split('\n'):
        # strip() treats its argument as a set of characters: here '\' and '"'.
        stripped_lines.append(raw_line.strip(r'\"'))
    return ' '.join(stripped_lines).split()

Function to remove all punctuation characters from each word.

In [6]:
def removepunct(words):
    """Delete every ASCII punctuation character from each word.

    Args:
        words: an iterable of word strings.

    Returns:
        A list with one entry per input word, in the same order. A word made
        only of punctuation becomes the empty string (it is not dropped).
    """
    # Translation table that maps every character of string.punctuation to "deleted".
    strip_table = str.maketrans('', '', string.punctuation)
    return [word.translate(strip_table) for word in words]

Function to build the vocabulary: the distinct words, sorted by their number of occurrences (least frequent first).

In [7]:
# Build the vocabulary from a word list.
def getvocab(words):
    """Return the distinct words ordered by ascending frequency.

    Args:
        words: an iterable of word strings.

    Returns:
        A list of the unique words, least frequent first. Words with equal
        counts keep their first-seen order (the sort is stable).
    """
    counts = Counter(words)
    by_count = sorted(counts.items(), key=lambda pair: pair[1])
    return [word for word, _ in by_count]

To simplify training, we create two dictionaries that map each word to a distinct integer and vice versa.

In [8]:

def vocab_map(vocab):
    """Build the two lookup tables between words and integer ids.

    Args:
        vocab: an iterable of words; a word's position becomes its id.

    Returns:
        A tuple (int_to_vocab, vocab_to_int) of dictionaries: id -> word and
        word -> id.
    """
    int_to_vocab = dict(enumerate(vocab))
    vocab_to_int = {}
    for index, word in int_to_vocab.items():
        vocab_to_int[word] = index
    return int_to_vocab, vocab_to_int

Let's use these functions for our dataset:

In [9]:
# Tokenize the raw text, then delete punctuation from every token.
words = removepunct(doc2words(doc))
# Distinct words, ordered by how often they occur.
vocab = getvocab(words)
# Lookup tables between integer ids and words (both directions).
int_to_vocab, vocab_to_int = vocab_map(vocab)

Function to create batches of sequences:

In [10]:
def get_batches(words, vocab_to_int, batch_size, seq_size):
    """Yield (inputs, targets) batches of word ids for next-word prediction.

    The word list is encoded to integer ids and truncated to a whole number of
    batches. Targets are the inputs shifted left by one position; the very
    last target wraps around to the first input id.

    Args:
        words: sequence of word strings.
        vocab_to_int: mapping from word to integer id (KeyError on unknown words).
        batch_size: number of sequences per batch.
        seq_size: number of words per sequence.

    Yields:
        Tuples (Xs, Ys) of int64 arrays, each of shape (batch_size, seq_size).
        Nothing is yielded when there are fewer than batch_size * seq_size words.
    """
    word_ints = [vocab_to_int[word] for word in words]
    tokens_per_batch = batch_size * seq_size
    num_batches = len(word_ints) // tokens_per_batch
    if num_batches <= 0:
        # Not enough data for a single batch: produce an empty iterator
        # instead of failing on the wrap-around assignment below.
        return

    n_rows = num_batches * batch_size
    # int64 explicitly: these ids are later used as embedding indices and
    # cross-entropy targets, and the default integer width is platform-dependent.
    Xs = np.asarray(word_ints[:num_batches * tokens_per_batch], dtype=np.int64)
    Ys = np.empty_like(Xs)
    Ys[:-1] = Xs[1:]
    Ys[-1] = Xs[0]  # the last target wraps around to the first word
    Xs = Xs.reshape(n_rows, seq_size)
    Ys = Ys.reshape(n_rows, seq_size)

    # Consecutive groups of batch_size rows form one batch.
    for i in range(0, n_rows, batch_size):
        yield Xs[i:i + batch_size, :], Ys[i:i + batch_size, :]

# Model

In [11]:
class RNNModule(nn.Module):
    """Word-level language model: embedding -> single-layer LSTM -> linear layer.

    Args:
        n_vocab: number of distinct words (rows of the embedding table and
            width of the output layer).
        seq_size: sequence length; stored on the instance only.
        embedding_size: width of each word-embedding vector.
        lstm_size: width of the LSTM hidden and cell states.
    """

    def __init__(self, n_vocab, seq_size=32, embedding_size=64, lstm_size=64):
        super().__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        # Lookup table holding one embedding_size-wide vector per vocabulary
        # entry; it turns a batch of word ids into a dense representation.
        self.embedding = nn.Embedding(num_embeddings=n_vocab,
                                      embedding_dim=embedding_size)
        # batch_first=True: LSTM inputs and outputs are laid out (batch, seq, feature).
        self.lstm = nn.LSTM(input_size=embedding_size,
                            hidden_size=lstm_size,
                            batch_first=True)
        # Maps every LSTM output vector to one score per vocabulary word.
        self.dense = nn.Linear(in_features=lstm_size, out_features=n_vocab)

    def forward(self, x, prev_state):
        """Run one forward pass.

        Args:
            x: tensor of word ids fed to the embedding layer.
            prev_state: (hidden, cell) state tuple handed to the LSTM.

        Returns:
            (logits, state): per-position vocabulary scores and the LSTM's
            new (hidden, cell) state.
        """
        hidden_seq, next_state = self.lstm(self.embedding(x), prev_state)
        return self.dense(hidden_seq), next_state

    def zero_state(self, batch_size):
        """Return an all-zero (hidden, cell) pair of shape (1, batch_size, lstm_size)."""
        shape = (1, batch_size, self.lstm_size)
        return torch.zeros(shape), torch.zeros(shape)

In [12]:
def get_loss_and_train_op(net, lr=0.001):
    """Create the loss function and optimizer used for training.

    Args:
        net: the module whose parameters the optimizer will update.
        lr: learning rate for Adam.

    Returns:
        A tuple (criterion, optimizer): a cross-entropy loss and an Adam
        optimizer over net.parameters().
    """
    return nn.CrossEntropyLoss(), torch.optim.Adam(net.parameters(), lr=lr)

In [13]:
def generate_text(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5, n_words=100):
    """Generate text from seed words and print it.

    The seed words are fed through the network one at a time to warm up the
    LSTM state. After that, each next word is drawn uniformly at random from
    the top_k highest-scoring candidates and fed back in as the next input.

    Args:
        device: torch device the input tensors and state are moved to.
        net: model exposing zero_state(batch_size) and
            net(ix, (h, c)) -> (logits, (h, c)). It is switched to eval mode.
        words: non-empty sequence of seed words; it is not modified.
        n_vocab: unused; kept so existing callers keep working.
        vocab_to_int: mapping from word to integer id.
        int_to_vocab: mapping from integer id to word.
        top_k: number of highest-scoring candidates to sample from; clamped to
            the number of scores the model returns.
        n_words: number of words generated after the first sampled word.

    Raises:
        ValueError: if words is empty (there is nothing to condition on).
    """
    seed = list(words)
    if not seed:
        raise ValueError("generate_text needs at least one seed word")

    def _sample_next(logits):
        # logits[0] holds the scores for the single position that was fed in.
        k = min(top_k, logits.shape[-1])
        _, top_ix = torch.topk(logits[0], k=k)
        return int(np.random.choice(top_ix.tolist()[0]))

    net.eval()

    # Work on a copy so the caller's seed list is left untouched.
    generated = list(seed)

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)

    # Inference only: skip autograd bookkeeping so no graph builds up across steps.
    with torch.no_grad():
        # Warm up the recurrent state on the seed words.
        for w in seed:
            ix = torch.tensor([[vocab_to_int[w]]]).to(device)
            output, (state_h, state_c) = net(ix, (state_h, state_c))

        choice = _sample_next(output)
        generated.append(int_to_vocab[choice])

        # Feed each sampled word back in to predict the following one.
        for _ in range(n_words):
            ix = torch.tensor([[choice]]).to(device)
            output, (state_h, state_c) = net(ix, (state_h, state_c))
            choice = _sample_next(output)
            generated.append(int_to_vocab[choice])

    print(' '.join(generated))

# Train

In [20]:
def train_rnn(words, vocab_to_int, int_to_vocab, n_vocab, n_epochs=5, lr=0.01):
    """Train an RNNModule on the word sequence and return the trained network.

    Reads the notebook-level settings seq_size, embedding_size, lstm_size,
    batch_size, gradients_norm and device.

    Args:
        words: sequence of training words.
        vocab_to_int: mapping from word to integer id.
        int_to_vocab: unused; kept so existing callers keep working.
        n_vocab: vocabulary size used to build the model.
        n_epochs: number of passes over the data.
        lr: learning rate handed to the optimizer.

    Returns:
        The trained RNNModule, located on `device`.
    """
    print("training started")
    # RNN instance
    net = RNNModule(n_vocab, seq_size, embedding_size, lstm_size)
    net = net.to(device)
    criterion, optimizer = get_loss_and_train_op(net, lr)

    # Nothing below switches the network to eval mode, so set training mode once.
    net.train()

    iteration = 0
    for e in range(n_epochs):
        batches = get_batches(words, vocab_to_int, batch_size, seq_size)
        print("batches taken")

        # Every epoch starts from a fresh all-zero recurrent state.
        state_h, state_c = net.zero_state(batch_size)
        state_h = state_h.to(device)
        state_c = state_c.to(device)

        for x, y in batches:
            iteration += 1

            # Reset all gradients
            optimizer.zero_grad()

            # Transfer data to the target device; ids must be int64 for the
            # embedding lookup and the cross-entropy targets.
            x = torch.tensor(x, dtype=torch.long).to(device)
            y = torch.tensor(y, dtype=torch.long).to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            # CrossEntropyLoss wants the class dimension second: (batch, n_vocab, seq).
            loss = criterion(logits.transpose(1, 2), y)

            # Cut the graph at the batch boundary: the state values carry over,
            # but gradients never flow into earlier batches. This is also why
            # backward() does not need retain_graph.
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss_value = loss.item()

            # Perform back-propagation
            loss.backward()

            # Clip the global gradient norm to keep updates bounded.
            _ = torch.nn.utils.clip_grad_norm_(net.parameters(), gradients_norm)

            # Update the network's parameters
            optimizer.step()

            print('Epoch: {}/{}'.format(e, n_epochs),'Iteration: {}'.format(iteration),'Loss: {}'.format(loss_value))

    return net

In [None]:
# Train on the full corpus; the vocabulary size determines the model's embedding table and output layer.
rnn_net = train_rnn(words, vocab_to_int, int_to_vocab, len(vocab))

training started
batches taken
Epoch: 0/200 Iteration: 1 Loss: 11.78191089630127
Epoch: 0/200 Iteration: 2 Loss: 11.76327133178711
Epoch: 0/200 Iteration: 3 Loss: 11.729771614074707
Epoch: 0/200 Iteration: 4 Loss: 11.669238090515137
Epoch: 0/200 Iteration: 5 Loss: 11.574382781982422
Epoch: 0/200 Iteration: 6 Loss: 11.313254356384277
Epoch: 0/200 Iteration: 7 Loss: 10.88914966583252
Epoch: 0/200 Iteration: 8 Loss: 10.259749412536621
Epoch: 0/200 Iteration: 9 Loss: 9.778647422790527
Epoch: 0/200 Iteration: 10 Loss: 9.61694049835205
Epoch: 0/200 Iteration: 11 Loss: 9.441267013549805
Epoch: 0/200 Iteration: 12 Loss: 9.556185722351074
Epoch: 0/200 Iteration: 13 Loss: 9.880626678466797
Epoch: 0/200 Iteration: 14 Loss: 9.302845001220703
Epoch: 0/200 Iteration: 15 Loss: 9.583809852600098
Epoch: 0/200 Iteration: 16 Loss: 9.747956275939941
Epoch: 0/200 Iteration: 17 Loss: 9.978289604187012
Epoch: 0/200 Iteration: 18 Loss: 9.799871444702148
Epoch: 0/200 Iteration: 19 Loss: 10.061375617980957
Epoc