In [1]:
# Import the dependency libraries
import string
from collections import Counter
import re     # Regular expression
import numpy as np
import nltk

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# Download the dataset
!wget https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz

# unzip the .tgz file
!tar -xvf  'wikitext-2.tgz'

#Load the train dataset
with open('/content/wikitext-2/train.csv') as f:
    lines = f.readlines()

--2021-09-26 05:37:51--  https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.198.152
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.198.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4070055 (3.9M) [application/x-tar]
Saving to: ‘wikitext-2.tgz.1’


2021-09-26 05:37:53 (2.63 MB/s) - ‘wikitext-2.tgz.1’ saved [4070055/4070055]

wikitext-2/
wikitext-2/train.csv
wikitext-2/test.csv


In [3]:
lines[:4]

['" \n',
 ' = 2013 – 14 York City F.C. season = \n',
 ' \n',
 ' The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . \n']

In [4]:
class Dataset(Dataset):
   
    def __init__(self,sequence_length,text_data):
        self.sequence_length = sequence_length
        self.text_data = text_data
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
        
        #map the words to the unique words and get the index values
        self.words_indexes = [self.word_to_index[w] for w in self.words]

    # Text cleaning
    def load_words(self):
        # Remove trailing / leading special characters from strings list
        string_list = list(map(lambda s: s.strip(), self.text_data))

        #join the each sentance
        data = [''.join(string_list)]

        #lower casing text
        string_list = [each_string.lower() for each_string in data]

        # Drop digits and special characters in text
        string_list = [re.sub(r'[^a-zA-Z]',' ',string) for string in string_list]

        # remove white space in text
        string_list = [" ".join(i.split()) for i in string_list]
        for word in string_list: pass
         
        # returns all the words in dataset
        return word.split()
         
    def get_uniq_words(self):
        #Counter() method returns unique words and its count 
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)
     
    #Function which returns the size of the dataset
    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    #function which returns a sample(in this case X and y) from the dataset given an index.
    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1])
        )

In [5]:
class Model(nn.Module):
    def __init__(self, dataset):
        super(Model, self).__init__()
        self.lstm_size = 128     # Number of time steps for lstm layer
        self.embedding_dim = 128
        self.num_layers = 3

        n_vocab = len(dataset.uniq_words)

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,            # number of unique words
            embedding_dim=self.embedding_dim   # the size of each embedding vector/word
        )
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,  #The number of expected features(embedding size) in the input x
            hidden_size=self.lstm_size, #The number of features in the hidden state h
            num_layers=self.num_layers, # Number of LSTM layers
            dropout=0.2
        )

        #nn.Linear is a function that takes the number of input and output features as parameters and prepares the necessary matrices for forward propagation.
        self.fc = nn.Linear(self.lstm_size, n_vocab) 

    def forward(self, x, prev_state):
        # Initially we pass given input to the embedding layer to get emeddings for given input
        embed = self.embedding(x)

        # now this embeddings has been passed to Lstm layer which returns hidden state and output of current state
        output, state = self.lstm(embed, prev_state)

        # And we pass this output to fully connected liner activation layer to prdict the final output
        logits = self.fc(output)
        return logits, state

    # initilize the initial hidden state and cell state
    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size))

In [6]:
# function to train the model
def train(dataset, model, batch_size, max_epochs, sequence_length ):

    #Set the module in Training mode(This is equivalent with self.train(True))
    model.train()

    # Instantiate the data loader
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # Instantiate type of loss for model
    criterion = nn.CrossEntropyLoss()

    # Instantiate optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        # initilize the initial hidden state and cell state for each initial epoch
        state_h, state_c = model.init_state(sequence_length)

        # loop all batches for each epoch
        for batch, (x, y) in enumerate(dataloader):

            # we need to set the gradients to zero before starting to do backpropragation
            optimizer.zero_grad()
            
            # get the predicted output for each batch 
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            
            #calculate the loss using CrossEntropyLoss
            loss = criterion(y_pred.transpose(1, 2), y)

            #The hidden state in an LSTM is suppose to serve as “memory”. We start off with an initial hidden state, but this hidden state isn’t 
            #suppose to be learned, so we detach it to let the model use those values but to not compute gradients w.r.t.
            state_h = state_h.detach()
            state_c = state_c.detach()

            # backpropagate the loss(computes dloss/dx for every parameter x)
            loss.backward()

            # perform a parameter update based on the current gradient
            optimizer.step()
            
            print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

In [17]:
#function to predict unseen text using trained model
def predict(dataset, model, text, next_words=50):

    #Set the module in evaluation mode(This is equivalent with self.train(False))
    model.eval()
    
    #split the given text
    words = text.split(' ')

    # initilize the initilize hidden state and cell state
    state_h, state_c = model.init_state(len(words))

    #loop the model to predict given number of next words
    for i in range(0, next_words):

        #get index values for given text by it to existing corpus
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])

        #predict the next word using trained model
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        
        # get the last network output
        last_word_logits = y_pred[0][-1]

        # applying softmax on last network output
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        
        # select the highest probability of word index value
        word_index = np.random.choice(len(last_word_logits), p=p)

        #get the word by mapping index value to the corpus and append it to the words list(x_data) to predict the next word
        words.append(dataset.index_to_word[word_index])

    return words

In [8]:
sequence_length= 5 # sequence_length
text_data = lines[:1000]   #text_data
batch_size= 200   # batch_size
max_epochs = 15   #max_epochs

In [9]:
#Instantiate the dataset class
dataset = Dataset(sequence_length, text_data)

In [10]:
print('There are total {} unique numbers of words in corpus.'.format(len(dataset)))

There are total 45699 unique numbers of words in corpus.


In [11]:
# Call the DataLoader method from pytorch 
# Dataloader has been used to parallelize the data loading as this boosts up the speed and saves memory.
dataloader = DataLoader(dataset, batch_size)

In [12]:
# View samples from dataset 
for batch, (x, y) in enumerate(dataloader):
  print(batch)
  print(x)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [ 177,    0, 2223, 6251,    2],
        [   0, 2223, 6251,    2,   31],
        [2223, 6251,    2,   31,   62],
        [6251,    2,   31,   62,    3],
        [   2,   31,   62,    3,    0],
        [  31,   62,    3,    0,  606],
        [  62,    3,    0,  606, 2419],
        [   3,    0,  606, 2419,  479],
        [   0,  606, 2419,  479,    0],
        [ 606, 2419,  479,    0,   29],
        [2419,  479,    0,   29, 6252],
        [ 479,    0,   29, 6252, 6253],
        [   0,   29, 6252, 6253,    2],
        [  29, 6252, 6253,    2,    0],
        [6252, 6253,    2,    0, 2123],
        [6253,    2,    0, 2123,    2],
        [   2,    0, 2123,    2,  737],
        [   0, 2123,    2,  737,  205],
        [2123,    2,  737,  205,    3],
        [   2,  737,  205,    3, 1415],
        [ 737,  205,    3, 1415,    3],
        [ 205,    3, 1415,    3,  596],
        [   3, 1415,    3,  596,  562],
        [1415, 

There are 228 Batches and every batch contains 200 samples.

In [13]:
#Instatntiate the model
model = Model(dataset)

In [14]:
# Train the Model
train(dataset, model, batch_size, max_epochs, sequence_length)

{'epoch': 0, 'batch': 0, 'loss': 8.790940284729004}
{'epoch': 0, 'batch': 1, 'loss': 8.787612915039062}
{'epoch': 0, 'batch': 2, 'loss': 8.77902889251709}
{'epoch': 0, 'batch': 3, 'loss': 8.770212173461914}
{'epoch': 0, 'batch': 4, 'loss': 8.750324249267578}
{'epoch': 0, 'batch': 5, 'loss': 8.738950729370117}
{'epoch': 0, 'batch': 6, 'loss': 8.733614921569824}
{'epoch': 0, 'batch': 7, 'loss': 8.681722640991211}
{'epoch': 0, 'batch': 8, 'loss': 8.646406173706055}
{'epoch': 0, 'batch': 9, 'loss': 8.552362442016602}
{'epoch': 0, 'batch': 10, 'loss': 8.406164169311523}
{'epoch': 0, 'batch': 11, 'loss': 8.29684829711914}
{'epoch': 0, 'batch': 12, 'loss': 7.970916271209717}
{'epoch': 0, 'batch': 13, 'loss': 7.782753944396973}
{'epoch': 0, 'batch': 14, 'loss': 7.735495090484619}
{'epoch': 0, 'batch': 15, 'loss': 7.872032642364502}
{'epoch': 0, 'batch': 16, 'loss': 7.899075031280518}
{'epoch': 0, 'batch': 17, 'loss': 7.833868980407715}
{'epoch': 0, 'batch': 18, 'loss': 7.832111358642578}
{'epo

In [18]:
# Predict next 50 words by using trained model
print(predict(dataset, model, text='competitive association football'))

['competitive', 'association', 'football', 'originated', 'into', 'water', 'the', 'unk', 'unk', 'is', 'various', 'sorraia', 'for', 'the', 'first', 'early', 'and', 'watched', 'gold', 'have', 'miranic', 'had', 'the', 'mengden', 'and', 'dalmeny', 'bay', 'that', 'clinical', 'gone', 'was', 'in', 'the', 'treaty', 'in', 'the', 'club', 'screening', 'brown', 'bugs', 'show', 'in', 'artists', 'to', 't', 'dark', 'occurring', 'that', 'the', 'majority', 'on', 'the', 'years']


## Conclusion:
* Here we created LSTM model which is trained on "York City Football Club"(By using some portion of dataset) wikipedia text data. And by using this model we can predict next word for a given sentance.
* As more data we can train the model which will reevaluate the weights to understand the core features of paragraphs/sentences to predict good results.
* To summarize, in this project, we covered a lot of things related to Natural Language Generation(NLG) such as dataset preparation, how a neural language model is trained, and finally Natural Language Generation process in PyTorch.