## Seq2Seq with LSTM 
1. tokenize input data and Do a train test split
2. Build Vocab on training data
3. Build Encoder module
4. Build Decoder module
5. Seq2Seq module (which uses both encoder and decoder)

- Reference : 
    - https://arxiv.org/pdf/1409.3215.pdf
    - https://github.com/pytorch/text/blob/release/0.9/examples/legacy_tutorial/migration_tutorial.ipynb


In [390]:
import torch
import torch.nn as nn
import torch.optim as optim
# from torchtext.legacy import Field, BucketIterator

# !pip install torch==1.8.0 torchtext==0.9.0
from torchtext.datasets import IMDB,IWSLT2016
from torchtext.legacy import data
from torchtext.legacy import datasets

import numpy as np
import spacy
import random


from torch.utils.tensorboard import SummaryWriter # to print to tensorboard

## Save_checkPoints and Load_checkpoints

In [391]:
def save_checkpoint(state, file_name = "mycheckpoint.pth.ptar"):
    print("Saving checkpoints --->\n")
    torch.save(state,file_name)
    

def load_checkpoint(model, optimizer, checkpoint):
    print("Loading checkpoint --->\n")
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])


"""

# ................... How to use .................
# checkpoint = {"state_dict":model.state_dict(),
#               "optimizer": optimizer.state_dict()
#               }

# save_checkpoint(checkpoint)

# load_checkpoint(model, optimizer, checkpoint)

"""

'\n\n# ................... How to use .................\n# checkpoint = {"state_dict":model.state_dict(),\n#               "optimizer": optimizer.state_dict()\n#               }\n\n# save_checkpoint(checkpoint)\n\n# load_checkpoint(model, optimizer, checkpoint)\n\n'

## pre-processing
1. Train/validate/test split: generate train/validate/test data set if they are available
2. Tokenization: break a raw text string sentence into a list of words
3. Vocab: define a "contract" from tokens to indexes
4. Numericalize: convert a list of tokens to the corresponding indexes
5. Batch: generate batches of data samples and add padding if necessary

In [392]:
train_iter, validate_iter, test_iter = IWSLT2016(split=('train', 'valid', 'test'))

RuntimeError: Internal error: confirm_token was not found in Google drive link.

In [382]:
# train_iter, test_iter = IMDB(split=('train', 'test'))

- Load data and do train-validate-test split
- Build Vocabulary

In [383]:
for a,b in enumerate(train_iter):
    print(a)
    print(b)
    break
    

0
('Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', ['Saint Bernadette Soubirous'], [515])


In [384]:
print(next(iter(train_iter)))

('Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'What is in front of the Notre Dame Main Building?', ['a copper statue of Christ'], [188])


In [361]:
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')
# train_iter = IMDB(split='train')
counter = Counter()
for (label, line) in train_iter:
    # print(label)
    # print(line)
    counter.update(tokenizer(line))
    
vocab = Vocab(counter, min_freq=10, specials=('<unk>', '<pad>', '<BOS>', '<EOS>'))

In [362]:
print(vocab.stoi['<unk>'])
print(vocab.stoi['<pad>'])
print(vocab.stoi['<BOS>'])
print(vocab.stoi['<EOS>'])

0
1
2
3


In [363]:
# tokenizer("here is an example")
print("The length of the new vocab is", len(vocab))

'''
idx = vocab.stoi
print(idx)

new_stoi = vocab.stoi
print("The index of '' is", new_stoi['the'])
'''

text_transform = lambda text: [vocab['<BOS>']] + [vocab[token] for token in tokenizer(text)] + [vocab['<EOS>']]
# pos = 1 and neg = 0
label_transform = lambda lable: 1 if lable == 'pos' else 0


print("output of the text_transform:", text_transform("here is an example"))
print("result : ", label_transform("neg"))


The length of the new vocab is 20437
output of the text_transform: [2, 134, 12, 43, 467, 3]
result :  0


In [364]:
# ---------------------------------- OLD LEGEACY CODE ---------------------------------------

# spacy_german = spacy.load('de')
# spacy_english = spacy.load('en')

# def tokenizer_german(text):
#     return [token.text for token in spacy_german.tokenizer(text)]

# def tokenizer_english(text):
#     return [token.text for token in spacy_english.tokenizer(text)]

# # <sos> : start of sentence
# german = Field(tokenizer = tokenizer_german, lower = True, init_token = '<sos>', eos_token = '<eos>')
# english = Field(tokenizer = tokenizer_english, lower = True, init_token = '<sos>', eos_token = '<eos>')

# train_data, validate_data, Test_data = Multi30k.splits(exts = ('.de','.en'), fields =(german, english))
# # Build Vocab
# german.build_vocab(train_data, max_size = 10000, min_freq = 2)
# english.build_vocab(train_data, max_size = 10000, min_freq = 2)

## Encoder

In [365]:
# input_size : size of vocabulary
# embedding_side : each is mapped to some d-dimenssional space
class Encoder(nn.Module):
    def __init__(self,input_size, embedding_size, hidden_size, number_of_layers,dropout):
        super(Encoder,self).__init__()
        self.hidden_size = hidden_size
        self.number_of_layers = number_of_layers

        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size,embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, number_of_layers, dropout = dropout)
    
# x: vector of indices for a given input sentence
    def forward(self, x):
    # x(shape) : sequence_length x N(batch size) 
        
        embedding = self.dropout(emb = self.embedding(x))
        # embedding_shape : (sequence_length, N, embedding_size)
        # for each word we can have mapping to some d-dimenssion space ; d is size of embedding vector
        output, (hiddden, cell) = self.rnn(embedding)
        return hiddden, cell

## Decoder

In [366]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, number_of_layers, dropout):
        super(Decoder,self).__init__()
        self.hidden_size = hidden_size
        self.number_of_layer = number_of_layers

        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, number_of_layers, dropout=dropout)
        self.fully_connected = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden, cell):
        # Shape of x : N but we want (1,N); we will get one token as output and we want to pass it as next input so....
        x = x.unsqueeze(0)
        
        embedding = self.dropout(self.embedding(x))                 # embedding shape : (1, N, embedding_size)
        output, (hidden,cell) = self.rnn(embedding, (hidden,cell))  # shape of output : (1, N, hidden_size)
        # predictions = self.fully_connected(output)                  # shape of output : (1, N, length_of_vocab)
        output = self.softmax(self.fully_connected(output[0]))
        
        # prediction = predictions.squeeze(0)
        # shape : (N, length_of_vocab)
        return output, hidden, cell


## Seq2Seq

In [367]:
class Seq2Seq(nn.Module):
    def __init__(self,encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source_sentence, target_setence,teacher_force_ration=0.5):
        batch_size = source_sentence[1]
        target_len = target_setence[0]
        target_vocab_size = len(vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size)

        hidden, cell = self.encoder(source_sentence)
        
        # grab start token
        x = target_setence[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x,hidden,cell)
            outputs[t] = output

            # output shape: (N, target_vocab_size)

            best_guess = output.argmax(1)

            x = target_setence[t] if random.random() < teacher_force_ration else best_guess

        return outputs



In [368]:
train_iter, test_iter = IMDB(split=('train', 'test'))
train_list = list(train_iter)

In [369]:
print(len(train_list))
print((train_list[1]))

25000
('neg', '"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears n

### create batch and each batch has similer size of input

In [370]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import random

batch_size = 64

# process the raw text data and add padding to dynamically match the longest sentence in a batch
def collate_batch(batch_of_data):
    label_list, text_list = [], []
    for _label, _text in batch_of_data:
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        text_list.append(processed_text)
    return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0)

# # batch_size = 8  # A batch size of 8

# def batch_sampler():
#     indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(train_list)]
#     # print("indices :\n",indices)
#     random.shuffle(indices)
#     pooled_indices = []
#     # create pool of indices with similar lengths 
#     for i in range(0, len(indices), batch_size * 100):
#         pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))

#     # print("pooled_indices :\n",pooled_indices)
#     # p(pooled_indices)
#     pooled_indices = [x[0] for x in pooled_indices]

#     # yield indices for current batch
#     for i in range(0, len(pooled_indices), batch_size):
#         yield pooled_indices[i:i + batch_size]


# train_Batch_iter = DataLoader(train_list, batch_sampler=batch_sampler(),collate_fn=collate_batch)
train_Batch_iter = DataLoader(train_list, batch_size=batch_size, collate_fn=collate_batch)

In [371]:
# for l,d in train_Batch_iter:
#     print(l[0])
#     print(d[1])
    

In [372]:
# counter = 0
# for a,b in enumerate(train_Batch_iter):
#     if counter == 2:
#         break
#     print(a)
#     print(b[0])
#     print(b[1])
#     counter+=1

In [373]:
train_iter, test_iter = IMDB(split=('train', 'test'))
train_list = list(train_iter)
test_list = list(test_iter)

print(train_list.__len__())
print(test_list.__len__())

25000
25000


In [374]:
# training hyperparameters
num_epochs = 5
learning_rate = 0.01
batch_size = 64

# model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(vocab)     # len(german.vocab)
input_size_decoder = len(vocab)     # len(english.vocab)

output_size = len(vocab)            # len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
number_of_layer = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

# TensortBoard
writer = SummaryWriter(f"runs/loss_plot")

step = 0
# train_iterator, validation_iterator, test_iterator = BucketIterator.split(
#     batch_size = batch_size,
#     sort_within_batch = True,
#     sort_key = lambda x:len(x.src),
#     device = device)

# train_iter, test_iter = IMDB(split=('train', 'test'))

# train_iterator = DataLoader(list(train_iter),batch_sampler=batch_sampler(),collate_fn=collate_batch)
# test_iterator = DataLoader(list(test_iter),batch_sampler=batch_sampler(),collate_fn=collate_batch)

# input_size, embedding_size, hidden_size, number_of_layers,dropout
encoder_network = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, number_of_layer, encoder_dropout).to(device)
decoder_network = Decoder(input_size_encoder, decoder_embedding_size, hidden_size, output_size, number_of_layer, decoder_dropout).to(device)

model = Seq2Seq(encoder_network, decoder_network).to(device)
optimizer = optim.Adam(model.parameters(),lr=learning_rate)

pad_idx = vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [375]:
# list(train_iterator)

In [377]:
# have to implement it seprately
# from utils import translate_sentence, blue, save_checkpoint, load_checkpoint
if load_model:
    load_checkpoint(model, optimizer, torch.load('mycheckpoint.pth.ptar'),model,optimizer)

for epoch in range(num_epochs):
    print(f"Epoch [{epoch}/{num_epochs}]")

    checkpoint = {'static_dict' : model.state_dict(), 'optimizer':optimizer.state_dict()}
    save_checkpoint(checkpoint)

    for batch_idx, (lable, text) in enumerate(train_Batch_iter):
        # inp_data = batch.text.to(device)
        # target = batch.labal.to(device)
        inp_data = text
        target = label

        output = model(inp_data, target)
        # output_shape : (trg_len, batch_size, output_dim)

        output = output[1:].reshape(-1,output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output,target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=1)
        optimizer.step()

        writer.add_scalar('Train loss',loss, global_step=step)
        step+=1

Epoch [0/5]
Saving checkpoints --->

pme tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Here you go tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [  15,   15,   53,  ...,   15,   15,   16],
        [1573,  248,   72,  ...,  484,   99,   23],
        ...,
        [   3,    3,    3,  ...,    3,    3,    3],
        [   3,    3,    3,  ...,    3,    3,    3],
        [   3,    3,    3,  ...,    3,    3,    3]])


TypeError: zeros() received an invalid combination of arguments - got (str, Tensor, int), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


## Attention