# Build data pipeline

# Seq to Seq

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import random
import io
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

## Load data

In [2]:
from torchtext.utils import extract_archive
import os

# Directory holding the raw Multi30k task-1 archives.  The location can be
# overridden with the MULTI30K_RAW_DIR environment variable so the notebook is
# not tied to one machine; the default is the original author's local path.
path = os.environ.get(
    'MULTI30K_RAW_DIR',
    '/home/sharma/Desktop/DeepLearning/Testing/Datasets/multi30k-dataset/data/task1/raw/',
)
# Guarantee a trailing separator so code that concatenates `path + file` keeps working.
path = os.path.join(path, '')

# (German archive, English archive) for each split.
train_files = ('train.de.gz', 'train.en.gz')
val_files = ('val.de.gz', 'val.en.gz')
test_files = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')


def _extract_pair(files):
    """Extract each archive in `files` and return the first extracted path of each."""
    return [extract_archive(os.path.join(path, name))[0] for name in files]


# Index 0 is the German file, index 1 the English file.
train_filepaths = _extract_pair(train_files)
val_filepaths = _extract_pair(val_files)
test_filepaths = _extract_pair(test_files)

In [3]:
from torch.nn.utils.rnn import pad_sequence # for padding batch
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
import io # for io.open
import torch

def build_vocab(filepath, tokenizer, min_freq):
    """Build a torchtext ``Vocab`` from a text file with one sentence per line.

    Each line is stripped and lower-cased before tokenization so that the
    vocabulary matches the lower-cased lookups done in
    ``TextDataset.textnumericalizer`` and so the trailing newline is not
    counted as a token.

    Args:
        filepath: Path of the UTF-8 text file to read.
        tokenizer: Callable mapping a string to a list of token strings.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        A ``Vocab`` whose first four entries are the specials
        ``<UNK>``, ``<PAD>``, ``<BOS>``, ``<EOS>`` (in that order).
    """
    counter = Counter()
    # Explicit encoding: the German text contains non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with io.open(filepath, encoding='utf-8') as f:
        for line in f:
            counter.update(tokenizer(line.strip().lower()))
    return Vocab(counter, specials=['<UNK>', '<PAD>', '<BOS>', '<EOS>'], min_freq=min_freq)



class TextDataset(Dataset):
    """Dataset of numericalized sentences read from a one-sentence-per-line file.

    Args:
        pathfilename: Path of the text file to load.
        Vocabulary: Vocabulary object exposing a ``stoi`` mapping and
            ``__getitem__`` for token -> index lookup.
        tokenizer: Callable mapping a string to a list of token strings.
        freq_threshold: Accepted for backward compatibility; not used.
    """

    def __init__(self, pathfilename, Vocabulary, tokenizer, freq_threshold=5):
        # Record the file this dataset was actually built from (previously this
        # read an unrelated module-level `path` variable).
        self.path = pathfilename
        self.tokenizer = tokenizer
        self.vocab = Vocabulary
        # Read line by line inside a context manager: the handle is closed, and
        # a trailing newline no longer produces a phantom empty last sentence.
        # Blank lines inside the file are kept so parallel files stay aligned.
        with open(pathfilename, 'r', encoding='utf-8') as handle:
            self.txt = [line.rstrip('\n') for line in handle]

    def __len__(self):
        """Return the number of sentences (lines) in the file."""
        return len(self.txt)

    def textnumericalizer(self, text):
        """Lower-case and tokenize `text`, returning the list of token indices.

        Tokens missing from the vocabulary map to the ``<UNK>`` index (0 when
        the vocabulary has no ``<UNK>`` entry).  The vocabulary is only read,
        never modified.
        """
        stoi = self.vocab.stoi
        unk_idx = stoi['<UNK>'] if '<UNK>' in stoi else 0
        return [stoi[tok] if tok in stoi else unk_idx
                for tok in self.tokenizer(text.lower())]

    # get a numericalized and formatted sentence
    # as "es ist ein Ei" -> tensor([2, 439, 72, 16, 0, 3])
    def __getitem__(self, batch_idx):
        """Return sentence `batch_idx` as a 1-D index tensor: <BOS> tokens... <EOS>."""
        sentence = self.txt[batch_idx]
        sen_format = [self.vocab['<BOS>']]
        sen_format += self.textnumericalizer(sentence)
        sen_format.append(self.vocab['<EOS>'])

        return torch.tensor(sen_format)
    
class MyCollate:
    """Callable ``collate_fn`` that pads a batch of 1-D tensors to equal length.

    Args:
        pad_idx: Value used to fill the padded positions.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Stack the samples in `batch` into one padded tensor.

        ``batch_first=False`` means the result is laid out as
        (longest sequence length, number of samples).
        """
        sequences = list(batch)
        return pad_sequence(sequences, padding_value=self.pad_idx, batch_first=False)
    
# spaCy-backed tokenizers, one per language: German ('de_core_news_sm') and
# English ('en_core_web_sm').  Both spaCy models must already be installed.
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [12]:
# Vocabularies.  Only the *training* vocabularies are used to numericalize
# data: the model's embedding/output sizes are derived from them, and building
# indices from validation/test text would leak held-out data into the model's
# input space.  The val/test vocabularies are still built so that any code
# referring to these names keeps working.
de_train_voc = build_vocab(train_filepaths[0], de_tokenizer, 2)
en_train_voc = build_vocab(train_filepaths[1], en_tokenizer, 2)
de_val_voc = build_vocab(val_filepaths[0], de_tokenizer, 2)
en_val_voc = build_vocab(val_filepaths[1], en_tokenizer, 2)
de_test_voc = build_vocab(test_filepaths[0], de_tokenizer, 2)
en_test_voc = build_vocab(test_filepaths[1], en_tokenizer, 2)

# Every split is indexed with the training vocabulary of its language
# (previously all splits used the *test* vocabularies).
train_de_dataset = TextDataset(train_filepaths[0], de_train_voc, de_tokenizer)
train_en_dataset = TextDataset(train_filepaths[1], en_train_voc, en_tokenizer)
val_de_dataset = TextDataset(val_filepaths[0], de_train_voc, de_tokenizer)
val_en_dataset = TextDataset(val_filepaths[1], en_train_voc, en_tokenizer)
test_de_dataset = TextDataset(test_filepaths[0], de_train_voc, de_tokenizer)
test_en_dataset = TextDataset(test_filepaths[1], en_train_voc, en_tokenizer)

# The German and English loaders are consumed in parallel, so sentence i of
# one language must be the translation of sentence i of the other.
assert len(train_de_dataset) == len(train_en_dataset), "train files are not parallel"
assert len(val_de_dataset) == len(val_en_dataset), "val files are not parallel"
assert len(test_de_dataset) == len(test_en_dataset), "test files are not parallel"

batch_size = 64
# Both vocabularies are built with the same specials order, so '<PAD>' has the
# same index in either language.
pad_idx = de_train_voc.stoi['<PAD>']
shuffle_seed = 1234


def _make_loader(dataset, seed=None):
    """Create a padded-batch DataLoader for `dataset`.

    With `seed=None` the loader iterates in file order.  With an integer seed
    it shuffles using its own ``torch.Generator`` seeded with that value, so
    two loaders over equally long datasets created with the same seed draw the
    same permutations - provided they are always iterated together (e.g. via
    ``zip``).  Iterating only one of them puts the pair out of step.
    """
    generator = None
    if seed is not None:
        generator = torch.Generator()
        generator.manual_seed(seed)
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=seed is not None,
        generator=generator,
        # Pinning only helps host->GPU copies; avoid the warning on CPU-only machines.
        pin_memory=torch.cuda.is_available(),
        collate_fn=MyCollate(pad_idx=pad_idx),
    )


# Training: shuffled, but with identical permutations for both languages so
# that source and target batches stay aligned (independent shuffling paired
# each German sentence with an unrelated English one).
train_en_loader = _make_loader(train_en_dataset, seed=shuffle_seed)
train_de_loader = _make_loader(train_de_dataset, seed=shuffle_seed)

# Validation / test: fixed file order, which keeps the pairs aligned and the
# evaluation reproducible.
val_en_loader = _make_loader(val_en_dataset)
val_de_loader = _make_loader(val_de_dataset)

text_en_loader = _make_loader(test_en_dataset)
text_de_loader = _make_loader(test_de_dataset)
# `text_*_loader` are the original (likely misspelled) names; expose the
# intended `test_*_loader` names as aliases.
test_en_loader = text_en_loader
test_de_loader = text_de_loader

## model

In [113]:
class Encoder(nn.Module):
    """Embedding layer followed by a multi-layer LSTM.

    Args:
        num_batch: Not used by the module.
        seq_len: Number of rows of the embedding table (passed to
            ``nn.Embedding`` as ``num_embeddings``).
        embed_dim: Size of each embedding vector.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, num_batch, seq_len, embed_dim, hidden_size, num_layers, p):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=seq_len, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
            batch_first=False,
        )
        self.drop = nn.Dropout(p)

    def forward(self, x):
        """Encode a batch of index sequences.

        Args:
            x: Integer index tensor laid out sequence-first, i.e. (S, N).

        Returns:
            Tuple ``(outputs, hidden, cell)``: the per-step LSTM outputs
            (S, N, hidden_size) and the final hidden and cell states, each
            (num_layers, N, hidden_size).
        """
        embedded = self.embed(x)
        embedded = self.drop(embedded)  # (S, N, embed_dim)
        outputs, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return outputs, hidden, cell
                
    
    
class Decoder(nn.Module):
    """Single-step decoder: embedding -> multi-layer LSTM -> linear projection.

    Args:
        num_batch: Not used by the module.
        seq_len: Number of rows of the embedding table (passed to
            ``nn.Embedding`` as ``num_embeddings``).
        embed_dim: Size of each embedding vector.
        hidden_size: Size of the LSTM hidden state.
        num_class: Output size of the final linear layer.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, num_batch, seq_len, embed_dim, hidden_size, num_class, num_layers, p):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=seq_len, embedding_dim=embed_dim)
        self.drop = nn.Dropout(p)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
            batch_first=False,
        )
        self.linear = nn.Linear(in_features=hidden_size, out_features=num_class)

    def forward(self, x, h0, c0):
        """Run one decoding step.

        Args:
            x: Index tensor with one token per batch element, shape (N,).
            h0: LSTM hidden state to start from.
            c0: LSTM cell state to start from.

        Returns:
            Tuple ``(scores, hidden, cell)``: the projected LSTM output for this
            step, shape (1, N, num_class), and the updated hidden/cell states.
        """
        # Add the length-1 sequence axis the LSTM expects: (N,) -> (1, N).
        step_input = x.unsqueeze(0)
        embedded = self.drop(self.embed(step_input))  # (1, N, embed_dim)
        lstm_out, (hidden, cell) = self.lstm(embedded, (h0, c0))
        scores = self.linear(lstm_out)
        return scores, hidden, cell
        
        
        
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: Module whose ``forward(src)`` returns
            ``(outputs, hidden, cell)``.
        decoder: Module whose ``forward(x, hidden, cell)`` returns
            ``(scores, hidden, cell)`` with ``scores`` of shape
            (1, N, num_class), and which exposes its output layer as
            ``decoder.linear``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_force_ratio=0.5):
        """Return per-step target scores for a batch.

        Args:
            src: Source index tensor, shape (src_len, N).
            tgt: Target index tensor, shape (tgt_len, N); ``tgt[0]`` is used as
                the first decoder input.
            teacher_force_ratio: Probability, per step, of feeding the ground
                truth token instead of the model's own prediction as the next
                decoder input.  Pass 0 for pure greedy decoding.

        Returns:
            Tensor of shape (tgt_len, N, num_class).  Row 0 is left at zero
            (nothing is predicted for the start token); rows 1.. hold the
            decoder scores and carry gradients back to the model.
        """
        num_batch = src.shape[1]
        tgt_len = tgt.shape[0]
        # Read the output size from the decoder itself rather than from a
        # notebook-level vocabulary variable.
        tgt_vocab_size = self.decoder.linear.out_features

        # Plain buffer on the same device as the input.  It must NOT be created
        # with requires_grad=True: gradients flow through the values written
        # into it below, and a grad-requiring leaf cannot be written in place.
        pred = torch.zeros(tgt_len, num_batch, tgt_vocab_size, device=src.device)

        _, hidden, cell = self.encoder(src)

        # grab start token
        x = tgt[0]

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            # (1, N, num_class) -> (N, num_class); squeeze only the step axis so
            # a batch of size 1 keeps its batch dimension.
            step_scores = output.squeeze(0)

            # Store this step's scores (previously omitted, which made the
            # method return all zeros).
            pred[t] = step_scores

            best_pred = step_scores.argmax(dim=1)
            x = tgt[t] if random.random() < teacher_force_ratio else best_pred

        return pred

## Training

In [114]:
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

# Training hyperparameters
num_epoch = 20
lr = 0.001
# batch_size (64) is defined in the data-pipeline cell.

# Model hyperparameters
load_model = False  # not read anywhere in this cell
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Despite the `seq_len_*` names these are vocabulary sizes: they are passed to
# the Encoder/Decoder as the number of rows of their embedding tables.
seq_len_encoder = len(de_train_voc)
seq_len_decoder = len(en_train_voc)
output_size = len(en_train_voc)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_drop = 0.5
dec_drop = 0.5

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0


my_encoder = Encoder(batch_size, seq_len_encoder, encoder_embedding_size, hidden_size, num_layers, enc_drop)
my_decoder = Decoder(batch_size, seq_len_decoder, decoder_embedding_size, hidden_size,
                     output_size, num_layers, dec_drop)

# Move the model to the selected device *before* building the optimizer, so
# the optimizer is created over the parameters in their final location.
model = Seq2Seq(my_encoder, my_decoder).to(device)

optimizer = optim.Adam(model.parameters(), lr)
# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Example German sentence for a qualitative translation check.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."


In [117]:
# Make sure the model lives on the training device (a no-op if it already does).
model.to(device)

for epoch in range(num_epoch):

    print(f"[Epoch {epoch} / {num_epoch}]")

    model.train()
    # NOTE: zipping the two loaders assumes they yield batches in the same
    # sentence order, i.e. batch k of the German loader is the translation of
    # batch k of the English loader.
    for source_sentence, target_sentence in zip(train_de_loader, train_en_loader):
        # Batches come from the DataLoader on the CPU; move them next to the model.
        source_sentence = source_sentence.to(device)
        target_sentence = target_sentence.to(device)
        assert source_sentence.shape[1] == target_sentence.shape[1], \
            "source and target batches have different sizes"

        # Forward
        score = model(source_sentence, target_sentence)
        # Drop position 0 (the start token, never predicted) and flatten
        # (len, N, classes) -> (len * N, classes) for CrossEntropyLoss.
        score = score[1:].reshape(-1, score.shape[2])
        target_sentence = target_sentence[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(score, target_sentence)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard (log a plain float, not the graph-attached tensor)
        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1

[Epoch 0 / 20]


KeyboardInterrupt: 

## Translation (Prediction)

+ tokenizer
"Es ist ein Er." -> "es ist ein er ."
+ numericalizer and format (prepend `<BOS>` = 2, append `<EOS>` = 3)
"es ist ein er ." -> [2, 23, 345, 456, 789, 5, 3]
+ put into trained model

In [None]:
import spacy
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab


def translation(model, sentence, src_vocab=None, tgt_vocab=None, tokenizer=None, max_length=50):
    """Greedily translate one German sentence with a trained ``Seq2Seq`` model.

    The sentence is lower-cased, tokenized, mapped to indices with the SAME
    source vocabulary the model was trained with (building a new vocabulary
    from the input sentence would give indices that mean nothing to the
    model's embedding table), wrapped in <BOS>/<EOS>, encoded once, and then
    decoded token by token until <EOS> or `max_length` tokens.

    Args:
        model: Trained model exposing ``encoder`` and ``decoder`` sub-modules.
        sentence: Raw source sentence (string).
        src_vocab: Source vocabulary with ``stoi``; defaults to the notebook's
            ``de_train_voc``.
        tgt_vocab: Target vocabulary with ``stoi``/``itos``; defaults to the
            notebook's ``en_train_voc``.
        tokenizer: Callable string -> list of tokens; defaults to the
            notebook's ``de_tokenizer``.
        max_length: Maximum number of tokens to generate.

    Returns:
        List of generated target tokens, without <BOS> and without <EOS>.
    """
    src_vocab = de_train_voc if src_vocab is None else src_vocab
    tgt_vocab = en_train_voc if tgt_vocab is None else tgt_vocab
    tokenizer = de_tokenizer if tokenizer is None else tokenizer

    # Run on whatever device the model's parameters are on.
    device = next(model.parameters()).device

    src_stoi = src_vocab.stoi
    unk_idx = src_stoi['<UNK>'] if '<UNK>' in src_stoi else 0
    indices = [src_stoi['<BOS>']]
    indices += [src_stoi[tok] if tok in src_stoi else unk_idx
                for tok in tokenizer(sentence.lower())]
    indices.append(src_stoi['<EOS>'])
    # (S,) -> (S, 1): sequence-first layout with a batch of one.
    src = torch.tensor(indices, dtype=torch.long, device=device).unsqueeze(1)

    bos_idx = tgt_vocab.stoi['<BOS>']
    eos_idx = tgt_vocab.stoi['<EOS>']

    was_training = model.training
    model.eval()  # disable dropout while translating
    generated = []
    try:
        with torch.no_grad():
            _, hidden, cell = model.encoder(src)
            prev = torch.tensor([bos_idx], dtype=torch.long, device=device)
            for _ in range(max_length):
                scores, hidden, cell = model.decoder(prev, hidden, cell)
                best = scores.argmax(dim=2).item()  # scores: (1, 1, num_class)
                if best == eos_idx:
                    break
                generated.append(best)
                prev = torch.tensor([best], dtype=torch.long, device=device)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    return [tgt_vocab.itos[idx] for idx in generated]


In [121]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate a German sentence with an encoder/decoder model.

    The source is encoded once with ``model.encoder`` and the target is
    generated one token at a time with ``model.decoder``, always feeding back
    the model's own previous prediction.  (Calling ``model(src, trg)`` for
    decoding does not work with the ``Seq2Seq`` in this notebook: its output
    row for the start position is never predicted and its forward pass applies
    random teacher forcing.)

    Args:
        model: Module exposing ``encoder(src) -> (outputs, hidden, cell)`` and
            ``decoder(x, hidden, cell) -> (scores, hidden, cell)``.
        sentence: Raw string (tokenized with spaCy's German model) or an
            already tokenized list of strings.
        german: Source-side object with ``vocab.stoi``, ``init_token`` and
            ``eos_token`` (optionally ``unk_token``).
        english: Target-side object with ``vocab.stoi`` and ``vocab.itos``
            (optionally ``init_token`` / ``eos_token``; "<sos>" / "<eos>" are
            used when they are absent).
        device: Device on which the input tensors are created.
        max_length: Maximum number of tokens to generate.

    Returns:
        List of generated target tokens without the start token.  The end
        token is included when it was generated.
    """
    # Create tokens and put everything in lower case (which is what our vocab is).
    if isinstance(sentence, str):
        # Load the German spaCy model only when raw text has to be tokenized.
        spacy_ger = spacy.load("de_core_news_sm")
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add start and end tokens
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Convert each German token to an index; unknown tokens map to the unknown
    # index without inserting anything into the vocabulary.
    src_stoi = german.vocab.stoi
    unk_token = getattr(german, "unk_token", None) or "<unk>"
    unk_index = src_stoi.get(unk_token, 0)
    text_to_indices = [src_stoi[token] if token in src_stoi else unk_index
                       for token in tokens]

    # (S,) -> (S, 1): sequence-first layout with a batch of one.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    tgt_stoi = english.vocab.stoi
    sos_index = tgt_stoi[getattr(english, "init_token", None) or "<sos>"]
    eos_index = tgt_stoi[getattr(english, "eos_token", None) or "<eos>"]

    outputs = [sos_index]
    was_training = model.training
    model.eval()  # disable dropout while translating
    try:
        with torch.no_grad():
            _, hidden, cell = model.encoder(sentence_tensor)
            for _ in range(max_length):
                previous_word = torch.LongTensor([outputs[-1]]).to(device)
                output, hidden, cell = model.decoder(previous_word, hidden, cell)
                best_guess = output.argmax(2).item()  # output: (1, 1, num_class)
                outputs.append(best_guess)

                if best_guess == eos_index:
                    break
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]


    
# NOTE(review): `german` and `english` are not defined in any cell visible in
# this notebook, so this call fails with the NameError recorded in the output
# below.  translate_sentence reads `.vocab` from both arguments (and
# `.init_token` / `.eos_token` from the source side), so objects exposing those
# attributes must be created before this line can run - TODO.
translate_sentence(model, sentence, german, english, device, max_length=50)

NameError: name 'german' is not defined