In [1]:
import random

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.data import Field, BucketIterator, TabularDataset
from torchtext.data import Dataset
from torchtext.data.metrics import bleu_score
from torchtext.datasets import Multi30k

from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint


In [2]:
# Load the spaCy pipelines used for tokenization: German first, then English.
spacy_ger, spacy_eng = (
    spacy.load(model_name)
    for model_name in ("de_core_news_sm", "en_core_web_sm")
)


In [3]:
# Tokenizer helpers, plus a sample sentence to try them on.

# Sample English sentence passed to tok_english in the check further down.
text = "I am a handsome guy"

def tok_english(text):
    """Split ``text`` into a list of token strings using the ``spacy_eng`` tokenizer."""
    tokens = []
    for token in spacy_eng.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tok_german(text):
    """Split ``text`` into a list of token strings using the ``spacy_ger`` tokenizer."""
    tokens = []
    for token in spacy_ger.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [4]:
# Smoke-test both tokenizers on short samples.
# NOTE(review): the sample given to tok_german is French, not German — confirm
# whether a German sentence was intended here.
ger = tok_german("Ceci est un texte d' exemple.")

print(f"eng text is: {tok_english(text)}")
print(f"german text is: {ger}")

eng text is: ['I', 'am', 'a', 'handsome', 'guy']
german text is: ['Ceci', 'est', 'un', 'texte', "d'", 'exemple', '.']


In [5]:
# # Using Field to normalize all the given data: lower-casing plus start-of-sentence and end-of-sentence tokens

# english = Field(tokenize=tok_english, lower=True, init_token="<sos>", eos_token="<eos>")
# german = Field(tokenize=tok_german, lower=True, init_token="<sos>", eos_token="<eos>")

In [6]:
# #loading the data from our datasets into our training part

# english_train_data, = TabularDataset.splits(
#     path="./datasets/train",
#     train='eng.en',
#     format='tsv',  # You can set format to 'csv' or 'tsv', it doesn't matter
#     fields=[('english',english)]
# )
# german_train_data, = TabularDataset.splits(
#     path="./datasets/train/",
#     test='ger.de',  # Specify the English file name for the test dataset
#     format='csv',  # You can set format to 'csv' or 'tsv', it doesn't matter
#     fields=[('german', german)]
# )

# #for testing part
# english_test_data, = TabularDataset.splits(
#     path="./datasets/test/",
#     test='eng.en',  # Specify the English file name for the test dataset
#     format='csv',  # You can set format to 'csv' or 'tsv', it doesn't matter
#     fields=[('english',english)])

# german_test_data, = TabularDataset.splits(
#     path="./datasets/test/",
#     test='ger.de',  # Specify the English file name for the test dataset
#     format='csv',  # You can set format to 'csv' or 'tsv', it doesn't matter
#     fields=[('german', german)]
# )

# #for validation part
# english_validation_data, = TabularDataset.splits(
#     path="./datasets/valid/",
#     test='eng.en',  # Specify the English file name for the test dataset
#     format='csv',  # You can set format to 'csv' or 'tsv', it doesn't matter
#     fields=[('english', english)]
# )
# german_validation_data, = TabularDataset.splits(
#     path="./datasets/valid/",
#     test='ger.de',  # Specify the English file name for the test dataset
#     format='csv',  # You can set format to 'csv' or 'tsv', it doesn't matter
#     fields=[('german', german)]
# )

In [7]:
# #checking out  datas from our datasets
# for i, example in enumerate(german_validation_data):
#     print("german Text:", " ".join(example.german))
#     # print(example.german)
#     if i==3:
#         break

# print("___________________________________________") 
 
# for i, example in enumerate(english_validation_data):
#     print("english Text:", " ".join(example.english))
#     # print(example.english)
#     if i==3:
#         break    


In [8]:
# Read the English and German files of every split, one stripped sentence per line.
def _read_sentences(path):
    """Return each line of the UTF-8 text file at ``path`` with surrounding whitespace stripped."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]


train_english_sentences = _read_sentences('datasets/train/eng.en')
train_german_sentences = _read_sentences('datasets/train/ger.de')

test_english_sentences = _read_sentences('datasets/test/eng.en')
test_german_sentences = _read_sentences('datasets/test/ger.de')

valid_english_sentences = _read_sentences('datasets/valid/eng.en')
valid_german_sentences = _read_sentences('datasets/valid/ger.de')


In [9]:
# Create (german, english) sentence pairs for every split.
def _pair_sentences(german_sentences, english_sentences, split_name):
    """Zip two parallel sentence lists into a list of (german, english) tuples.

    ``zip`` silently stops at the shorter input, which would hide a corpus
    whose two files have drifted out of line-by-line alignment, so the lengths
    are checked first.

    Raises:
        ValueError: if the two lists do not contain the same number of sentences.
    """
    if len(german_sentences) != len(english_sentences):
        raise ValueError(
            f"{split_name} split is not parallel: "
            f"{len(german_sentences)} German vs {len(english_sentences)} English sentences"
        )
    return list(zip(german_sentences, english_sentences))


train_data = _pair_sentences(train_german_sentences, train_english_sentences, "train")
valid_data = _pair_sentences(valid_german_sentences, valid_english_sentences, "valid")
test_data = _pair_sentences(test_german_sentences, test_english_sentences, "test")



In [10]:
# Preview the first five (german, english) training pairs.
train_data[:5]

[('zwei männer betrachten etwas im garten',
  'two young guys with shaggy hair look at their hands while hanging out in the yard .'),
 ('die männer arbeiten an der seilbahn .',
  'several men in hard hats are operating a giant pulley system .'),
 ('ein mädchen im rosa kleid klettert in eine stall',
  'a child in a pink dress is climbing up a set of stairs in an entry way .'),
 ('ein mann auf einer leiter putzt ein fenster',
  'someone in a blue shirt and hat is standing on stair and leaning against a window .'),
 ('ein mann am herd füllt den teller eines zweiten mannes .',
  'two men , one in a gray shirt , one in a black shirt , standing near a stove .')]

In [11]:
# Both languages share the same preprocessing: lower-casing plus <sos>/<eos>
# markers around every sentence. Only the tokenizer differs.
_shared_field_options = dict(lower=True, init_token="<sos>", eos_token="<eos>")

english = Field(tokenize=tok_english, **_shared_field_options)
german = Field(tokenize=tok_german, **_shared_field_options)

In [12]:
# (attribute name, Field) pairs; German comes first, English second.
fields = [
    ("german", german),
    ("english", english),
]


In [13]:

def _to_examples(sentence_pairs):
    """Convert (german, english) string pairs into torchtext Examples using ``fields``."""
    return [
        torchtext.data.Example.fromlist([german_text, english_text], fields)
        for german_text, english_text in sentence_pairs
    ]


# Train split
train_examples = _to_examples(train_data)
train_dataset = torchtext.data.Dataset(train_examples, fields)

# Test split
test_examples = _to_examples(test_data)
test_dataset = torchtext.data.Dataset(test_examples, fields)

# Validation split
valid_examples = _to_examples(valid_data)
valid_dataset = torchtext.data.Dataset(valid_examples, fields)



# # Build vocabulary
# engl.build_vocab(dataset, min_freq=2)
# TRG.build_vocab(dataset, min_freq=2)


In [14]:
# Sanity check: show the tokenized first example of the train and test sets.
eg = train_dataset[0]
eg2 = test_dataset[0]
for sample in (eg, eg2):
    print(f"english:{sample.english}, german:{sample.german}")



english:['two', 'young', 'guys', 'with', 'shaggy', 'hair', 'look', 'at', 'their', 'hands', 'while', 'hanging', 'out', 'in', 'the', 'yard', '.'], german:['zwei', 'männer', 'betrachten', 'etwas', 'im', 'garten']
english:['the', 'man', 'with', 'pierced', 'ears', 'is', 'wearing', 'glasses', 'and', 'an', 'orange', 'hat', '.'], german:['der', 'mann', 'trägt', 'eine', 'orange', 'wollmütze', '.']


In [15]:
# Build both vocabularies from the training split only, with identical limits:
# at most 1000 entries, each seen at least twice.
for language_field in (english, german):
    language_field.build_vocab(train_dataset, max_size=1000, min_freq=2)


In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [17]:
#creating an encoder class

class Encoder(nn.Module):
    """Embeds source token indices and runs them through a multi-layer LSTM.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        hidden_size: size of the LSTM hidden state.
        embedding_size: dimension of each token embedding.
        num_layers: number of stacked LSTM layers.
        d: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, hidden_size, embedding_size, num_layers, d):
        super().__init__()

        # Keep the hyperparameters on the instance for later inspection.
        self.input_size, self.hidden_size = input_size, hidden_size
        self.embedding_size, self.num_layers = embedding_size, num_layers

        self.dropout = nn.Dropout(p=d)
        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=d,
        )

    def forward(self, x):
        """Return the final ``(hidden, cell)`` LSTM state for the index tensor ``x``.

        The per-step LSTM outputs are discarded; only the final state is returned.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

In [18]:
# creating a decoder class

class Decoder(nn.Module):
    """Single-step LSTM decoder: one token per sequence in, vocabulary scores out.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: size of the LSTM hidden state.
        embedding_size: dimension of each token embedding.
        output_size: number of scores produced per token by the final linear layer.
        num_layers: number of stacked LSTM layers.
        d: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, hidden_size, embedding_size, output_size, num_layers, d):
        super().__init__()

        self.hidden_size, self.num_layers = hidden_size, num_layers

        self.dropout = nn.Dropout(p=d)
        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=d,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one time step.

        ``x`` holds one token index per batch element, shape (N). Returns
        ``(predictions, hidden, cell)``, where ``predictions`` has shape
        (N, output_size) and ``hidden``/``cell`` are the updated LSTM state.
        """
        # The LSTM wants a leading sequence axis; a single word has length 1,
        # so (N) becomes (1, N).
        step_input = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        embedded = self.embedding(step_input)
        embedded = self.dropout(embedded)

        # (1, N, embedding_size) -> (1, N, hidden_size), continuing from the given state
        rnn_out, state = self.rnn(embedded, (hidden, cell))
        hidden, cell = state

        # (1, N, hidden_size) -> (1, N, output_size); drop the length-1 sequence
        # axis so the loss function receives (N, output_size).
        predictions = self.fc(rnn_out).squeeze(0)

        return predictions, hidden, cell
    



       

In [19]:
#Now combining our encoder and decoder into seq2seq module

class Seq2seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module whose ``forward(source)`` returns ``(hidden, cell)``.
        decoder: module whose ``forward(x, hidden, cell)`` returns
            ``(predictions, hidden, cell)``.
        target_vocab_size: width of the decoder's prediction vector. When left
            as ``None`` it is read from ``decoder.fc.out_features``.
    """

    def __init__(self, encoder, decoder, target_vocab_size=None):
        super(Seq2seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.target_vocab_size = target_vocab_size

    def _output_size(self):
        """Return the number of scores the decoder emits per token."""
        if self.target_vocab_size is not None:
            return self.target_vocab_size
        return self.decoder.fc.out_features

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Run the encoder once, then decode ``target_len - 1`` steps.

        Args:
            source: source index tensor; dimension 1 is the batch dimension.
            target: target index tensor; dimension 0 is the target length and
                ``target[0]`` is used as the first decoder input.
            teacher_force_ratio: probability, per step, of feeding the true
                target token instead of the decoder's own best guess.

        Returns:
            Tensor of shape (target_len, batch_size, output_size). Row 0 is
            never written and stays zero.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]

        # Size and place the buffer from the model's own inputs rather than from
        # notebook-level globals, so the module works wherever its inputs live.
        outputs = torch.zeros(
            target_len, batch_size, self._output_size(), device=source.device
        )

        # Pass the source through the encoder to get the initial decoder state.
        hidden, cell = self.encoder(source)

        # The first decoder input is the first target row (the <sos> tokens).
        x = target[0]

        for t in range(1, target_len):
            # Feed the current token and the running state into the decoder.
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store the predicted scores for this step.
            outputs[t] = output

            # Highest-scoring token for each batch element.
            best_result = output.argmax(1)

            # Teacher forcing: sometimes feed the ground truth, sometimes the guess.
            x = target[t] if random.random() < teacher_force_ratio else best_result

        return outputs



In [20]:
# Model hyperparameters
load_model = True  # resume from a saved checkpoint when True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocabulary-dependent sizes: the encoder reads German, the decoder reads and emits English.
input_size_encoder = len(german.vocab)
input_size_decoder = output_size = len(english.vocab)

# Embedding widths and dropout rates are the same on both sides.
encoder_embedding_size = decoder_embedding_size = 300
enc_dropout = dec_dropout = 0.5

hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2


In [21]:
# Report the PyTorch version and, when CUDA is usable, the active GPU index.
print(torch.__version__)
if torch.cuda.is_available():
    print(torch.cuda.current_device())
else:
    # torch.cuda.current_device() raises on hosts without CUDA, so report instead.
    print("CUDA not available; running on CPU")

2.1.0
0


In [22]:
# Build the encoder, the decoder and the wrapping seq2seq model on the chosen device.
encoder_net = Encoder(
    input_size_encoder, hidden_size, encoder_embedding_size, num_layers, enc_dropout
).to(device)
# The decoder takes its own dropout rate (dec_dropout), not the encoder's.
decoder_net = Decoder(
    input_size_decoder, hidden_size, decoder_embedding_size, output_size, num_layers, dec_dropout
).to(device)
model = Seq2seq(encoder_net, decoder_net).to(device)

In [23]:
# Show the three module summaries, separated by a blank line.
print(encoder_net, decoder_net, model, sep="\n\n")


Encoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(1004, 300)
  (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
)

Decoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(1004, 300)
  (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=1004, bias=True)
)

Seq2seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(1004, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(1004, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=1004, bias=True)
  )
)


In [24]:
# Training hyperparameters
batch_size = 64        # examples per batch
learning_rate = 1e-3   # step size passed to the optimizer
num_epochs = 2         # full passes over the training data

In [25]:

# Create the batch iterators for the train, validation and test splits.
def _english_length(example):
    """Sort key: number of English tokens in ``example``."""
    return len(example.english)


train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=_english_length,
    device=device,
)


In [26]:
# Loss: cross-entropy that skips positions whose target is the English <pad> index.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Optimizer: Adam over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [27]:
# TensorBoard logging: a writer with default settings, plus the global step
# counter that the training loop increments once per batch.
step = 0
writer = SummaryWriter()

In [32]:
from tqdm.notebook import tqdm


In [33]:
# Optionally resume from the last saved checkpoint.
if load_model:
    try:
        # map_location keeps a checkpoint saved on GPU loadable on a CPU-only host.
        saved_state = torch.load("my_checkpoint.pth.tar", map_location=device)
    except FileNotFoundError:
        # A fresh run has no checkpoint yet; train from scratch instead of crashing.
        print("=> No checkpoint found at my_checkpoint.pth.tar; training from scratch")
    else:
        load_checkpoint(saved_state, model, optimizer)


sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."
for epoch in tqdm(range(num_epochs)):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Translate a fixed sentence before each epoch to eyeball progress.
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and move them to the training device
        inp_data = batch.german.to(device)
        target = batch.english.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but CrossEntropyLoss
        # expects (N, classes) scores and (N) targets, so both are flattened over
        # the time and batch dimensions. Row 0 (the start-token position, which the
        # model never predicts) is dropped at the same time.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard; log a plain float rather than the graph-attached tensor
        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1

    # Save after the epoch's updates so the checkpoint always holds the latest
    # weights, including those of the final epoch.
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)

# Make sure every logged scalar reaches disk.
writer.flush()

# Evaluate with dropout disabled.
model.eval()
# NOTE(review): the slice [1:100] skips the first test pair, and test_data holds raw
# (german, english) string tuples — confirm both match what utils.bleu expects.
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")

  0%|          | 0/2 [00:00<?, ?it/s]

[Epoch 0 / 2]
=> Saving checkpoint
cuda
Translated example sentence: 
 ['kicking', 'goggles', 'goggles', 'goggles', 'goggles', 'goggles', 'about', 'lined', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles', 'that', 'goggles', 'goggles', 'goggles']
[Epoch 1 / 2]
=> Saving checkpoint
cuda
Translated example sentence: 
 ['a', 'man', 'in', 'a', 'black', 'shirt', 'and', 'a', 'a', '<unk>', '<unk>', 'a', '<unk>', '<unk>', '.', '<eos>']


In [None]:
# Testing our model: restore the saved weights, then translate one sentence.
# The loaded checkpoint has to be applied to the model; calling torch.load on its
# own only reads the file. map_location keeps this working on a CPU-only host.
load_checkpoint(
    torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
)
test_sentence = "sie ist grausam"
model.eval()
translated_sentence = translate_sentence(
    model, test_sentence, german, english, device, max_length=50
)

print(f"Translated example sentence: \n {translated_sentence}")

In [None]:
# Calculating the BLEU score of the model on tokenized test examples.
model.eval()  # dropout must be off while scoring

targets = []
outputs = []
# NOTE(review): the slice [1:100] skips the first test example — confirm that is intended.
for example in tqdm(test_dataset.examples[1:100]):
    src = example.german
    trg = example.english

    prediction = translate_sentence(model, src, german, english, device)
    # A finished translation ends with the end-of-sentence marker, which is not
    # part of the reference and must not be scored as a word.
    if prediction and prediction[-1] == english.eos_token:
        prediction = prediction[:-1]

    # bleu_score takes a list of references per candidate; there is one each here.
    targets.append([trg])
    outputs.append(prediction)

print(bleu_score(outputs, targets))
