<a href="https://colab.research.google.com/github/AnujKrishnaPhuyal/NLP/blob/main/Seq2seq_language_translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Using cached torchtext-0.6.0-py3-none-any.whl (64 kB)
Collecting sentencepiece (from torchtext==0.6.0)
  Using cached sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.16.0
    Uninstalling torchtext-0.16.0:
      Successfully uninstalled torchtext-0.16.0
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset
import spacy
import random
import numpy as np
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torchtext.data import Dataset
from torchtext.data.metrics import bleu_score



In [None]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm


2024-01-16 14:51:40.793596: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-16 14:51:40.793656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-16 14:51:40.797453: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting de-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.6.0/de_core_news_sm-3.6.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m87.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully insta

In [None]:
# Load spaCy models for German and English
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

In [None]:
# making tokenizer function

text = "I am a handsome guy"

def tok_english(text):
     return [tok.text for tok in spacy_eng.tokenizer(text)]

def tok_german(text):
     return [tok.text for tok in spacy_ger.tokenizer(text)]

In [None]:
# Read English and German files
with open('train/eng.en', 'r', encoding='utf-8') as f:
    train_english_sentences = [line.strip() for line in f.readlines()]

with open('train/ger.de', 'r', encoding='utf-8') as f:
    train_german_sentences = [line.strip() for line in f.readlines()]

In [None]:
train_data = list(zip(train_german_sentences,train_english_sentences))


In [None]:
train_data

[('zwei männer betrachten etwas im garten',
  'two young guys with shaggy hair look at their hands while hanging out in the yard .'),
 ('die männer arbeiten an der seilbahn .',
  'several men in hard hats are operating a giant pulley system .'),
 ('ein mädchen im rosa kleid klettert in eine stall',
  'a child in a pink dress is climbing up a set of stairs in an entry way .'),
 ('ein mann auf einer leiter putzt ein fenster',
  'someone in a blue shirt and hat is standing on stair and leaning against a window .'),
 ('ein mann am herd füllt den teller eines zweiten mannes .',
  'two men , one in a gray shirt , one in a black shirt , standing near a stove .'),
 ('ein maskenbildner bearbeitet das kostüm eines gitarristen .',
  'two people in the photo are playing the guitar and the other is poking at him .'),
 ('ein junger mann hält eine groeinße plüschfigur .',
  'a man sits in a chair while holding a large stuffed animal of a lion .'),
 ('eine frau im blauen shirt telefoniert beim rollsch

In [None]:
#using field for structuring all the given datas into lower case, star of sentences and end of sentences

english = Field(tokenize=tok_english, lower=True, init_token="<sos>", eos_token="<eos>")
german = Field(tokenize=tok_german, lower=True, init_token="<sos>", eos_token="<eos>")

In [None]:
fields=[('german',german),('english',english)]


In [None]:

# Create train datasets
train_examples = [torchtext.data.Example.fromlist([pair[0], pair[1]], fields) for pair in train_data]
train_dataset = torchtext.data.Dataset(train_examples, fields)


In [None]:
#checking our datasets
eg= train_dataset[0]
print(f"english:{eg.english}, german:{eg.german}")


english:['two', 'young', 'guys', 'with', 'shaggy', 'hair', 'look', 'at', 'their', 'hands', 'while', 'hanging', 'out', 'in', 'the', 'yard', '.'], german:['zwei', 'männer', 'betrachten', 'etwas', 'im', 'garten']


In [None]:
#Building vocab for our training datas

english.build_vocab(train_dataset, max_size= 10000, min_freq=2)
german.build_vocab(train_dataset, max_size= 10000, min_freq=2)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
#creating an encoder class

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, num_layers, d):
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size= embedding_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(d)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=d)

    def forward(self,x):
        embedding = self.dropout(self.embedding(x))
        output,(hidden,cell) = self.rnn(embedding)
        return hidden,cell

In [None]:
# creating a decoder class

class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, output_size, num_layers,d):
        super(Decoder, self).__init__()

        self.hidden_size=hidden_size
        self.num_layers=num_layers
        self.dropout = nn.Dropout(d)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=d)
        self.fc = nn.Linear(hidden_size, output_size)


    def forward(self, x, hidden, cell):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)

        # predictions shape: (1, N, length_target_vocabulary) to send it to
        # loss function we want it to be (N, length_target_vocabulary) so we're
        # just gonna remove the first dim
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell






In [None]:
#Now combining our encoder and decoder into seq2seq module

class Seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2seq, self).__init__()

        self.encoder=encoder
        self.decoder=decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        #passing source through encoder
        hidden, cell = self.encoder(source)

     #After this grab the first <sos> of the target
        x= target[0]

        for t in range(1, target_len):
            #Now feed target and the data from hidden and cell  into the decoder
            output, hidden, cell = self.decoder(x,hidden, cell)

            #storing the predicted values
            outputs[t]= output

            #now selecting the best guess among the result
            best_result = output.argmax(1)

            #using the teacher learning ratio for the best and smooth predictions
            x = target[t] if random.random() < teacher_force_ratio else best_result

        return outputs



In [None]:
# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5


In [None]:
encoder_net = Encoder(input_size_encoder,hidden_size, encoder_embedding_size, num_layers,enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder,hidden_size, decoder_embedding_size,output_size, num_layers,enc_dropout).to(device)
model = Seq2seq(encoder_net, decoder_net).to(device)

In [None]:
print(encoder_net)
print("")
print(decoder_net)
print("")
print(model)


Encoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(5714, 300)
  (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
)

Decoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(7294, 300)
  (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=7294, bias=True)
)

Seq2seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5714, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7294, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=7294, bias=True)
  )
)


In [None]:
# Training hyperparameters
num_epochs = 20
learning_rate = 0.001
batch_size = 64

In [None]:

#creating our iterator for taining
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_dataset, train_dataset, train_dataset),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.english),
    device=device,
)


In [None]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)


In [None]:

writer= SummaryWriter()
step=0

In [None]:
from tqdm.notebook import tqdm


In [None]:
if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)


sentence = "Ein dicker Mann stürzte die Klippe hinunter"
for epoch in tqdm(range(num_epochs)):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if epoch%9==0:
      checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict(),"epoch":epoch}
      print("===> saving model")
      torch.save(checkpoint,"/content/my_checkpoint.pth.tar")
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.german.to(device)
        target = batch.english.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1



# score = bleu(train_dataset[1:100], model, german, english, device)
# print(f"Bleu score {score*100:.2f}")

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch 0 / 20]
===> saving model
Translated example sentence: 
 ['a', 'man', 'in', 'a', 'black', 'jacket', 'stands', 'on', 'deep', 'snow', 'covered', 'the', 'ocean', '.', '<eos>']
[Epoch 1 / 20]
Translated example sentence: 
 ['a', 'man', 'is', 'a', 'black', 'hat', 'and', 'black', 'pants', 'is', 'standing', 'on', 'a', 'rock', '.', '<eos>']
[Epoch 2 / 20]
Translated example sentence: 
 ['a', 'man', 'in', 'a', 'black', 'jacket', 'is', 'standing', 'on', 'a', 'rock', 'looking', 'at', 'the', 'ocean', '.', '<eos>']
[Epoch 3 / 20]
Translated example sentence: 
 ['a', 'climber', 'is', 'a', 'red', 'jacket', 'stands', 'on', 'deep', 'snow', '.', '<eos>']
[Epoch 4 / 20]
Translated example sentence: 
 ['a', 'man', 'in', 'a', 'black', 'jacket', 'stands', 'on', 'deep', 'snow', '.', '<eos>']
[Epoch 5 / 20]
Translated example sentence: 
 ['a', 'man', 'in', 'a', 'black', 'jacket', 'and', 'black', 'pants', 'is', 'on', 'a', 'rocky', '.', '<eos>']
[Epoch 6 / 20]
Translated example sentence: 
 ['a', 'man', 

LOADING THE MODEL FOR THE TESTIN PURPOSE AND CHECKING THE BLEU SCORE

In [None]:
torch.load("/content/my_checkpoint.pth.tar")
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch']

CODE TO LOAD THE CHECKPOINT AND CONTINUE TRAINING THE *MODEL*

In [None]:

# sentence = "Gordan Ramsay ist der Spitzenkoch auf der ganzen Welt"
for epoch in tqdm(range(start_epoch, 30)):
    print(f"[Epoch {epoch} / 30]")

    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict(),"epoch":epoch}
    print("===> saving model")
    torch.save(checkpoint,"/content/my_checkpoint.pth.tar")
    print(device)
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.german.to(device)
        target = batch.english.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

  0%|          | 0/12 [00:00<?, ?it/s]

[Epoch 18 / 30]
===> saving model
cuda
Translated example sentence: 
 ['an', 'emergency', 'man', 'in', 'a', 'ivory', 'colored', 'coat', 'is', 'leaning', 'against', 'a', '<unk>', 'and', 'cleaning', 'a', 'drink', '.', '<eos>']
[Epoch 19 / 30]
===> saving model
cuda
Translated example sentence: 
 ['an', 'emergency', 'crew', 'caring', 'for', '<unk>', 'out', 'of', 'the', 'podium', '.', '<eos>']
[Epoch 20 / 30]
===> saving model
cuda
Translated example sentence: 
 ['a', 'man', 'in', 'a', 'black', 'shirt', 'places', 'a', 'piece', 'of', 'equipment', '.', '<eos>']
[Epoch 21 / 30]
===> saving model
cuda
Translated example sentence: 
 ['a', 'emergency', 'worker', 'is', 'a', 'moment', 'in', 'wipe', 'his', 'face', '.', '<eos>']
[Epoch 22 / 30]
===> saving model
cuda
Translated example sentence: 
 ['a', 'man', 'in', 'a', 'black', 'shirt', 'and', 'black', 'hat', 'is', 'holding', 'up', 'a', 'piece', 'of', 'paper', '.', '<eos>']
[Epoch 23 / 30]
===> saving model
cuda
Translated example sentence: 
 ['an

TESTING OUR MODEL WORKING

In [None]:
torch.load("/content/my_checkpoint.pth.tar")
test_sentence = "sie ist grausam"
model.eval()
translated_sentence = translate_sentence(
    model, test_sentence, german, english, device, max_length=50
)

print(f"Translated example sentence: \n {translated_sentence}")

Translated example sentence: 
 ['near', 'a', '<unk>', 'wave', '.', '<eos>']


BLEU SCORE


In [None]:
targets = []
outputs = []
for  example in tqdm(train_dataset.examples[1:100]):

      src = example.german
      trg = example.english
      # print(src)
      prediction = translate_sentence(model, src, german, english, device)
      # print(prediction[:-1])
      targets.append([trg])
      outputs.append(prediction)
print(bleu_score(outputs, targets))


  0%|          | 0/99 [00:00<?, ?it/s]

0.0586862635484308
