<a href="https://colab.research.google.com/github/Bhavya-2k03/Machine_Translation/blob/main/MachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Run on the first CUDA GPU when torch reports one is available; otherwise use the CPU.
DEVICE=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

device(type='cuda', index=0)

In [None]:
!wget https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/datasets/eng_ita_v2.txt

--2023-12-19 17:05:34--  https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/datasets/eng_ita_v2.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7240475 (6.9M) [text/plain]
Saving to: ‘eng_ita_v2.txt’


2023-12-19 17:05:34 (162 MB/s) - ‘eng_ita_v2.txt’ saved [7240475/7240475]



In [None]:
import numpy as np
def read_data(file_path):
    """Read a UTF-8 text file of ``source -> target`` lines.

    The whole file is read, surrounding whitespace is stripped, and the text
    is split on newlines. Each line is then split on the literal ``' -> '``
    separator.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by splitting that line on ``' -> '``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()

    pairs = []
    for raw_line in content.strip().split('\n'):
        pairs.append(raw_line.split(' -> '))
    return pairs

In [None]:
# Load every sentence pair from the downloaded file; the trailing expression
# displays how many pairs were read.
file_path = 'eng_ita_v2.txt'
pairs = read_data(file_path)
len(pairs)

120746

In [None]:
def tokenize(sentence):
    """Lower-case ``sentence`` and split it on runs of whitespace.

    Returns:
        The list of lower-cased tokens (empty for an empty or blank string).
    """
    lowered = sentence.lower()
    return lowered.split()

def build_vocab(pairs):
    """Collect the distinct tokens of each language.

    Args:
        pairs: Iterable of two-item entries ``(english_sentence, italian_sentence)``.

    Returns:
        A tuple ``(english_tokens, italian_tokens)`` of sets, each holding the
        tokens produced by ``tokenize`` over that side of every pair.
    """
    english_words, italian_words = set(), set()
    for english_sentence, italian_sentence in pairs:
        english_words |= set(tokenize(english_sentence))
        italian_words |= set(tokenize(italian_sentence))
    return english_words, italian_words

# Distinct lower-cased tokens of each language across all sentence pairs.
english_vocab, italian_vocab = build_vocab(pairs)

# Provisional word -> id tables, numbered in set-iteration order and without any
# special tokens.
# NOTE(review): these four names are reassigned by the create_mappings(...) calls
# in a later cell, so the tables built here appear to be unused — confirm before
# removing them.
eng_word2int = {word: i for i, word in enumerate(english_vocab)}
ita_word2int = {word: i for i, word in enumerate(italian_vocab)}

# Inverse (id -> word) views of the two tables above.
eng_int2word = {i: word for word, i in eng_word2int.items()}
ita_int2word = {i: word for word, i in ita_word2int.items()}

print('English vocabulary size:', len(english_vocab))
print('Italian vocabulary size:', len(italian_vocab))

English vocabulary size: 4997
Italian vocabulary size: 13673


In [None]:
# Special vocabulary entries; create_mappings places them ahead of the sorted words.
PAD_TOKEN = "<PAD>"  # fills the shorter sequences when a batch is padded
EOS_TOKEN = "<EOS>"  # appended to the end of every encoded sentence
SOS_TOKEN = "<SOS>"  # fed to the decoder as its first input
UNK_TOKEN = "<UNK>"  # reserved vocabulary slot; presumably meant for out-of-vocabulary words

def create_mappings(vocab):
    """Build word <-> id lookup tables for one language.

    The four special tokens come first (PAD, SOS, EOS, UNK, in that order),
    followed by the words of ``vocab`` in sorted order.

    Args:
        vocab: Iterable of word strings.

    Returns:
        A tuple ``(word2int, int2word)`` of dictionaries that are inverses of
        each other.
    """
    ordered_words = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
    ordered_words.extend(sorted(vocab))

    word2int = {}
    for index, word in enumerate(ordered_words):
        word2int[word] = index

    # Invert from word2int (not from the list) so both tables always agree.
    int2word = dict(zip(word2int.values(), word2int.keys()))
    return word2int, int2word

# Final lookup tables for each language. These replace the tables assigned to the
# same names earlier; ids 0-3 are the special tokens (PAD, SOS, EOS, UNK) in both.
eng_word2int, eng_int2word = create_mappings(english_vocab)
ita_word2int, ita_int2word = create_mappings(italian_vocab)

In [None]:
class TranslationDataset(Dataset):
    """Dataset of (English, Italian) sentence pairs encoded as id tensors.

    Args:
        pairs: Sequence of two-item entries ``(english_sentence, italian_sentence)``.
        eng_word2int: Mapping from English token to integer id.
        ita_word2int: Mapping from Italian token to integer id.
    """

    def __init__(self, pairs, eng_word2int, ita_word2int):
        self.pairs = pairs
        self.eng_word2int = eng_word2int
        self.ita_word2int = ita_word2int

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.pairs)

    @staticmethod
    def _encode(sentence, word2int):
        """Turn ``sentence`` into a 1-D long tensor of ids ending with the EOS id.

        Tokens missing from ``word2int`` are mapped to the ``UNK_TOKEN`` id
        instead of raising ``KeyError``. A ``KeyError`` is still raised when a
        token is missing and the mapping has no ``UNK_TOKEN`` entry.
        """
        ids = [
            word2int[word] if word in word2int else word2int[UNK_TOKEN]
            for word in tokenize(sentence)
        ]
        ids.append(word2int[EOS_TOKEN])
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(eng_tensor, ita_tensor)`` for the pair at position ``idx``."""
        eng, ita = self.pairs[idx]
        eng_tensor = self._encode(eng, self.eng_word2int)
        ita_tensor = self._encode(ita, self.ita_word2int)
        return eng_tensor, ita_tensor

def collate_fn(batch):
    """Pad a list of ``(eng_tensor, ita_tensor)`` samples into two batch tensors.

    Each side is padded independently to the longest sequence on that side,
    using that language's PAD id, with the batch as the first dimension.

    Returns:
        A tuple ``(eng_batch_padded, ita_batch_padded)``.
    """
    english_seqs, italian_seqs = zip(*batch)
    pad_ids = (eng_word2int[PAD_TOKEN], ita_word2int[PAD_TOKEN])
    padded = tuple(
        pad_sequence(seqs, batch_first=True, padding_value=pad_id)
        for seqs, pad_id in zip((english_seqs, italian_seqs), pad_ids)
    )
    return padded

In [None]:
from torch.nn.utils.rnn import pad_sequence
# Wrap the sentence pairs in a Dataset and serve them in shuffled, padded batches.
# drop_last=True discards the final partial batch so every batch has batch_size rows.
translation_dataset = TranslationDataset(pairs, eng_word2int, ita_word2int)
batch_size = 64
translation_dataloader = DataLoader(translation_dataset, batch_size=batch_size, shuffle=True,  drop_last=True, collate_fn=collate_fn)

# Report the dataset and loader sizes (these are the lines shown in the cell's output).
print("Translation samples: ", len(translation_dataset))
print("Translation batches: ", len(translation_dataloader))

Translation samples:  120746
Translation batches:  1886


In [None]:
# for eng, ita in translation_dataloader:
#     print("Eng batch:",eng)
#     print("Ita batch:",ita)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Encoder(nn.Module):
    """Embeds a batch of token ids and runs it through an LSTM in reversed order.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension (LSTM input size).
        hidden_size: LSTM hidden-state size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Encode ``x``, a batch-first tensor of token ids.

        The sequence dimension (dim 1) is flipped before embedding, so the
        LSTM reads each row from its last position to its first.

        Returns:
            ``(outputs, hidden, cell)`` as produced by the LSTM.
        """
        reversed_ids = torch.flip(x, dims=[1])
        lstm_out, state = self.lstm(self.embedding(reversed_ids))
        final_hidden, final_cell = state
        return lstm_out, final_hidden, final_cell

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that maps token ids plus a recurrent state to vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and size of the output layer.
        embed_size: Embedding dimension (LSTM input size).
        hidden_size: LSTM hidden-state size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden, cell):
        """Run the decoder on ``x`` starting from the state ``(hidden, cell)``.

        The projected LSTM output is reshaped to ``(batch, -1)``; for a
        one-token-per-row input that is ``(batch, vocab_size)``.

        Returns:
            ``(logits, hidden, cell)`` with the updated LSTM state.
        """
        embedded = self.embedding(x)
        lstm_out, state = self.lstm(embedded, (hidden, cell))
        hidden, cell = state
        batch = lstm_out.size(0)
        logits = self.fc(lstm_out).reshape(batch, -1)
        return logits, hidden, cell

In [None]:
# Model hyperparameters. The vocabulary sizes are taken from the lookup tables, so
# they count whatever special tokens those tables contain.
eng_vocab_size=len(eng_word2int)
ita_vocab_size=len(ita_word2int)
embed_size=256
hidden_size=512
num_layers=1
# Encoder and decoder share hidden_size and num_layers because the encoder's final
# (hidden, cell) state is passed directly to the decoder as its initial state.
encoder=Encoder(eng_vocab_size,embed_size,hidden_size,num_layers).to(DEVICE)
decoder=Decoder(ita_vocab_size,embed_size,hidden_size,num_layers).to(DEVICE)

In [None]:
def translate(encoder, decoder, sentence, eng_word2int, ita_int2word, max_length=15):
    """Greedily translate one English sentence into Italian.

    Both models are switched to eval mode and are left in eval mode on return.

    Args:
        encoder: Encoder module; called with a ``(1, seq_len)`` id tensor and
            expected to return ``(outputs, hidden, cell)``.
        decoder: Decoder module; called with a ``(1, 1)`` id tensor plus the
            recurrent state and expected to return ``(logits, hidden, cell)``.
        sentence: English input text; it is tokenized with ``tokenize``.
        eng_word2int: Mapping from English token to id. Tokens missing from it
            are replaced by its ``UNK_TOKEN`` entry.
        ita_int2word: Mapping from Italian id to token. The SOS and EOS ids
            are recovered from this mapping.
        max_length: Maximum number of decoding steps.

    Returns:
        The decoded Italian tokens joined by single spaces. Decoding stops at
        the first EOS token or after ``max_length`` steps.

    Raises:
        KeyError: If a required special token is missing from a mapping.
    """
    encoder.eval()
    decoder.eval()

    # Derive the Italian special-token ids from the mapping we were given, instead
    # of borrowing the English SOS id or reaching for a global Italian table.
    ita_token2id = {word: idx for idx, word in ita_int2word.items()}
    sos_id = ita_token2id[SOS_TOKEN]
    eos_id = ita_token2id[EOS_TOKEN]

    # Unknown input words map to <UNK> rather than raising KeyError.
    input_ids = [
        eng_word2int[word] if word in eng_word2int else eng_word2int[UNK_TOKEN]
        for word in tokenize(sentence)
    ]
    input_ids.append(eng_word2int[EOS_TOKEN])

    with torch.inference_mode():
        input_tensor = torch.tensor(input_ids, dtype=torch.long).view(1, -1).to(DEVICE)
        # The encoder's final state seeds the decoder.
        _, decoder_hidden, decoder_cell = encoder(input_tensor)

        decoded_words = []
        last_word = torch.tensor([[sos_id]], dtype=torch.long).to(DEVICE)
        for _ in range(max_length):
            logits, decoder_hidden, decoder_cell = decoder(last_word, decoder_hidden, decoder_cell)
            next_token = logits.argmax(dim=1)
            token_id = next_token.item()
            if token_id == eos_id:
                break
            # Fall back to <UNK> so an unmapped id cannot put None into the join below.
            decoded_words.append(ita_int2word.get(token_id, UNK_TOKEN))
            # Feed the prediction back in as the next (1, 1) input, staying on-device.
            last_word = next_token.view(1, 1)

        return ' '.join(decoded_words)

In [None]:
import torch.optim as optim
import torch.nn as nn
import random

# Teacher-forced training of the encoder/decoder pair.
# Targets are Italian ids, so the ignored padding id and the decoder's start token
# are taken from the Italian vocabulary.
loss_fn = nn.CrossEntropyLoss(ignore_index=ita_word2int[PAD_TOKEN])
encoder_optimizer = optim.AdamW(encoder.parameters())
decoder_optimizer = optim.AdamW(decoder.parameters())
num_epochs = 10
encoder.train()
decoder.train()

for epoch in range(num_epochs):
    for i, (input_tensor, target_tensor) in enumerate(translation_dataloader):
        input_tensor, target_tensor = input_tensor.to(DEVICE), target_tensor.to(DEVICE)
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        # Size tensors from the batch itself so the loop does not depend on the
        # loader always yielding exactly `batch_size` rows.
        current_batch_size = target_tensor.size(0)
        target_length = target_tensor.size(1)
        _, encoder_hidden, encoder_cell = encoder(input_tensor)
        decoder_input = torch.full((current_batch_size, 1), ita_word2int[SOS_TOKEN], dtype=torch.long, device=DEVICE)
        # The encoder's final state is the decoder's initial state.
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell
        loss = 0
        for di in range(target_length):
            logits, decoder_hidden, decoder_cell  = decoder(decoder_input, decoder_hidden, decoder_cell)
            loss += loss_fn(logits, target_tensor[:,di])
            # Teacher forcing: the next input is the ground-truth token, not the prediction.
            decoder_input = target_tensor[:, di].reshape(current_batch_size, 1)
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        if i%100==0:
            # Report the summed per-step loss averaged over the target length.
            print(f'Epoch {epoch}, Batch {i}, Loss: {loss.item() / target_length:.4f}')


Epoch 0, Batch 0, Loss: 9.5280
Epoch 0, Batch 100, Loss: 4.4034
Epoch 0, Batch 200, Loss: 4.1602
Epoch 0, Batch 300, Loss: 4.5719
Epoch 0, Batch 400, Loss: 3.5316
Epoch 0, Batch 500, Loss: 3.4764
Epoch 0, Batch 600, Loss: 3.1723
Epoch 0, Batch 700, Loss: 2.8174
Epoch 0, Batch 800, Loss: 2.9532
Epoch 0, Batch 900, Loss: 2.5940
Epoch 0, Batch 1000, Loss: 2.8760
Epoch 0, Batch 1100, Loss: 2.6314
Epoch 0, Batch 1200, Loss: 2.4217
Epoch 0, Batch 1300, Loss: 3.2909
Epoch 0, Batch 1400, Loss: 2.4033
Epoch 0, Batch 1500, Loss: 2.5604
Epoch 0, Batch 1600, Loss: 2.1422
Epoch 0, Batch 1700, Loss: 2.2601
Epoch 0, Batch 1800, Loss: 3.0945
Epoch 1, Batch 0, Loss: 0.7630
Epoch 1, Batch 100, Loss: 1.4685
Epoch 1, Batch 200, Loss: 1.6303
Epoch 1, Batch 300, Loss: 1.3961
Epoch 1, Batch 400, Loss: 1.3776
Epoch 1, Batch 500, Loss: 1.4368
Epoch 1, Batch 600, Loss: 1.2745
Epoch 1, Batch 700, Loss: 1.0998
Epoch 1, Batch 800, Loss: 1.1950
Epoch 1, Batch 900, Loss: 1.4213
Epoch 1, Batch 1000, Loss: 1.2337
Epoc

In [None]:
# Translate one sample sentence with the current encoder/decoder and print the result.
sentence = "tom said he would not come"
translated_sentence = translate(encoder, decoder, sentence, eng_word2int, ita_int2word)
print("Translated:", translated_sentence)

Translated: tom ha detto che non sarebbe venuto
