<a href="https://colab.research.google.com/github/Busola181/NeuralNetworkBasedLanguageTranslationTool/blob/main/Neural_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ♈ Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## ♒ Downloading the dataset from Kaggle

In [None]:
API_KEY_PATH ="/content/drive/MyDrive/kaggle.json"

!mkdir -p ~/.kaggle
!cp $API_KEY_PATH ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d devicharith/language-translation-englishfrench

RESOURCES_PATH="/content/language-translation-englishfrench.zip"
!cp $RESOURCES_PATH.
!unzip /content/language-translation-englishfrench.zip -d /content/

Dataset URL: https://www.kaggle.com/datasets/devicharith/language-translation-englishfrench
License(s): CC0-1.0
Downloading language-translation-englishfrench.zip to /content
 85% 3.00M/3.51M [00:00<00:00, 5.46MB/s]
100% 3.51M/3.51M [00:00<00:00, 5.28MB/s]
cp: missing destination file operand after '.'
Try 'cp --help' for more information.
Archive:  /content/language-translation-englishfrench.zip
  inflating: /content/eng_-french.csv  


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split, DataLoader, Dataset
from collections import Counter
import re
import math
import nltk
import string
from unicodedata import normalize
from nltk.tokenize import word_tokenize


# Add the custom download directory to NLTK's resource search path.
nltk.data.path.append('/usr/local/share/nltk_data')
# WordNet is stored in the custom directory; the other packages go to NLTK's default location.
nltk.download('wordnet', download_dir='/usr/local/share/nltk_data')
nltk.download('stopwords')
# Tokenizer data; presumably what `word_tokenize` needs (confirm for the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# CSV file holding the English/French sentence pairs.
dataset_path = '/content/eng_-french.csv'
# Load the pairs into a DataFrame and show its column names.
df = pd.read_csv(dataset_path)
print(df.columns)

Index(['English words/sentences', 'French words/sentences'], dtype='object')


In [None]:
# Collect the punctuation characters that actually occur in the sentences.
# The text of every column is scanned; scanning `dataset_path` itself would
# only report the punctuation found in the file *path* string.
corpus_text = ''.join(df[column].astype(str).str.cat(sep='') for column in df.columns)
punctuations = set(string.punctuation).intersection(corpus_text)
print(punctuations)

{'_', '.', '/', '-'}


In [None]:
def load_doc(dataset_path, column_path, preview=100):
    """Print the first whitespace-separated words of one text column.

    Args:
        dataset_path: Path of the CSV file holding the sentence pairs.
        column_path: Name of the column to inspect.
        preview: Number of leading words to print (default 100).

    Returns:
        None. The preview is printed, not returned.
    """
    # Read from the given path instead of relying on a notebook-global frame.
    frame = pd.read_csv(dataset_path)
    # Join rows with a space: an empty separator glues the last word of one
    # sentence to the first word of the next (e.g. "Hi.Run!Run!").
    text = frame[column_path].dropna().astype(str).str.cat(sep=' ')
    words = text.split()
    print(words[:preview])

# Preview the words of the English column.
load_doc(dataset_path, 'English words/sentences')


['Hi.Run!Run!Who?Wow!Fire!Help!Jump.Stop!Stop!Stop!Wait!Wait!Go', 'on.Go', 'on.Go', 'on.Hello!Hello!I', 'see.I', 'try.I', 'won!I', 'won!I', 'won.Oh', 'no!Attack!Attack!Cheers!Cheers!Cheers!Cheers!Get', 'up.Go', 'now.Go', 'now.Go', 'now.Got', 'it!Got', 'it!Got', 'it?Got', 'it?Got', 'it?Hop', 'in.Hop', 'in.Hug', 'me.Hug', 'me.I', 'fell.I', 'fell.I', 'know.I', 'left.I', 'left.I', 'lied.I', 'lost.I', "paid.I'm", "19.I'm", "OK.I'm", 'OK.Listen.No', 'way!No', 'way!No', 'way!No', 'way!No', 'way!No', 'way!No', 'way!No', 'way!No', 'way!Really?Really?Really?Thanks.We', 'try.We', 'won.We', 'won.We', 'won.We', 'won.Ask', 'Tom.Awesome!Be', 'calm.Be', 'calm.Be', 'calm.Be', 'cool.Be', 'fair.Be', 'fair.Be', 'fair.Be', 'fair.Be', 'fair.Be', 'fair.Be', 'kind.Be', 'nice.Be', 'nice.Be', 'nice.Be', 'nice.Be', 'nice.Be', 'nice.Beat', 'it.Call', 'me.Call', 'me.Call', 'us.Call', 'us.Come', 'in.Come', 'in.Come', 'in.Come', 'in.Come', 'on!Come', 'on.Come', 'on.Come', 'on.Drop', 'it!Drop', 'it!Drop', 'it!Drop', 

In [None]:
def load_doc(dataset_path):
    """Read the CSV file at ``dataset_path`` and return it as a DataFrame."""
    return pd.read_csv(dataset_path)

def preprocess_text(text):
    """Normalise one sentence and split it into lowercase ASCII word tokens.

    Steps: strip accents (NFD decomposition followed by ASCII encoding),
    lowercase, turn ``_ . / -`` into spaces, delete every remaining character
    that is not a letter or whitespace, then tokenize with ``word_tokenize``.

    Args:
        text: The raw sentence. Non-string values (for example the NaN that
            pandas uses for an empty CSV cell) are treated as an empty
            sentence.

    Returns:
        list[str]: The tokens; an empty list when nothing is left to tokenize.
    """
    # `normalize` raises TypeError on non-strings, which would abort a whole
    # `Series.apply` pass over a column containing a single missing value.
    if not isinstance(text, str):
        return []
    text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
    text = text.lower()
    # Separator-like punctuation becomes a space so neighbouring words stay apart.
    text = re.sub(r"[_./-]", " ", text)
    # Anything else that is not a letter or whitespace is removed outright.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    if not text.strip():
        return []
    return word_tokenize(text)

def tokenize_pairs(df):
    """Return a copy of ``df`` with tokenized English and French columns added.

    The input frame is left untouched, so re-running the cell (or calling the
    function twice) always starts from the same raw data.

    Args:
        df: DataFrame with the columns 'English words/sentences' and
            'French words/sentences'.

    Returns:
        A new DataFrame containing the original columns plus
        'English tokens' and 'French tokens' (one token list per row).
    """
    return df.assign(**{
        'English tokens': df['English words/sentences'].apply(preprocess_text),
        'French tokens': df['French words/sentences'].apply(preprocess_text),
    })

# Load the raw sentence pairs from the CSV.
df = load_doc(dataset_path)

# Build the frame that carries the 'English tokens' / 'French tokens' columns.
df_tokenized = tokenize_pairs(df)

# Show the first two tokenized pairs.
print(df_tokenized[['English tokens', 'French tokens']].head(2))


  English tokens French tokens
0           [hi]       [salut]
1          [run]       [cours]


In [None]:
def create_vocab(english_tokens, french_tokens, min_freq=1):
    """Build word -> index vocabularies for the English and French corpora.

    Indices 0-3 are reserved for ``<PAD>``, ``<UNK>``, ``<BOS>`` and ``<EOS>``.
    Words are numbered consecutively from 4 in order of first appearance, so
    every index is unique and lies in ``range(len(vocab))``. That is the range
    an ``nn.Embedding(len(vocab), dim)`` accepts.

    Args:
        english_tokens: Iterable of token lists, one per English sentence.
        french_tokens: Iterable of token lists, one per French sentence.
        min_freq: Minimum number of occurrences a word needs to be included.

    Returns:
        tuple[dict, dict]: ``(english_vocab, french_vocab)``.
    """
    special_tokens = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']

    def _build(sentences):
        """Build one vocabulary from an iterable of token lists."""
        freq = Counter(word for sentence in sentences for word in sentence)
        vocab = {token: idx for idx, token in enumerate(special_tokens)}
        for word, count in freq.items():
            # Use len(vocab) as the next index so that filtered-out words leave
            # no gaps and nothing can collide with the reserved indices.
            if count >= min_freq and word not in vocab:
                vocab[word] = len(vocab)
        return vocab

    return _build(english_tokens), _build(french_tokens)

def text_to_tokens(text, vocab):
    """Map every word of ``text`` to its index in ``vocab``.

    Words missing from ``vocab`` are mapped to the index of ``'<UNK>'``.
    """
    indices = []
    for token in text:
        indices.append(vocab.get(token, vocab['<UNK>']))
    return indices

def pad_sequences(sequences, max_len, pad_token_idx):
    """Bring every list in ``sequences`` to exactly ``max_len`` items.

    Shorter lists are right-padded with ``pad_token_idx``; longer ones are
    truncated to their first ``max_len`` items.
    """
    padded = []
    for seq in sequences:
        missing = max_len - len(seq)
        if missing > 0:
            padded.append(seq + [pad_token_idx] * missing)
        else:
            padded.append(seq[:max_len])
    return padded

# Token lists produced by the tokenization cell.
english_tokens = df_tokenized['English tokens'].tolist()
french_tokens = df_tokenized['French tokens'].tolist()

# Build both vocabularies once. `text_to_tokens` and `pad_sequences` are
# already defined at the top of this cell, and `df_tokenized` already holds the
# token columns, so nothing is redefined or re-tokenized here.
english_vocab, french_vocab = create_vocab(english_tokens, french_tokens)

# Example: convert the first sentence pair to index tensors.
sample_english = df_tokenized['English tokens'].iloc[0]
sample_french = df_tokenized['French tokens'].iloc[0]
english_tokenized = text_to_tokens(sample_english, english_vocab)
french_tokenized = text_to_tokens(sample_french, french_vocab)

eng_input_tensor = torch.tensor(english_tokenized, dtype = torch.long)
fren_target_tensor = torch.tensor(french_tokenized, dtype = torch.long)

print(eng_input_tensor.shape)
print(fren_target_tensor.shape)


torch.Size([1])
torch.Size([1])


In [None]:
# Show only the first entries; printing the whole mapping floods the output.
print(dict(list(english_vocab.items())[:20]))



In [None]:
# Show only the first entries; printing the whole mapping floods the output.
print(dict(list(french_vocab.items())[:20]))

{'salut': 4, 'cours': 5, 'courez': 6, 'qui': 7, 'ca': 8, 'alors': 9, 'au': 10, 'feu': 11, 'a': 12, 'laide': 13, 'saute': 14, 'suffit': 15, 'stop': 16, 'arrete': 17, 'toi': 18, 'attends': 19, 'attendez': 20, 'poursuis': 21, 'continuez': 22, 'poursuivez': 23, 'bonjour': 24, 'je': 25, 'comprends': 26, 'jessaye': 27, 'jai': 28, 'gagne': 29, 'lai': 30, 'emporte': 31, 'oh': 32, 'non': 33, 'attaque': 34, 'attaquez': 35, 'sante': 36, 'votre': 37, 'merci': 38, 'tchin': 39, 'leve': 40, 'va': 41, 'maintenant': 42, 'allez': 43, 'y': 44, 'vas': 45, 'pige': 46, 'compris': 47, 'tas': 48, 'capte': 49, 'monte': 50, 'montez': 51, 'serre': 52, 'moi': 53, 'dans': 54, 'tes': 55, 'bras': 56, 'serrez': 57, 'vos': 58, 'suis': 59, 'tombee': 60, 'tombe': 61, 'sais': 62, 'parti': 63, 'partie': 64, 'menti': 65, 'perdu': 66, 'paye': 67, 'ans': 68, 'vais': 69, 'bien': 70, 'ecoutez': 71, 'cest': 72, 'pas': 73, 'possible': 74, 'impossible': 75, 'en': 76, 'aucun': 77, 'cas': 78, 'sans': 79, 'facons': 80, 'hors': 81, '

In [None]:
# Model/embedding width; presumably the value passed as `mod_vec_dim` to the embedding classes (confirm).
mod_vec_dim = 512
# Number of entries in each word -> index mapping.
vocab_size = len(english_vocab)
fre_vocab_size = len(french_vocab)

In [None]:
class input_Embedding(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(mod_vec_dim)."""

    def __init__(self, mod_vec_dim, vocab_size):
        """Create an embedding table of ``vocab_size`` rows and ``mod_vec_dim`` columns."""
        super(input_Embedding, self).__init__()
        self.vocab_size = vocab_size
        self.mod_vec_dim = mod_vec_dim
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=mod_vec_dim)

    def forward(self, x):
        """Look up the embeddings of the indices in ``x`` and scale them."""
        scale = math.sqrt(self.mod_vec_dim)
        embedded = self.embed(x)
        return embedded * scale

class position_encoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table has shape ``(1, seq_len, mod_vec_dim)``: even feature columns
    hold ``sin(pos * w_i)``, odd columns hold ``cos(pos * w_i)``, with
    ``w_i = 10000 ** (-2i / mod_vec_dim)``. It is registered as a buffer, so it
    follows the module across devices and is never trained.

    Args:
        mod_vec_dim: Width of the embedding vectors.
        seq_len: Longest sequence length the table supports.
    """

    def __init__(self, mod_vec_dim, seq_len):
        super().__init__()
        self.mod_vec_dim = mod_vec_dim
        self.seq_len = seq_len

        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # (seq_len, 1)
        div_term = torch.exp(
            torch.arange(0, mod_vec_dim, 2).float() * -math.log(10000) / mod_vec_dim
        )  # (ceil(mod_vec_dim / 2),)
        angles = position * div_term  # (seq_len, ceil(mod_vec_dim / 2))

        # One row per position, one column per feature.
        pe = torch.zeros(seq_len, mod_vec_dim)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd width there is one odd column fewer than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : mod_vec_dim // 2])

        # register_buffer is a method call; assigning to it would shadow the
        # method and leave `self.pe` undefined.
        self.register_buffer('pe', pe.unsqueeze(0))

        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return ``dropout(x + pe)`` for ``x`` of shape (batch, length, mod_vec_dim).

        Raises:
            ValueError: If the sequence is longer than ``seq_len``.
        """
        if x.size(1) > self.pe.size(1):
            raise ValueError(
                f"sequence length {x.size(1)} exceeds the supported maximum {self.pe.size(1)}"
            )
        # Buffers carry no gradient, so no explicit requires_grad_ call is needed.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

# embed_dim == model_vec_dim
# For single head
class self_attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        query, key, value: Unused. They are accepted only so the existing
            positional constructor signature keeps working; the projection
            layers are built from ``embed_dim`` and ``head_size``.
        head_size: Output width of the query/key/value projections.
        embed_dim: Width of the input vectors. It is a constructor argument
            (default 512, the notebook's embedding width) rather than a global
            that has to be defined before this class is instantiated.
    """

    def __init__(self, query, key, value, head_size, embed_dim=512):
        super(self_attention, self).__init__()
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.key = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)
        self.head_size = head_size
        # forward() reads `self.softmax`, so the attribute must carry that name.
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Attend over ``x`` (..., length, embed_dim); returns (..., length, head_size)."""
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # Scale by sqrt(head_size) before the softmax.
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
        attention_probs = self.softmax(attention_scores)

        out = torch.matmul(attention_probs, value)
        out = self.dropout(out)

        return out


# For Multi-head

class Multihead_Attention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        embed_size: Width of the input and output vectors.
        num_heads: Number of attention heads; must divide ``embed_size``.
        mask: Optional default mask, used by ``forward`` whenever no mask is
            passed to the call itself.
    """

    def __init__(self, embed_size, num_heads, mask=None):
        super(Multihead_Attention, self).__init__()
        self.embed_size = embed_size
        self.num_head = num_heads
        self.head_dim = embed_size // num_heads
        # Kept as an alias of head_dim for code that reads `head_size`.
        self.head_size = self.head_dim
        self.mask = mask

        assert self.head_dim * num_heads == embed_size, \
            "embed_size must be divisible by num_heads"

        self.w_q = nn.Linear(embed_size, embed_size)
        self.w_k = nn.Linear(embed_size, embed_size)
        self.w_v = nn.Linear(embed_size, embed_size)

        self.fc_out = nn.Linear(embed_size, embed_size)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, query, key, value, mask=None):
        """Compute attention of ``query`` over ``key``/``value``.

        Args:
            query: Tensor of shape (batch, query_len, embed_size).
            key: Tensor of shape (batch, key_len, embed_size).
            value: Tensor of shape (batch, key_len, embed_size).
            mask: Optional tensor broadcastable to
                (batch, heads, query_len, key_len); positions where it equals
                0 receive no attention. Falls back to the constructor's mask.

        Returns:
            Tensor of shape (batch, query_len, embed_size).
        """
        N = query.shape[0]
        query_len = query.shape[1]

        # Keys/values are reshaped with their own length so that attention
        # across sequences of different lengths (cross-attention) works.
        Q = self.w_q(query).view(N, query_len, self.num_head, self.head_dim).transpose(1, 2)
        K = self.w_k(key).view(N, key.shape[1], self.num_head, self.head_dim).transpose(1, 2)
        V = self.w_v(value).view(N, value.shape[1], self.num_head, self.head_dim).transpose(1, 2)
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is None:
            mask = self.mask
        if mask is not None:
            # A large finite negative keeps a fully masked row free of NaNs.
            attention_scores = attention_scores.masked_fill(mask == 0, float('-1e20'))

        attention_probs = self.dropout(self.softmax(attention_scores))
        out = torch.matmul(attention_probs, V)

        # Merge the heads back: (N, heads, query_len, head_dim) -> (N, query_len, embed_size).
        out = out.transpose(1, 2).contiguous().view(N, query_len, self.embed_size)
        out = self.fc_out(out)

        return out



In [None]:
class TransformerEncoder(nn.Module):
    """Placeholder for the Transformer encoder (not implemented yet).

    The docstring gives the class a body so the cell parses.
    TODO: add ``__init__`` and ``forward``. Until then, calling an instance
    raises NotImplementedError (the default ``nn.Module.forward`` behaviour).
    """


In [None]:
class TranslationDataset(Dataset):
    """Sentence-pair dataset yielding ``(english_indices, french_indices)`` tensors.

    Args:
        english_tokens: List of token lists, one per English sentence.
        french_tokens: List of token lists, aligned with ``english_tokens``.
        english_vocab: word -> index mapping for English (must contain '<UNK>').
        french_vocab: word -> index mapping for French (must contain '<UNK>').
    """

    def __init__(self, english_tokens, french_tokens, english_vocab, french_vocab):
        self.english_tokens = english_tokens
        self.french_tokens = french_tokens
        self.english_vocab = english_vocab
        self.french_vocab = french_vocab

    def __len__(self):
        return len(self.english_tokens)

    def __getitem__(self, idx):
        english_indices = text_to_tokens(self.english_tokens[idx], self.english_vocab)
        french_indices = text_to_tokens(self.french_tokens[idx], self.french_vocab)

        # Explicit dtype: an empty sentence would otherwise become a float tensor.
        return (
            torch.tensor(english_indices, dtype=torch.long),
            torch.tensor(french_indices, dtype=torch.long),
        )


def _pad_batch(sequences, pad_token_idx):
    """Stack variable-length 1-D index sequences into one right-padded LongTensor."""
    max_len = max(len(seq) for seq in sequences)
    padded = torch.full((len(sequences), max_len), pad_token_idx, dtype=torch.long)
    for row, seq in enumerate(sequences):
        padded[row, :len(seq)] = torch.as_tensor(seq, dtype=torch.long)
    return padded


def collate_fn(batch, pad_token_idx=0, tgt_pad_token_idx=None):
    """Pad a batch of sentence pairs to the longest sentence on each side.

    This is a module-level function (not a method) so it can be handed to
    ``DataLoader(collate_fn=...)``.

    Args:
        batch: List of ``(english_indices, french_indices)`` pairs.
        pad_token_idx: Padding index for the English side (and for the French
            side when ``tgt_pad_token_idx`` is not given).
        tgt_pad_token_idx: Padding index for the French side; the two
            vocabularies may assign different indices to '<PAD>'.

    Returns:
        tuple: ``(english_batch, french_batch)`` LongTensors of shape
        (batch_size, max_len_english) and (batch_size, max_len_french).
    """
    if tgt_pad_token_idx is None:
        tgt_pad_token_idx = pad_token_idx
    english_batch, french_batch = zip(*batch)
    return _pad_batch(english_batch, pad_token_idx), _pad_batch(french_batch, tgt_pad_token_idx)


# Example data
english_tokens = df_tokenized['English tokens'].tolist()
french_tokens = df_tokenized['French tokens'].tolist()

# Create dataset
dataset = TranslationDataset(english_tokens, french_tokens, english_vocab, french_vocab)

# Create dataloader; pad with each vocabulary's own '<PAD>' index.
batch_size = 32
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=lambda batch: collate_fn(batch, english_vocab['<PAD>'], french_vocab['<PAD>']),
)

# Example: iterating through a batch
for batch in dataloader:
    english_batch, french_batch = batch
    print(english_batch.shape)  # Shape: [batch_size, max_len_english]
    print(french_batch.shape)   # Shape: [batch_size, max_len_french]
    break


In [None]:
# NOTE(review): an earlier cell sets batch_size = 32 before building its DataLoader; confirm which value is intended.
batch_size = 64
# Embedding width (same value as `mod_vec_dim` above).
embed_dim = 512


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Assuming `model`, `data_loader`, `optimizer`, and `criterion` (e.g., nn.CrossEntropyLoss()) are predefined.
# NOTE(review): none of these four names is defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel; define them before running it.

# Training loop
num_epochs = 10  # Number of epochs or training iterations

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0  # Track total loss for the epoch

    for src, tgt in data_loader:  # Loop over batches of data
        # Prepare inputs and target for teacher forcing
        # (assumes src/tgt arrive as 2-D (batch_size, sequence_length) tensors; TODO confirm)
        src = src.permute(1, 0)  # Reshape to (sequence_length, batch_size)
        tgt_input = tgt[:, :-1].permute(1, 0)  # Offset for target input
        tgt_output = tgt[:, 1:].permute(1, 0)  # Offset for target output

        optimizer.zero_grad()  # Clear previous gradients

        # Forward pass: Generate predictions
        output = model(src, tgt_input)
        output = output.view(-1, output.size(-1))  # Flatten for loss computation
        tgt_output = tgt_output.contiguous().view(-1)  # Flatten target sequence

        # Calculate loss
        loss = criterion(output, tgt_output)

        # Backward pass: Compute gradients
        loss.backward()

        # Update model parameters
        optimizer.step()

        # Accumulate loss for tracking
        total_loss += loss.item()

    # Print average loss for the epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}")
