In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available." if device.type == "cuda" else "GPU is not available. Using CPU.")

# Transformer

## Multihead Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected with learned linear maps, split into
    ``num_heads`` slices of width ``d_model // num_heads``, attended
    independently per head, merged back together and projected once more.

    Args:
        d_model: Width of the vectors flowing through the model.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        head_dim, leftover = divmod(d_model, num_heads)
        if leftover:
            raise ValueError('d_model (dimension of word embeddings) must be divisible by num_heads')

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = head_dim  # width of a single head

        # Learned projections for queries, keys, values and the merged output.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return the attention-weighted sum of ``V``.

        Scores are ``Q @ K^T / sqrt(d_k)``. Wherever ``mask`` equals 0 the score
        is replaced by -1e9 before the softmax, so that key gets ~0 weight.
        """
        scores = Q @ K.transpose(-2, -1)
        scores = scores / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        return weights @ V

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        batch, length, _ = x.size()
        per_head = x.view(batch, length, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: (batch, heads, seq, d_k) -> (batch, seq, d_model)."""
        batch, _, length, _ = x.size()
        merged = x.permute(0, 2, 1, 3).contiguous()
        return merged.view(batch, length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, then merge and project the result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))

## Position Wise Feed Forward Layer
This is a standard feed-forward network. The same network is applied independently to every position (i.e., every word) in the sentence. <br /> <br />
It contains <br />
Input layer of shape = embedding dimension <br />
Output layer of shape = embedding dimension <br />
One hidden layer (no. of nodes = 2048 in the original paper)

In [None]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Args:
        d_model: Width of the input and of the output.
        d_ff: Width of the hidden layer between the two linear maps.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expand: d_model -> d_ff
        self.fc2 = nn.Linear(d_ff, d_model)  # contract: d_ff -> d_model
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute ``fc2(relu(fc1(x)))``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

## Positional Encoding
The Transformer uses no RNN, so we need to inject a sense of each word's position in the sentence. <br />
$PE(pos, 2i) = \sin(pos/10000^{2i/d_{model}})$ <br />
$PE(pos, 2i+1) = \cos(pos/10000^{2i/d_{model}})$

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    A table of shape (1, max_seq_length, d_model) is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos / 10000^(2i/d_model))`` and odd columns hold the matching cosine.

    Args:
        d_model: Width of the embeddings (even or odd).
        max_seq_length: Number of positions the table covers.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_length, ceil(d_model / 2))

        # Even columns (0, 2, 4, ...) take the sine.
        pe[:, 0::2] = torch.sin(angles)

        # Odd columns (1, 3, 5, ...) take the cosine. When d_model is odd there is
        # one fewer odd column than even columns, so drop the surplus frequency.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_length = x.size(1)
        if seq_length > self.pe.size(1):
            raise ValueError(
                f'sequence length {seq_length} exceeds max_seq_length {self.pe.size(1)}'
            )
        return x + self.pe[:, :seq_length]  # add the positional encoding to the word embedding

## Encoder Layer

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through the block; ``mask`` is handed to the self-attention."""
        # Sub-layer 1: self-attention with residual + norm.
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward with residual + norm.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)

## Decoder Layer

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual addition
    and its own layer normalisation.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run ``x`` through the block.

        ``tgt_mask`` is applied in the self-attention over ``x``; ``src_mask``
        is applied in the cross-attention, where ``x`` supplies the queries and
        ``enc_output`` supplies the keys and values.
        """
        # Sub-layer 1: self-attention over the decoder sequence.
        self_attended = self.dropout(self.self_attn(x, x, x, tgt_mask))
        x = self.norm1(x + self_attended)

        # Sub-layer 2: attention over the encoder output.
        cross_attended = self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask))
        x = self.norm2(x + cross_attended)

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm3(x + transformed)

## Transformer

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target-vocabulary logits.

    Token id 0 is treated as padding when attention masks are built, and token
    id 1 is used as the start-of-sequence id when decoding in ``inference``.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and width
            of the output logits.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per attention module.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the position-wise feed-forward networks.
        max_seq_length: Number of positions in the positional-encoding table and
            number of tokens produced by ``inference``.
        dropout: Dropout probability used throughout the model.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

        self.max_seq_length = max_seq_length

    def generate_mask(self, src, tgt):
        """Build boolean attention masks (True = may attend) for the given id tensors.

        Args:
            src: (batch, src_len) source ids, or None to skip the source mask.
            tgt: (batch, tgt_len) target ids, or None to skip the target mask.

        Returns:
            ``(src_mask, tgt_mask)``. ``src_mask`` has shape (batch, 1, 1, src_len)
            and is False at padding (id 0). ``tgt_mask`` has shape
            (batch, 1, tgt_len, tgt_len); it is lower-triangular so a position
            cannot look ahead, and rows belonging to padding positions are all
            False. Either entry is None when its input is None.
        """
        src_mask = None
        tgt_mask = None
        # Identity checks: comparing a tensor with `!=` is an element-wise operation.
        if src is not None:
            src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        if tgt is not None:
            seq_length = tgt.size(1)
            pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
            # Created on the input's own device so the masks never depend on a
            # notebook-level global.
            nopeak_mask = torch.tril(torch.ones(1, seq_length, seq_length, device=tgt.device)).bool()
            tgt_mask = pad_mask & nopeak_mask
        return src_mask, tgt_mask

    def _embed(self, embedding, tokens):
        """Look up token embeddings, add positional encodings, apply dropout."""
        return self.dropout(self.positional_encoding(embedding(tokens)))

    def _run_encoder(self, src_embedded, src_mask):
        """Pass embedded source tokens through every encoder layer in order."""
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        return enc_output

    def _run_decoder(self, tgt_embedded, enc_output, src_mask, tgt_mask):
        """Pass embedded target tokens through every decoder layer in order."""
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
        return dec_output

    def forward(self, src, tgt):
        """Teacher-forced pass: return logits of shape (batch, tgt_len, tgt_vocab_size)."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self._embed(self.encoder_embedding, src)
        tgt_embedded = self._embed(self.decoder_embedding, tgt)

        enc_output = self._run_encoder(src_embedded, src_mask)
        dec_output = self._run_decoder(tgt_embedded, enc_output, src_mask, tgt_mask)
        return self.fc(dec_output)

    def inference(self, src):
        """Greedily decode ``max_seq_length`` target tokens for ``src``.

        Decoding runs in eval mode (dropout off) without gradient tracking, and
        uses the same causal target mask as ``forward`` so the decoder sees its
        inputs the way it did during training. The module's previous
        train/eval mode is restored before returning.

        Args:
            src: (batch, src_len) source token ids.

        Returns:
            (batch, max_seq_length) tensor of predicted ids. The start token is
            not included, and decoding does not stop at an end-of-sequence id.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                src_mask, _ = self.generate_mask(src, None)
                enc_output = self._run_encoder(self._embed(self.encoder_embedding, src), src_mask)

                # Every sequence in the batch starts from the start token (id 1).
                decoder_input = torch.full((src.size(0), 1), 1, dtype=torch.long, device=src.device)

                for _ in range(self.max_seq_length):
                    _, tgt_mask = self.generate_mask(None, decoder_input)
                    tgt_embedded = self._embed(self.decoder_embedding, decoder_input)
                    dec_output = self._run_decoder(tgt_embedded, enc_output, src_mask, tgt_mask)

                    # With a causal mask, earlier positions cannot change, so only
                    # the newest position yields a new token.
                    next_token = torch.argmax(self.fc(dec_output[:, -1:]), dim=-1)
                    decoder_input = torch.cat((decoder_input, next_token), dim=1)
        finally:
            self.train(was_training)

        # Strip the leading start token; what remains are the predictions.
        return decoder_input[:, 1:]


# Data Loading

In [None]:
file_path = '/kaggle/input/fr-eng/fra.txt'

In [None]:
column_names = ['English', 'French']

# The file is plain tab-separated text, not quoted CSV:
#  - quoting=3 (csv.QUOTE_NONE) keeps '"' characters inside sentences verbatim instead of
#    letting the parser treat them as field quotes (which can strip them or merge lines);
#  - keep_default_na=False keeps sentences that spell a pandas NA marker (e.g. "NA", "null")
#    as strings rather than float NaN, which the later string preprocessing cannot handle.
df = pd.read_csv(
    file_path,
    delimiter='\t',
    usecols=[0, 1],
    names=column_names,
    quoting=3,
    keep_default_na=False,
)

In [None]:
english_sentences = df['English'].tolist()
french_sentences = df['French'].tolist()

### Unknown token
Sometimes the model may encounter unknown words, especially names of people, places, etc. <br />
I add some examples of this to the dataset. <br />
(Here, the unknown word (i.e., the name of a person, place, etc.) is carried over into the translation as is.)

In [None]:
# Hand-written sentence pairs that teach the model to copy the <unk> placeholder
# from the source into the translation. The two lists are index-aligned.
english_addn = [
    "<unk> is a good boy",
    "My favourite food is <unk>",
    "I live in <unk>",
    "Tom lives in <unk>",
    "<unk> lives in Kolkata",
    "<unk> is very sick and needs to go to a doctor",
    "<unk>",
    "Today I cooked <unk> and it was delicious"
]

french_addn = [
    "<unk> est un bon garçon",
    "Ma nourriture préférée est <unk>",  # "nourriture" is feminine -> "préférée"
    "J'habite à <unk>",
    "Tom habite à <unk>",
    "<unk> habite à Kolkata",
    "<unk> est très malade et a besoin d'aller chez un médecin",  # "et" (and), not "est" (is)
    "<unk>",
    "Aujourd'hui j'ai cuisiné <unk> et c'était délicieux"
]

In [None]:
english_sentences.extend(english_addn)
french_sentences.extend(french_addn)

In [None]:
print(f'{english_sentences[-1]}\n{french_sentences[-1]}')

In [None]:
def lower_and_preprocess(x):
    """Normalise one sentence and wrap it in start/end markers.

    The sentence is lower-cased, the special space characters U+202F, U+00A0
    and U+2009 become ordinary spaces, and a space is inserted after every
    apostrophe so the part before and after it split into separate tokens.
    The result is returned as ``'<sos> ... <eos>'``.
    """
    substitutions = (
        ('\u202f', ' '),  # narrow no-break space
        ('\xa0', ' '),    # no-break space
        ('\u2009', ' '),  # thin space
        ("'", "' "),      # split words at the apostrophe
    )
    text = str.lower(x)
    for old, new in substitutions:
        text = text.replace(old, new)
    return f'<sos> {text} <eos>'

In [None]:
english_sentences = list(map(lower_and_preprocess, english_sentences))
french_sentences = list(map(lower_and_preprocess, french_sentences))

In [None]:
english_vocab = {'<pad>': 0, '<sos>': 1, '<eos>':2, '<unk>': 3}
french_vocab = {'<pad>': 0, '<sos>': 1, '<eos>':2, '<unk>': 3}

In [None]:
def tokenizer(sentences, vocab):
    """Encode sentences as lists of token ids, growing ``vocab`` as needed.

    Each sentence is split on whitespace. A token not yet in ``vocab`` is
    added in place with the next free id (the current size of ``vocab``).

    Returns:
        One list of ids per input sentence, in the same order.
    """
    def encode(sentence):
        ids = []
        for token in sentence.strip().split():
            if token not in vocab:
                vocab[token] = len(vocab)
            ids.append(vocab[token])
        return ids

    return [encode(sentence) for sentence in sentences]

In [None]:
def tokenizer_inference(sentences, vocab):
    """Encode sentences with a fixed vocabulary, without modifying it.

    Each sentence is split on whitespace. Tokens missing from ``vocab`` are
    mapped to the id stored under ``'<unk>'`` (``None`` if ``vocab`` has no
    such entry).

    Returns:
        One list of ids per input sentence, in the same order.
    """
    unk_id = vocab.get('<unk>')
    # Use the default argument of dict.get rather than `or`: id 0 is a valid id
    # but is falsy, so `vocab.get(token) or unk_id` would turn it into <unk>.
    return [
        [vocab.get(token, unk_id) for token in sentence.strip().split()]
        for sentence in sentences
    ]

In [None]:
english_tokenized = tokenizer(english_sentences, english_vocab)
french_tokenized = tokenizer(french_sentences, french_vocab)

In [None]:
# just checking
print(f'{english_tokenized[-1]}\n{french_tokenized[-1]}')

In [None]:
# Step 2: Padding
# Right-pad every id sequence with 0 (the <pad> id) up to the longest sequence
# of its language, giving one (num_sentences, max_len) tensor per language.
english_padded = pad_sequence(list(map(torch.tensor, english_tokenized)), batch_first=True, padding_value=0)
french_padded = pad_sequence(list(map(torch.tensor, french_tokenized)), batch_first=True, padding_value=0)

In [None]:
# Inverse lookup tables (token id -> token string), used to turn ids back into text.
english_vocab_rev = dict(zip(english_vocab.values(), english_vocab.keys()))
french_vocab_rev = dict(zip(french_vocab.values(), french_vocab.keys()))

In [None]:
# Step 3: Dataset Creation
class TranslationDataset(Dataset):
    """Index-aligned pairs of source (English) and target (French) sequences.

    Args:
        english_data: Indexable collection of source sequences.
        french_data: Indexable collection of target sequences, same length.

    Raises:
        ValueError: If the two collections differ in length.
    """

    def __init__(self, english_data, french_data):
        # __len__ reports only the English side, so a mismatch would otherwise
        # be silently truncated or fail later with an index error.
        if len(english_data) != len(french_data):
            raise ValueError(
                f'english_data and french_data must have the same length, '
                f'got {len(english_data)} and {len(french_data)}'
            )
        self.english_data = english_data
        self.french_data = french_data

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.english_data)

    def __getitem__(self, index):
        """Return the pair at ``index`` as ``{'input': ..., 'target': ...}``."""
        return {
            'input': self.english_data[index],
            'target': self.french_data[index]
        }


In [None]:
# Step 4: Data Loaders
# Serve the padded sentence pairs in shuffled mini-batches.
batch_size = 32
dataset = TranslationDataset(english_data=english_padded, french_data=french_padded)
data_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)

In [None]:
size_eng = len(english_vocab)
size_fr = len(french_vocab)

In [None]:
print(f'{size_eng},\n{size_fr}')

In [None]:
french_padded.shape

In [None]:
english_padded.shape

In [None]:
# Model hyperparameters.
src_vocab_size = size_eng   # English vocabulary size
tgt_vocab_size = size_fr    # French vocabulary size
d_model = 512               # embedding / hidden width
num_heads = 8               # attention heads (must divide d_model)
num_layers = 6              # encoder layers and decoder layers
d_ff = 2048                 # feed-forward hidden width
max_seq_length = 70         # positions covered by the positional-encoding table
dropout = 0.1

# Place the model on the device selected at the top of the notebook instead of a
# hard-coded 'cuda', so the notebook also runs on CPU-only machines.
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

In [None]:
# Targets equal to 0 (the <pad> id) are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=transformer.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

In [None]:
transformer.train()

for epoch in range(2):
    batch_cnt = 0
    tot_loss = 0

    for batch in data_loader:
        batch_cnt += 1
        # Move the batch to the same device as the model (not a hard-coded 'cuda').
        input_batch = batch['input'].to(device)  # Tensor of shape (batch_size, max_english_length)
        target_batch = batch['target'].to(device)  # Tensor of shape (batch_size, max_french_length)

        # Teacher forcing: the decoder is fed the target without its last token and
        # is trained to predict the target shifted left by one position.
        decoder_input = target_batch[:, :-1]
        expected_output = target_batch[:, 1:]

        optimizer.zero_grad()
        output = transformer(input_batch, decoder_input)
        loss = criterion(output.reshape(-1, tgt_vocab_size), expected_output.reshape(-1))
        loss.backward()
        optimizer.step()

        tot_loss += loss.item()
        if (batch_cnt % 500 == 0):
            print(f'\tEpoch: {epoch+1} Batch: {batch_cnt}, Loss: {loss.item()}')

    # max(..., 1) avoids a ZeroDivisionError if the loader yielded no batches.
    print(f"Epoch: {epoch+1}, Loss: {tot_loss/max(batch_cnt, 1)}")

In [None]:
path = '/kaggle/working/transformer_state.pt'

In [None]:
torch.save(transformer.state_dict(), path)

In [None]:
#transformer.load_state_dict(torch.load(path))

In [None]:
# Teacher-forced sanity check on a single sentence pair. Switch to eval mode so
# dropout does not perturb the prediction, and skip gradient tracking since no
# backward pass follows.
transformer.eval()
with torch.no_grad():
    res = transformer(english_padded[90003:90004].to(device), french_padded[90003:90004].to(device))

In [None]:
res.shape

In [None]:
numerical_res = torch.argmax(res, axis=2)

In [None]:
numerical_res.shape

In [None]:
numerical_res

In [None]:
def get_sentence(vec, mapping_dict):
    """Decode a sequence of token ids into a space-separated string.

    Each element of ``vec`` is converted with ``.item()`` and looked up in
    ``mapping_dict``. ``'<sos>'`` tokens are skipped and decoding stops at the
    first ``'<eos>'``. Every kept token is preceded by one space, so a
    non-empty result starts with a space.
    """
    words = []
    for item in vec:
        word = mapping_dict[item.item()]
        if word == '<eos>':
            break
        if word != '<sos>':
            words.append(word)
    return ''.join(' ' + word for word in words)

In [None]:
get_sentence(numerical_res[0].to('cpu'), french_vocab_rev)

In [None]:
french_vocab_rev[83]

In [None]:
get_sentence(english_padded[90003].to('cpu'), english_vocab_rev)

In [None]:
get_sentence(french_padded[90003].to('cpu'), french_vocab_rev)