In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Creating Model

In [3]:
class PositionalEmbedding(nn.Module):
    def __init__(self, max_length, embed_dim, dropout=0.1):
        super().__init__()
        self.pos_embed = nn.Parameter(torch.randn(max_length, embed_dim) * 0.02)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        return self.dropout(X + self.pos_embed[:X.size(1)])

In [4]:
class MultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.h = num_heads
        self.d = embed_dim // num_heads
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, X):
        return X.view(X.size(0), X.size(1), self.h, self.d).transpose(1, 2)

    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
        q = self.split_heads(self.q_proj(query))  # (B, h, Lq, d)
        k = self.split_heads(self.k_proj(key))  # (B, h, Lk, d)
        v = self.split_heads(self.v_proj(value))  # (B, h, Lv, d) with Lv=Lk
        scores = q @ k.transpose(2, 3) / self.d**0.5  # (B, h, Lq, Lk)

        if attn_mask is not None:
            scores = scores.masked_fill(attn_mask, -torch.inf)  # (B, h, Lq, Lk)

        if key_padding_mask is not None:
            mask = key_padding_mask.unsqueeze(1).unsqueeze(2)  # (B, 1, 1, Lk)
            scores = scores.masked_fill(mask, -torch.inf)  # (B, h, Lq, Lk)
        
        weights = scores.softmax(dim=-1)  # (B, h, Lq, Lk)
        Z = self.dropout(weights) @ v  # (B, h, Lq, d)
        Z = Z.transpose(1, 2)  # (B, Lq, h, d)
        Z = Z.reshape(Z.size(0), Z.size(1), self.h * self.d)  # (B, Lq, h × d)
        return (self.out_proj(Z), weights)  # (B, Lq, h × d)

In [5]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        attn, _ = self.self_attn(src, src, src, attn_mask=src_mask,
                                 key_padding_mask=src_key_padding_mask)
        Z = self.norm1(src + self.dropout(attn))
        ff = self.dropout(self.linear2(self.dropout(self.linear1(Z).relu())))
        return self.norm2(Z + ff)

In [6]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout)
        self.multihead_attn = MultiheadAttention(d_model, nhead, dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        
    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        attn1, _ = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)
        Z = self.norm1(tgt + self.dropout(attn1))
        attn2, _ = self.multihead_attn(Z, memory, memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)
        Z = self.norm2(Z + self.dropout(attn2))
        ff = self.dropout(self.linear2(self.dropout(self.linear1(Z).relu())))
        return self.norm3(Z + ff)

In [7]:
class NmtTransformer(nn.Module):
    def __init__(self, vocab_size, max_length, embed_dim=512, pad_id=0,
                 num_heads=8, num_layers=6, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.pos_embed = PositionalEmbedding(max_length, embed_dim, dropout)
        self.transformer = nn.Transformer(embed_dim, num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers, batch_first=True)
        self.output = nn.Linear(embed_dim, vocab_size)

    def forward(self, pair):
        src_embeds = self.pos_embed(self.embed(pair.src_token_ids))
        tgt_embeds = self.pos_embed(self.embed(pair.tgt_token_ids))
        src_pad_mask = ~pair.src_mask.bool()
        tgt_pad_mask = ~pair.tgt_mask.bool()
        size = [pair.tgt_token_ids.size(1)] * 2
        full_mask = torch.full(size, True, device=tgt_pad_mask.device)
        causal_mask = torch.triu(full_mask, diagonal=1)
        out_decoder = self.transformer(src_embeds, tgt_embeds,
                                       src_key_padding_mask=src_pad_mask,
                                       memory_key_padding_mask=src_pad_mask,
                                       tgt_mask=causal_mask, tgt_is_causal=True,
                                       tgt_key_padding_mask=tgt_pad_mask)
        return self.output(out_decoder).permute(0, 2, 1)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Preparing data

In [10]:
import requests 

def download_data(url, save_path, chunk_size=128):
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an exception for bad status codes

    with open(save_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Successfully downloaded {save_path}")

download_data("https://download.pytorch.org/tutorial/data.zip", "data.zip")

from zipfile import ZipFile

extract_to_path = "/kaggle/working/dataset"

with ZipFile("/kaggle/working/data.zip", 'r') as zip_object:
    zip_object.extractall(path=extract_to_path)

Successfully downloaded data.zip


In [11]:
import pandas as pd

df = pd.read_csv("/kaggle/working/dataset/data/eng-fra.txt", sep='\t')
print(df)
print(df.head())
print(df.describe())

                                                      Go.  \
0                                                    Run!   
1                                                    Run!   
2                                                    Wow!   
3                                                   Fire!   
4                                                   Help!   
...                                                   ...   
135836  A carbon footprint is the amount of carbon dio...   
135837  Death is something that we're often discourage...   
135838  Since there are usually multiple websites on a...   
135839  If someone who doesn't know your background sa...   
135840  It may be impossible to get a completely error...   

                                                     Va !  
0                                                 Cours !  
1                                                Courez !  
2                                              Ça alors !  
3                          

In [12]:
import tokenizers

def train_eng_fra():  # a generator function to iterate over all training text
    for index, row in df.iterrows():
        yield row.iloc[0]
        yield row.iloc[1]

max_length = 500
vocab_size = 10_000
nmt_tokenizer_model = tokenizers.models.BPE(unk_token="<unk>")
nmt_tokenizer = tokenizers.Tokenizer(nmt_tokenizer_model)
nmt_tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
nmt_tokenizer.enable_truncation(max_length=max_length)
nmt_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
nmt_tokenizer_trainer = tokenizers.trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
nmt_tokenizer.train_from_iterator(train_eng_fra(), nmt_tokenizer_trainer)






In [14]:
print(nmt_tokenizer.decode(nmt_tokenizer.encode("How are you?").ids))

How are you ?


In [47]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src_text = self.df.iloc[idx, 0]
        tgt_text = self.df.iloc[idx, 1]
        tgt_text = f"<s> {tgt_text} </s>"
        return src_text, tgt_text
        # tgt_text = f"<s> {tgt_text} </s>"

        # src_encodings = nmt_tokenizer.encode(src_text)
        # tgt_encodings = nmt_tokenizer.encode(tgt_text)
        # inputs = NmtPair(torch.tensor(src_encodings.ids), torch.tensor(src_encodings.attention_mask), 
        #                  torch.tensor(tgt_encodings.ids[:-1]), torch.tensor(tgt_encodings.attention_mask[:-1]))
        # labels = torch.tensor(tgt_encodings.ids[1:])
        # return inputs, labels

train_dataset = TextDataset(df.iloc[:100000, :])
valid_dataset = TextDataset(df.iloc[100000:120000, :])
test_dataset = TextDataset(df.iloc[120000:, :])

In [49]:
from torch.utils.data import DataLoader

batch_size = 32

from collections import namedtuple

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]
class NmtPair(namedtuple("NmtPairBase", fields)):
    def to(self, device):
        return NmtPair(self.src_token_ids.to(device), self.src_mask.to(device),
                       self.tgt_token_ids.to(device), self.tgt_mask.to(device))

def collate_batch(batch, tokenizer = nmt_tokenizer):
    src_text_batch, tgt_text_batch = [], []
    for src_text, tgt_text in batch:
        src_text_batch.append(src_text)
        tgt_text_batch.append(tgt_text)
    src_batch_encodings = tokenizer.encode_batch(src_text_batch)
    tgt_batch_encodings = tokenizer.encode_batch(tgt_text_batch)

    src_ids = [encoding.ids for encoding in src_batch_encodings]
    tgt_ids = [encoding.ids[:-1] for encoding in tgt_batch_encodings]
    src_mask = [encoding.attention_mask for encoding in src_batch_encodings]
    tgt_mask = [encoding.attention_mask[:-1] for encoding in tgt_batch_encodings]
    label_ids = [encoding.ids[1:] for encoding in tgt_batch_encodings]
    
    inputs = NmtPair(
        torch.tensor(src_ids), 
        torch.tensor(src_mask), 
        torch.tensor(tgt_ids), 
        torch.tensor(tgt_mask)
    )
    labels = torch.tensor(label_ids)
    return inputs, labels

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_batch)
print(len(train_dataloader))
print(len(valid_dataloader))
print(len(test_dataloader))
# for data in train_dataloader:
#     print(data)
#     break

3125
625
496


# Training model

In [50]:
def train_epoch(dataloader, model, optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, label_tensor = data
        input_tensor, label_tensor = input_tensor.to(device), label_tensor.to(device)
        
        optimizer.zero_grad()
        
        pred = model(input_tensor)
        
        loss = criterion(pred, label_tensor)
        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [51]:
from torch.optim import Adam

nmt_tr_model = NmtTransformer(vocab_size, max_length, embed_dim=128, pad_id=0, num_heads=4, num_layers=2, dropout=0.1).to(device)
# Cross Entropy
# Input = (Batch_size,vocab_size, Seq_length)
# Label = (Batch_size, Seq_Length)
xentropy = nn.CrossEntropyLoss()
optimizer = Adam(nmt_tr_model.parameters(), lr = 0.001)

In [52]:
def train(train_dataloader, model, n_epochs, criterion, learning_rate=0.001):
    print_loss_total = 0  # Reset every print_every
    nmt_tr_model.train()
    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, model, optimizer, criterion)
        print_loss_total += loss

        if epoch % 2 == 0:
            print_loss_avg = print_loss_total / 2
            print_loss_total = 0
            print('(%d %d%%) %.4f' % (epoch, epoch / n_epochs * 100, print_loss_avg))

train(train_dataloader, nmt_tr_model, 20, xentropy)

(2 10%) 1.5056
(4 20%) 0.8834
(6 30%) 0.7384
(8 40%) 0.6612
(10 50%) 0.6084
(12 60%) 0.5731
(14 70%) 0.5437
(16 80%) 0.5203
(18 90%) 0.5014
(20 100%) 0.4845


# Evaluation

In [56]:
def evaluate(model, sentence, tokenizer, max_length = 50):
    model.eval()

    src_encodings = tokenizer.encode(sentence)

    tgt_sentence = f"<s>"
    index = 0
    with torch.no_grad():
        for index in range(max_length):
            tgt_encodings = tokenizer.encode(tgt_sentence)
            inputs = NmtPair(
                torch.tensor(torch.tensor(src_encodings.ids).view(1, -1)), 
                torch.tensor(torch.tensor(src_encodings.attention_mask).view(1, -1)), 
                torch.tensor(torch.tensor(tgt_encodings.ids).view(1, -1)), 
                torch.tensor(torch.tensor(tgt_encodings.attention_mask).view(1, -1))
            )

            pred = model(inputs.to(device))
            pred_token_ids = pred.argmax(dim=1)  # find the best token IDs
            next_token_id = pred_token_ids[0, index]  # take the last token ID

            next_token = tokenizer.id_to_token(next_token_id)
            tgt_sentence += " " + next_token
            if next_token_id == 3:
                break

    return tgt_sentence

print(evaluate(nmt_tr_model, "I am good", nmt_tokenizer))

<s> Je suis bon . </s>


  torch.tensor(torch.tensor(src_encodings.ids).view(1, -1)),
  torch.tensor(torch.tensor(src_encodings.attention_mask).view(1, -1)),
  torch.tensor(torch.tensor(tgt_encodings.ids).view(1, -1)),
  torch.tensor(torch.tensor(tgt_encodings.attention_mask).view(1, -1))
