In [62]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Creating Model

In [63]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [64]:
# import torch.nn as nn

# class Attention(nn.Module):
#     def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
#         super().__init__()
#         self.attn_fc = nn.Linear((encoder_hidden_dim * 2) + decoder_hidden_dim, decoder_hidden_dim)
#         self.v_fc = nn.Linear(decoder_hidden_dim, 1, bias=False)

#     def forward(self, decoder_hidden, encoder_outputs):
#         # hidden = [batch size, decoder hidden dim]
#         # encoder_outputs = [src length, batch size, encoder hidden dim * 2]
#         batch_size = encoder_outputs.shape[1]
#         src_length = encoder_outputs.shape[0]
#         # repeat decoder hidden state src_length times
#         hidden = decoder_hidden.unsqueeze(1).repeat(1, src_length, 1)
#         encoder_outputs = encoder_outputs.permute(1, 0, 2)
#         # hidden = [batch size, src length, decoder hidden dim]
#         # encoder_outputs = [batch size, src length, encoder hidden dim * 2]
#         energy = torch.tanh(self.attn_fc(torch.cat((hidden, encoder_outputs), dim=2)))
#         # energy = [batch size, src length, decoder hidden dim]
#         attention = self.v_fc(energy).squeeze(2)
#         # attention = [batch size, src length]
#         return torch.softmax(attention, dim=1)

In [78]:
class EncoderDecoderRNNAttention(nn.Module):
    def __init__(self, vocab_size, encoder_embed_size, encoder_hidden_size, encoder_dropout_p, decoder_embed_size, decoder_hidden_size):
        super(EncoderDecoderRNNAttention, self).__init__()
        self.encoder_embedding = nn.Embedding(vocab_size, encoder_embed_size)
        self.encoder_rnn = nn.GRU(encoder_embed_size, encoder_hidden_size, batch_first=True) 
        self.dropout = nn.Dropout(encoder_dropout_p)

        self.decoder_embedding = nn.Embedding(vocab_size, decoder_embed_size)
        self.decoder_rnn = nn.GRU(decoder_embed_size, decoder_hidden_size, batch_first=True)
        self.decoder_full = nn.Linear(2 * decoder_hidden_size, vocab_size)
        
    def forward(self, src, target):
        encoder_output = self.encoder_embedding(src)
        encoder_output = self.dropout(encoder_output)
        encoder_outputs, encoder_hidden = self.encoder_rnn(encoder_output) #output = (N,Seq_Length,H_enc), hidden = (1,N,H_enc)
            
        decoder_output = self.decoder_embedding(target)
        decoder_outputs, _ = self.decoder_rnn(decoder_output, encoder_hidden)

        attn_output = self.attention(query=decoder_outputs, key=encoder_outputs, value=encoder_outputs)
        combined_output = torch.cat((attn_output, decoder_outputs), dim=-1)
        
        return self.decoder_full(combined_output).permute(0, 2, 1)

    def attention(self, query, key, value):
        scores = query @ key.transpose(1, 2)  # [B, Lq, D] @ [B, D, Lk]= [B, Lq, Lk]
        weights = torch.softmax(scores, dim=-1)  # [B, Lq, Lk]
        return weights @ value  # [B, Lq, Lk] @ [B, Lk, Dv] = [B, Lq, Dv]
        

In [66]:
# Cross Entropy
# Input = (Batch_size,vocab_size, Seq_length)
# Label = (Batch_size, Seq_Length)
xentropy = nn.CrossEntropyLoss()

# Preparing dataset

In [67]:
from torch.utils.data import DataLoader

batch_size = 32

In [68]:
import requests 

def download_data(url, save_path, chunk_size=128):
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an exception for bad status codes

    with open(save_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Successfully downloaded {save_path}")

download_data("https://download.pytorch.org/tutorial/data.zip", "data.zip")

Successfully downloaded data.zip


In [69]:
from zipfile import ZipFile

extract_to_path = "/kaggle/working/datase_final"

with ZipFile("/kaggle/working/data.zip", 'r') as zip_object:
    zip_object.extractall(path=extract_to_path)

In [70]:
import pandas as pd

df = pd.read_csv("/kaggle/working/datase_final/data/eng-fra.txt", sep='\t')
print(df)
print(df.head())
print(df.describe())

                                                      Go.  \
0                                                    Run!   
1                                                    Run!   
2                                                    Wow!   
3                                                   Fire!   
4                                                   Help!   
...                                                   ...   
135836  A carbon footprint is the amount of carbon dio...   
135837  Death is something that we're often discourage...   
135838  Since there are usually multiple websites on a...   
135839  If someone who doesn't know your background sa...   
135840  It may be impossible to get a completely error...   

                                                     Va !  
0                                                 Cours !  
1                                                Courez !  
2                                              Ça alors !  
3                          

In [71]:
import tokenizers

def train_eng_fra():  # a generator function to iterate over all training text
    for index, row in df.iterrows():
        yield row.iloc[0]
        yield row.iloc[1]

max_length = 500
vocab_size = 10_000
nmt_tokenizer_model = tokenizers.models.BPE(unk_token="<unk>")
nmt_tokenizer = tokenizers.Tokenizer(nmt_tokenizer_model)
nmt_tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
nmt_tokenizer.enable_truncation(max_length=max_length)
nmt_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
nmt_tokenizer_trainer = tokenizers.trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
nmt_tokenizer.train_from_iterator(train_eng_fra(), nmt_tokenizer_trainer)






In [72]:
print(nmt_tokenizer.decode(nmt_tokenizer.encode("I like soccer").ids))

I like soccer


In [73]:
from torch.utils.data import Dataset
from collections import namedtuple

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]
class NmtPair(namedtuple("NmtPairBase", fields)):
    def to(self, device):
        return NmtPair(self.src_token_ids.to(device), self.src_mask.to(device),
                       self.tgt_token_ids.to(device), self.tgt_mask.to(device))

seq_length = 40

class TextDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src_text = self.df.iloc[idx, 0]
        tgt_text = self.df.iloc[idx, 1]
        tgt_text = f"<s> {tgt_text} </s>"

        src_encodings = nmt_tokenizer.encode(src_text)
        tgt_encodings = nmt_tokenizer.encode(tgt_text)
        inputs = NmtPair(torch.tensor(src_encodings.ids), torch.tensor(src_encodings.attention_mask), 
                         torch.tensor(tgt_encodings.ids[:-1]), torch.tensor(tgt_encodings.attention_mask[:-1]))
        labels = torch.tensor(tgt_encodings.ids[1:])
        return inputs, labels
    

train_data = TextDataset(df.iloc[:100000, :])
valid_data = TextDataset(df.iloc[100000:120000, :])
test_data = TextDataset(df.iloc[120000:, :])

In [74]:
def collate_batch(batch):
    src_ids, src_mask, tgt_ids, tgt_mask, label_ids = [], [], [], [], []
    for _text, _label in batch:
        src_ids.append(_text[0])
        src_mask.append(_text[1])
        tgt_ids.append(_text[2])
        tgt_mask.append(_text[3])
        label_ids.append(_label)
    label_ids_tensor = nn.utils.rnn.pad_sequence(label_ids, batch_first=True)
    src_ids_tensor = nn.utils.rnn.pad_sequence(src_ids, batch_first=True)
    src_mask_tensor = nn.utils.rnn.pad_sequence(src_mask, batch_first=True)
    tgt_ids_tensor = nn.utils.rnn.pad_sequence(tgt_ids, batch_first=True)
    tgt_mask_tensor = nn.utils.rnn.pad_sequence(tgt_mask, batch_first=True)
    inputs = NmtPair(src_ids_tensor, src_mask_tensor, tgt_ids_tensor, tgt_mask_tensor)
    return inputs, label_ids_tensor

In [75]:
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size, collate_fn=collate_batch)
test_dataloader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_batch)
print(len(train_dataloader))
print(len(valid_dataloader))
print(len(test_dataloader))

3125
625
496


# Training

In [79]:
from torch.optim import Adam

def train_epoch(dataloader, model, optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, label_tensor = data
        input_tensor, label_tensor = input_tensor.to(device), label_tensor.to(device)

        # print(f"src {input_tensor[0].shape}, tgt  {input_tensor[2].shape} label {label_tensor.shape}")

        optimizer.zero_grad()
        
        pred = model(input_tensor[0], input_tensor[2])
        # print(pred.shape)
        
        loss = criterion(pred, label_tensor)
        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

model = EncoderDecoderRNNAttention(vocab_size, 256, 256, 0.1, 256, 256)
model.to(device)

EncoderDecoderRNNAttention(
  (encoder_embedding): Embedding(10000, 256)
  (encoder_rnn): GRU(256, 256, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (decoder_embedding): Embedding(10000, 256)
  (decoder_rnn): GRU(256, 256, batch_first=True)
  (decoder_full): Linear(in_features=512, out_features=10000, bias=True)
)

In [80]:
def train(train_dataloader, model, n_epochs, criterion, learning_rate=0.001):
    print_loss_total = 0  # Reset every print_every

    optimizer = Adam(model.parameters(), lr = learning_rate)

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, model, optimizer, criterion)
        print_loss_total += loss

        if epoch % 2 == 0:
            print_loss_avg = print_loss_total / 2
            print_loss_total = 0
            print('(%d %d%%) %.4f' % (epoch, epoch / n_epochs * 100, print_loss_avg))


train(train_dataloader, model, 10, xentropy)

(2 20%) 1.2371
(4 40%) 0.6497
(6 60%) 0.5106
(8 80%) 0.4393
(10 100%) 0.3980


# Evaluation

In [89]:
def evaluate(model, sentence, tokenizer, max_length = 50):
    model.eval()

    tgt_sentence = f"<s>"
    index = 0
    with torch.no_grad():
        for index in range(max_length):
            src = torch.tensor(tokenizer.encode(sentence).ids, device=device).view(1, -1)
            tgt = torch.tensor(tokenizer.encode(tgt_sentence).ids, device=device).view(1, -1)

            pred = model(src, tgt)

            pred_token_ids = pred.argmax(dim=1)  # find the best token IDs
            next_token_id = pred_token_ids[0, index]  # take the last token ID

            next_token = tokenizer.id_to_token(next_token_id)
            print(next_token)
            tgt_sentence += " " + next_token
            if next_token_id == 3:
                break

    return tgt_sentence

print(evaluate(model, "beautiful", nmt_tokenizer))

beau
coucher
ou
bien
.
</s>
<s> beau coucher ou bien . </s>
