In [1]:
import torch
from torch import nn
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from datasets import load_dataset
from evaluate import load, combine
from tqdm import tqdm

from transformer import *
from dataset import *

from numba import cuda
device = cuda.get_current_device()
device.reset() 

In [2]:
# get data 
train = load_dataset('csv', data_files='../data/train.csv')['train']
test = load_dataset('csv', data_files='../data/test.csv')['train']
val = test[:200] # random choice for val loss

obolo_tokenizer = PreTrainedTokenizerFast(tokenizer_file='../tokenizers/obolo-bpe-tokenizer.json', padding='left')
english_tokenizer = AutoTokenizer.from_pretrained('gpt2', padding='left')

print(obolo_tokenizer.vocab_size)
print(english_tokenizer.vocab_size)
print(DEVICE)

token_transform = {}
vocab_transform = {}

token_transform[SRC_LANGUAGE] = obolo_tokenizer
token_transform[TGT_LANGUAGE] = english_tokenizer

vocab_transform[SRC_LANGUAGE] = obolo_tokenizer.vocab 
vocab_transform[TGT_LANGUAGE] = english_tokenizer.vocab
# for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
#   vocab_transform[ln].set_default_index(UNK_IDX)
init_text_transform(token_transform, vocab_transform)

# now this BPE tokenizer is also equipped with a decoder, so we should be able to do Obolo -> English and English -> Obolo

15866
50257
cuda


In [3]:
# ob_sent, en_sent = train['Obolo'][0], train['English'][0]
# print(ob_sent)
# print(obolo_tokenizer(ob_sent))
# print(text_transform[SRC_LANGUAGE](ob_sent))
# print(obolo_tokenizer(ob_sent)['input_ids'] == list(text_transform[SRC_LANGUAGE](ob_sent))[1:-1])
# print(en_sent)
# print(text_transform[TGT_LANGUAGE](en_sent))

In [4]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = obolo_tokenizer.vocab_size
TGT_VOCAB_SIZE = english_tokenizer.vocab_size
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 64
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

train_dataloader = generate_dataloader(train['Obolo'], train['English'], BATCH_SIZE)
val_dataloader = generate_dataloader(val['Obolo'], val['English'], BATCH_SIZE)
test_dataloader = generate_dataloader(test['Obolo'], test['English'], BATCH_SIZE)

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)



In [5]:
display(transformer)
display(optimizer)

Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerDecoderLayer(
          (self_attn): MultiheadAttent

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-09
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 0
)

In [6]:
def train_epoch(model, optimizer):
    model.train()
    losses = 0

    for src, tgt in tqdm(train_dataloader):
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

        # save vram 
        del src, tgt
        torch.cuda.empty_cache()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

        # save vram 
        del src, tgt
        torch.cuda.empty_cache()

    return losses / len(list(val_dataloader))

In [7]:
from timeit import default_timer as timer

NUM_EPOCHS = 10

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
100%|██████████| 438/438 [04:18<00:00,  1.70it/s]


Epoch: 1, Train loss: 5.804, Val loss: 4.493, Epoch time = 267.799s


100%|██████████| 438/438 [04:23<00:00,  1.66it/s]


Epoch: 2, Train loss: 4.204, Val loss: 3.918, Epoch time = 272.067s


100%|██████████| 438/438 [04:21<00:00,  1.68it/s]


Epoch: 3, Train loss: 3.761, Val loss: 3.565, Epoch time = 270.154s


100%|██████████| 438/438 [04:23<00:00,  1.66it/s]


Epoch: 4, Train loss: 3.459, Val loss: 3.324, Epoch time = 272.561s


100%|██████████| 438/438 [04:19<00:00,  1.69it/s]


Epoch: 5, Train loss: 3.221, Val loss: 3.140, Epoch time = 268.653s


100%|██████████| 438/438 [04:20<00:00,  1.68it/s]


Epoch: 6, Train loss: 3.024, Val loss: 3.013, Epoch time = 269.985s


100%|██████████| 438/438 [04:21<00:00,  1.67it/s]


Epoch: 7, Train loss: 2.855, Val loss: 2.917, Epoch time = 270.769s


100%|██████████| 438/438 [04:24<00:00,  1.66it/s]


Epoch: 8, Train loss: 2.708, Val loss: 2.841, Epoch time = 273.343s


100%|██████████| 438/438 [04:25<00:00,  1.65it/s]


Epoch: 9, Train loss: 2.577, Val loss: 2.777, Epoch time = 274.892s


100%|██████████| 438/438 [04:23<00:00,  1.66it/s]


Epoch: 10, Train loss: 2.458, Val loss: 2.730, Epoch time = 272.832s


In [9]:
torch.save(transformer.state_dict(), 'transformer_obolo_to_english_bpe_dict.pt')

In [10]:
torch.save(transformer, 'transformer_obolo_to_english_bpe.pt')

In [48]:
m=torch.load('transformer_obolo_to_english_bpe.pt')
m

Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerDecoderLayer(
          (self_attn): MultiheadAttent

In [46]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        # print(out)
        # print(english_tokenizer.decode(out))
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        # print(next_word)
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        # save vram
        del tgt_mask
        torch.cuda.empty_cache()
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    
    return token_transform[TGT_LANGUAGE].decode(tgt_tokens[1:-1])
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("[CLS]", "").replace("[SEP]", "")


In [84]:
src_sentence = "ire, emi okumugwem mâtap oke me etete anam ebi ijeren mè echi ebi ijipiti. anam geege me etete echi ebi ijeren ìkpokwu"
src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
print(list(src.numpy().flatten()), src.shape)
print(token_transform[SRC_LANGUAGE].decode(src.numpy().flatten()))

[1, 136, 8, 144, 153, 249, 81, 303, 1950, 56, 340, 347, 75, 235, 56, 55, 291, 75, 463, 10, 347, 270, 56, 340, 291, 75, 235, 49, 55, 4213, 2] torch.Size([31, 1])
[CLS] ire, emi okumugwem mâtap oke me etete anam ebi ijeren mè echi ebi ijipiti. anam geege me etete echi ebi ijeren ìkpokwu[SEP]


MAKE TRANSLATE ACTUALLY WORK !!!

In [51]:
preds = []
refs = []
for idx in tqdm(range(len(test))):
    ob, en = test['Obolo'][idx], test['English'][idx]
    refs.append(en)
    pred = translate(m, ob)
    preds.append(pred)

  0%|          | 0/3110 [00:00<?, ?it/s]

100%|██████████| 3110/3110 [10:14<00:00,  5.06it/s]


In [54]:
print(len(preds), len(refs))

3110 3110


In [55]:
# metrics
chrf = load('chrf')
gleu = load('google_bleu')
rouge = load('rouge') 
bleu = load('bleu')
meteor = load('meteor')
metrics = combine([chrf, bleu, rouge, meteor, gleu])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhiv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhiv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\abhiv\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [56]:
scores = metrics.compute(predictions=preds, references=refs)
scores

{'score': 33.04091540787891,
 'char_order': 6,
 'word_order': 0,
 'beta': 2,
 'bleu': 0.10531583607718822,
 'precisions': [0.39940596433941206,
  0.15199285856734504,
  0.06464365746494309,
  0.0313480889373341],
 'brevity_penalty': 1.0,
 'length_ratio': 1.1160628802618981,
 'translation_length': 108411,
 'reference_length': 97137,
 'rouge1': 0.3765356456692016,
 'rouge2': 0.13864376297768582,
 'rougeL': 0.31799077304173684,
 'rougeLsum': 0.3181688900030759,
 'meteor': 0.3184619515531625,
 'google_bleu': 0.1572599755737436}