In [1]:
import main, model
from dataset import SeqPairDataset
from model import EncoderDecoder
from tokenizer import Tokenizer

import torch
from torch.utils.data import DataLoader
import torch.nn as nn

from tqdm import tqdm

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


In [2]:
# Length cap handed to SeqPairDataset twice below (presumably the source and
# target limits -- confirm against dataset.SeqPairDataset).
MAX_LEN = 100
# Single training file: feeds both the vocabulary and the dataset.
TRAIN_JSON = "data/toy.json"

tokenizer = Tokenizer()
# The vocabulary / token-id tables are built from the file before the
# dataset is created with this tokenizer.
tokenizer.from_file(TRAIN_JSON)
tr_data = SeqPairDataset(TRAIN_JSON, tokenizer, MAX_LEN, MAX_LEN)



Success: vocabulary built!

Sentence tokenized!
src: ['"', 'ay', ',', 'that', "'", 's', 'as', 'plain', 'a', 'pike', '-', 'staff', ',"', 'say', 'barbara', ';', 'what', 'else', 'do', 'she', 'mean', ',', 'think', 'you', '?']
tgt: ['"', 'ay', ',', 'that', "'", 's', 'as', 'plain', 'as', 'a', 'pike', '-', 'staff', ',"', 'said', 'barbara', ';', '"', 'but', 'what', 'else', 'did', 'she', 'mean', ',', 'think', 'you', '?']


Special tokens added:
[274, 0, 1, 2, 3, 4, 5, 6, 7, 8]...

Sentence length after padding: 100
Padding tokens added:
...[276, 276, 276, 276, 276, 276, 276, 276, 276, 276]

Special tokens added:
[274, 0, 1, 2, 3, 4, 5, 6, 7, 6]...

Sentence length after padding: 100
Padding tokens added:
...[276, 276, 276, 276, 276, 276, 276, 276, 276, 276]

Decoder input last 3: tensor([276, 276, 276])
Target last 3       : [276, 276, 276]
Decoder input size  : 99

Labels first 3: tensor([0, 1, 2])
Target first 3: [274, 0, 1]
Labels size   : 99

New sample created:
    enc_inp size: 100    dec

In [3]:
# Mini-batches of 64 training pairs, reshuffled on every pass.
dataloader = DataLoader(tr_data, batch_size=64, shuffle=True)


In [4]:
# Size each side of the network from its own vocabulary. The two happen to be
# equal for this toy corpus, but nothing guarantees that for other data.
src_vocab_size = len(tokenizer.src_vocab)
tgt_vocab_size = len(tokenizer.tgt_vocab)
pad_id = tokenizer.pad_id

# Retained so any cell that still refers to VOCAB_SIZE keeps working.
VOCAB_SIZE = src_vocab_size

# NOTE(review): binding the name `model` here shadows the `model` module
# imported in the first cell; after this cell runs, `model` is the network.
model = EncoderDecoder(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    pad_idx=pad_id,
    max_len=MAX_LEN,
)


In [5]:

# Positions whose label equals the padding id are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)

# Adam over every model parameter with a fixed learning rate.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
)


In [6]:
# One call to the training helper; whatever it returns is kept as `loss`.
loss = main.train_epoch(model, dataloader, optimizer, loss_fn)


Training: 100%|██████████| 1/1 [00:00<00:00,  6.81it/s]

Batch 0 processed!    19 samples trained





In [7]:
# Evaluation pairs, encoded with the tokenizer created for the training data.
ts_data = SeqPairDataset("data/toy2.json", tokenizer, MAX_LEN, MAX_LEN)
# DataLoader defaults spelled out: one sample per batch, original order.
testloader = DataLoader(ts_data, batch_size=1, shuffle=False)



Success: vocabulary built!

Sentence tokenized!
src: ['you', 'gingerly', 'rascal', '!']
tgt: ['you', 'gingerly', 'rascal', '!']


Special tokens added:
[274, 22, 277, 277, 277, 275]...

Sentence length after padding: 100
Padding tokens added:
...[276, 276, 276, 276, 276, 276, 276, 276, 276, 276]

Special tokens added:
[274, 22, 277, 277, 277, 275]...

Sentence length after padding: 100
Padding tokens added:
...[276, 276, 276, 276, 276, 276, 276, 276, 276, 276]

Decoder input last 3: tensor([276, 276, 276])
Target last 3       : [276, 276, 276]
Decoder input size  : 99

Labels first 3: tensor([ 22, 277, 277])
Target first 3: [274, 22, 277]
Labels size   : 99

New sample created:
    enc_inp size: 100    dec_inp size: 99    labels size : 99
    enc_inp type: <class 'torch.Tensor'>
    dec_inp type: <class 'torch.Tensor'>
    labels  type: <class 'torch.Tensor'>
-----------------------------------------------


Sentence tokenized!
src: ['miss', 'steele', 'be', 'least', 'discomposed', 'of'

In [8]:
# Score the model on the evaluation loader. The result is stored in `bleu_avg`
# (presumably the mean BLEU over samples -- confirm in main.compute_bleu_score).
bleu_avg = main.compute_bleu_score(
    model,
    testloader,
    tokenizer,
)

BLEU Score Calculation:   4%|▍         | 1/25 [00:00<00:08,  3.00it/s]

Batch 0: sample output:PREDICTED: ['and', '<unk>', 'and', 'a', 'and', '<unk>', 'we', '<unk>', '<unk>', 'and', ';', '<unk>', 'engedi', '<unk>', '<unk>', '<unk>', 'and', '<unk>', 'and', '<unk>', '<unk>', 'and', '<unk>', 'and', ';', '<unk>', 'we', '<unk>', 'we', '<unk>', 'we', '<unk>', 'and', '<unk>', 'we', '<unk>', 'and', ';', 'by', ',', 'and', ';', '<unk>', 'and', ';', '<unk>', 'we', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'we', 'for', '<unk>', 'and', 'often', 'jared', '<unk>', '-', '<unk>', '<unk>', '<unk>', 'and', '<unk>', 'engedi', 'for', '<unk>', 'and', '<unk>', '<unk>', '<unk>', 'we', 'yours', '<unk>', 'and', '<unk>', 'we', '<unk>', 'if', '<unk>', 'we', '<unk>', '<unk>', '<unk>', 'for', '<unk>', 'and', ';', '<unk>', 'engedi', '<unk>', 'own', '<unk>', '<unk>', '<unk>', 'we', '<unk>', 'we']
ACTUAL:    ['you', '<unk>', '<unk>', '<unk>']




BLEU Score Calculation:  44%|████▍     | 11/25 [00:03<00:04,  3.06it/s]

Batch 10: sample output:PREDICTED: ['<unk>', '<unk>', 'and', '<unk>', 'and', '<unk>', 'engedi', '<unk>', '<unk>', 'and', '<unk>', 'enoch', 'somehow', 'unto', '<unk>', '<unk>', 'and', '<unk>', '<unk>', '<unk>', '<unk>', 'and', 's', '19', '<unk>', 'and', '<unk>', 'we', '<unk>', 'engedi', '<unk>', '<unk>', 'we', '<unk>', 'engedi', '<unk>', 'and', 'her', 'and', '<unk>', 'and', ';', '<unk>', 'we', '<unk>', 'engedi', 'you', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'we', '<unk>', 'for', '<unk>', 'engedi', '<unk>', 'for', 'writer', '<unk>', '<unk>', '<unk>', 'and', '<unk>', 'engedi', '<unk>', 'we', '<unk>', '<unk>', '<unk>', '<unk>', 'we', 'yours', '<unk>', 'and', '<unk>', 'we', '<unk>', 'if', '<unk>', 'we', '<unk>', '<unk>', '<unk>', 'after', '<unk>', 'and', ';', '<unk>', 'engedi', '<unk>', 'own', '<unk>', '<unk>', '<unk>', 'we', '<unk>', 'we']
ACTUAL:    ['<unk>', ':', '<unk>', '<unk>', '<unk>', '<unk>', 'the', '<unk>', 'one', 'of', '<unk>', ',', '<unk>', '<unk>', '<unk>', 'this', '<unk>

BLEU Score Calculation:  84%|████████▍ | 21/25 [00:06<00:01,  3.43it/s]

Batch 20: sample output:PREDICTED: ['and', '<unk>', 'and', 'a', 'and', '<unk>', 'we', '<unk>', 'we', 'for', '<unk>', 'enoch', 'somehow', 'unto', '<unk>', '<unk>', 'and', '<unk>', 'and', '<unk>', 'we', '<unk>', 'yours', '<unk>', 'and', 'son', ',', 'and', ';', '<unk>', 'we', '<unk>', 'we', '<unk>', 'we', '<unk>', 'and', ';', 'by', ',', 'and', ';', '<unk>', 'we', '<unk>', 'we', 'for', '<unk>', 'by', '<unk>', '<unk>', '<unk>', 'we', 'for', '<unk>', 'and', 'often', 'to', '<unk>', '-', '<unk>', '<unk>', '<unk>', 'and', '<unk>', 'engedi', 'for', '<unk>', 'if', '<unk>', '<unk>', 'we', '<unk>', 'we', '<unk>', 'and', '<unk>', 'we', '<unk>', 'if', '<unk>', 'we', '<unk>', '<unk>', '<unk>', 'for', '<unk>', 'and', ';', '<unk>', 'engedi', '<unk>', 'own', '<unk>', '<unk>', '<unk>', 'we', '<unk>', 'we']
ACTUAL:    ['"', '<unk>', '<unk>']




BLEU Score Calculation: 100%|██████████| 25/25 [00:07<00:00,  3.26it/s]


In [9]:
# Quick look at the special-token ids and the two vocabulary sizes.
for label, value in (
    ('bos_id:', tokenizer.bos_id),
    ('eos_id:', tokenizer.eos_id),
    ('src_vocab_size:', len(tokenizer.src_vocab)),
    ('tgt_vocab_size:', len(tokenizer.tgt_vocab)),
):
    print(label, value)

bos_id: 274
eos_id: 275
src_vocab_size: 278
tgt_vocab_size: 278


In [10]:
# Display the value returned by main.compute_bleu_score in the evaluation cell.
bleu_avg

0.14136969696969695

In [11]:
def count_parameters(model):
    """Return the total number of trainable scalar values in ``model``.

    Walks ``model.parameters()`` and adds up ``numel()`` for every parameter
    whose ``requires_grad`` flag is true; parameters with the flag off are
    skipped. A model with no parameters yields 0.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Count the trainable parameters of the `model` object currently bound in this
# notebook and print the total with thousands separators.
total_params = count_parameters(model)
print(f"Total Trainable Parameters: {total_params:,}")

Total Trainable Parameters: 1,058,838
