In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# !gdown --fuzzy "https://drive.google.com/file/d/12ycYSzLIG253AFN35Y6qoyf9wtkOjakp/view?usp=sharing"

In [3]:
# !tar -zxvf "./2017-01-trnmted.tgz"
# !tar -zxvf "./2017-01-trnmted/texts/DeEnItNlRo/DeEnItNlRo/DeEnItNlRo-DeEnItNlRo.tgz"
# !mv ./DeEnItNlRo-DeEnItNlRo ./texts
# !mkdir ./clean_texts
# !mkdir ./tokenizers
# !rm "./2017-01-trnmted.tgz"
# !rm -rf "./2017-01-trnmted"

In [2]:
import data
from pathlib import Path

# data.convert_files(Path('./texts'), Path('./clean_texts'))
# data.train_tokenizers(Path('./clean_texts'), Path('./tokenizers'))

In [3]:
from train_model import train_model, train_epoch, translate_test_set

# train_model("./clean_texts", "./tokenizers", 5)

In [4]:
from pathlib import Path

import torch
from sacrebleu.metrics import BLEU
from tokenizers import Tokenizer
from tqdm.auto import trange, tqdm

from data import TranslationDataset
from decoding import translate, get_attn_mask
from model import TranslationModel

import wandb

In [5]:
data_dir = Path("./clean_texts")
tokenizer_path = Path("./tokenizers")
src_tokenizer = Tokenizer.from_file(str(tokenizer_path / "tokenizer_de.json"))
tgt_tokenizer = Tokenizer.from_file(str(tokenizer_path / "tokenizer_en.json"))

In [6]:
config = {
    "batch_size" : 6,
    "lr" : 3e-4,
    "max_len" : 128,  # might be enough at first
    "num_encoder_layers" : 2,
    "num_decoder_layers" : 2,
    "emb_size" : 256,
    "dim_feedforward" : 512,
    "n_head" : 8,
    "dropout_prob" : 0.1,
}


# train_dataset = TranslationDataset(
#     data_dir / "train.de.txt",
#     data_dir / "train.en.txt",
#     src_tokenizer,
#     tgt_tokenizer,
#     max_len=128 #config["max_len"],
# )

# train_dataloader = torch.utils.data.DataLoader(
#     train_dataset,
#     batch_size=6,
#     collate_fn = train_dataset.collate_translation_data,
#     # shuffle=True,
# )

In [7]:
val_dataset = TranslationDataset(
    data_dir / "val.de.txt",
    data_dir / "val.en.txt",
    src_tokenizer,
    tgt_tokenizer,
    max_len=config["max_len"],
)


val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    config["batch_size"],
    collate_fn = val_dataset.collate_translation_data,
)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)

src_pad_id = tgt_tokenizer.token_to_id("[PAD]")
tgt_pad_id = tgt_tokenizer.token_to_id("[PAD]")

model = TranslationModel(
    config["num_encoder_layers"],
    config["num_decoder_layers"],
    config["emb_size"],
    config["dim_feedforward"],
    config["n_head"],
    src_tokenizer.get_vocab_size(),
    tgt_tokenizer.get_vocab_size(),
    config["dropout_prob"],
    src_pad_id,
    tgt_pad_id,
    config["max_len"]
)

model.load_state_dict(torch.load("checkpoint_last.pth")["model_state_dict"])


print("Total no. of model parameters:",
    "pytorch_total_params =", sum(p.numel() for p in model.parameters())
)
model.to(device);

Total no. of model parameters: pytorch_total_params = 25706800


In [9]:
from train_model import train_epoch, evaluate, translate_test_set

# num_epochs=1
# optimizer = torch.optim.Adam(model.parameters(), config["lr"])
# scheduler = torch.optim.lr_scheduler.OneCycleLR(
#     optimizer,
#     config["lr"],
#     steps_per_epoch=len(train_dataloader),
#     epochs=num_epochs,
#     pct_start=0.1
# )
CELoss = torch.nn.CrossEntropyLoss(ignore_index=tgt_pad_id)

min_val_loss = float("inf")

# for epoch in trange(1, num_epochs + 1):
#     val_loss = evaluate(model, train_dataloader, CELoss, device, src_tokenizer, tgt_tokenizer)

In [24]:
@torch.inference_mode()
def evaluate(
    model: TranslationModel,
    val_dataloader,
    CELoss,
    device,
    src_tokenizer,
    tgt_tokenizer,
    logger=None
    ):
    # compute the loss over the entire validation subset
    model.eval()
    model.to(device)

    src_pad_id = src_tokenizer.token_to_id("[PAD]")
    tgt_pad_id = tgt_tokenizer.token_to_id("[PAD]")
    tgt_vocab_size = tgt_tokenizer.get_vocab_size()

    total_loss = 0
    total_size = 0

    for i, batch in enumerate(val_dataloader):
        #getting data 
        src = batch["src"].to(device)
        tgt = batch["tgt"].to(device)
        bs, tgt_len = tgt.shape

        # building masks
        tgt_attn_mask = get_attn_mask(tgt_len-1).to(device)
        src_pad_mask = (src == src_pad_id).to(device)
        tgt_pad_mask = (tgt[:,:-1] == tgt_pad_id).to(device)

        # forward, give target except ["EOS"]
        out = model(tgt[:,:-1], src, tgt_attn_mask, src_pad_mask, tgt_pad_mask)
        if i == 0:
            print('-'*50)
            print(tgt_attn_mask)
            print(tgt_attn_mask.shape)
        else:
            print(i)
            # print(tgt_tokenizer.id_to_token(tgt[0,0]))
            # print(src_tokenizer.decode(src[0].tolist()))
            # print(tgt_tokenizer.decode(out[0].argmax(dim=-1).tolist()))
        # compare to target except ["BOS"]
        out = out.reshape(bs * (tgt_len-1), tgt_vocab_size)
        loss = CELoss(out, tgt[:,1:].reshape(-1))

        # loss calc
        total_loss += loss.cpu().item() * bs
        total_size += bs

    return total_loss / total_size

In [48]:
def get_attn_mask(L):
    inf_mask = torch.triu(torch.ones(L, L, dtype=bool), diagonal=1)
    res = torch.zeros(L, L).masked_fill(inf_mask, -float("inf"))
    return res

In [50]:
get_attn_mask(5)[0]

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])


tensor([0., -inf, -inf, -inf, -inf])

In [43]:
sz = 5
mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])

In [25]:
for epoch in trange(1):
    val_loss = evaluate(model, val_dataloader, CELoss, device, src_tokenizer, tgt_tokenizer, None)
    print(val_loss)

  0%|          | 0/1 [00:00<?, ?it/s]

--------------------------------------------------
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
torch.Size([42, 42])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
0.15737174154954217


In [32]:
src_tokenizer.token_to_id("[PAD]")

1

In [60]:
translate_test_set(model, data_dir, tokenizer_path)

[BOS]
HERE:
[BOS]
tensor([[[-0.0017, -0.0037,  0.0127,  ...,  0.0028, -0.0017,  0.0009],
         [-0.0021, -0.0037,  0.0132,  ...,  0.0028, -0.0017,  0.0010],
         [-0.0033, -0.0040,  0.0104,  ...,  0.0029, -0.0014,  0.0010]],

        [[-0.0017, -0.0037,  0.0127,  ...,  0.0032, -0.0016,  0.0009],
         [-0.0008, -0.0036,  0.0111,  ...,  0.0040, -0.0015,  0.0010],
         [-0.0038, -0.0041,  0.0087,  ...,  0.0023, -0.0014,  0.0010]],

        [[-0.0016, -0.0037,  0.0127,  ...,  0.0027, -0.0017,  0.0010],
         [-0.0018, -0.0037,  0.0130,  ...,  0.0028, -0.0017,  0.0009],
         [-0.0036, -0.0040,  0.0097,  ...,  0.0024, -0.0013,  0.0009]],

        ...,

        [[-0.0016, -0.0037,  0.0127,  ...,  0.0027, -0.0017,  0.0010],
         [-0.0018, -0.0037,  0.0130,  ...,  0.0028, -0.0017,  0.0009],
         [-0.0036, -0.0040,  0.0097,  ...,  0.0024, -0.0013,  0.0009]],

        [[-0.0025, -0.0035,  0.0096,  ...,  0.0015, -0.0016,  0.0007],
         [-0.0011, -0.0036,  0.0023, 

RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [19]:
from sacremoses import MosesDetokenizer, MosesPunctNormalizer

In [46]:
# src_tokenizer.token_to_id("[PAD]")
tgt_tokenizer.token_to_id("[PAD]")

1

In [39]:
decoded.split()

['W', 'hat', 'are', 'you', 'do', 'ing', '?']

In [83]:
text = "Hi Madina"
src_tokenizer.encode(text).tokens

['[BOS]', 'Hi', 'Mad', 'ina', '[EOS]']

In [102]:

from tokenizers.processors import TemplateProcessing

src_tokenizer.post_processor = TemplateProcessing(
    single="[EOS] $A [BOS]",
    pair="[EOS] $A [UNK] $B:1 [BOS]:1",
    special_tokens=[
        ("[EOS]", src_tokenizer.token_to_id("[EOS]")),
        ("[BOS]", src_tokenizer.token_to_id("[BOS]")),
        ("[UNK]", src_tokenizer.token_to_id("[UNK]"))
    ],
)

In [134]:
batch_encs = tgt_tokenizer.encode_batch(["Hello, Marina", "Hi Dima!"])

In [135]:
batch_ids = [a.ids for a in batch_encs]
batch_ids

[[2, 6419, 15, 2661, 2791, 3], [2, 4991, 37, 10811, 4, 3]]

In [142]:
decoded = tgt_tokenizer.decode_batch(torch.Tensor(batch_ids).long().tolist())

In [151]:
model

NameError: name 'model' is not defined

In [None]:
translate_test_set(model: TranslationModel, data_dir, tokenizer_path)

In [93]:
detok = MosesDetokenizer()
mpn = MosesPunctNormalizer()

text = "Hello, World!"
encoded = src_tokenizer.encode(text).ids
decoded = src_tokenizer.decode(encoded)
detokenized = detok.detokenize(decoded.split())
normalized = mpn.normalize(decoded)
print(decoded)
print(detokenized)
print(normalized)

Hell o , World !
Hell o, World!
Hell o , World !


In [91]:
doubled = [decoded, decoded]
detok.detokenize(doubled)

'W hat are you do ing? W hat are you do ing?'

In [20]:
seqs = src_tokenizer.decode_batch([train_dataset[7][0].tolist(), train_dataset[8][0].tolist()])
# seqs = "WHAT , ARE YOU ' LL DOING ?"
print(seqs)
print(MosesDetokenizer().detokenize(seqs))
print(MosesPunctNormalizer().normalize(seqs))

['Ich erzähle Ihnen mal eine Geschichte , dann verstehen Sie mich vielleicht besser .', 'Eine wahre Geschichte -- kein Wort daran ist erfunden .']
Ich erzähle Ihnen mal eine Geschichte, dann verstehen Sie mich vielleicht besser. Eine wahre Geschichte -- kein Wort daran ist erfunden.
['Ich erzähle Ihnen mal eine Geschichte , dann verstehen Sie mich vielleicht besser .', 'Eine wahre Geschichte -- kein Wort daran ist erfunden .']


In [21]:
MosesPunctNormalizer().normalize(seqs)

"['Ich erzähle Ihnen mal eine Geschichte , dann verstehen Sie mich vielleicht besser .', 'Eine wahre Geschichte -- kein Wort daran ist erfunden .']"

In [11]:
seqs[0]

NameError: name 'seqs' is not defined

In [12]:
len(train_dataset)

0

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/12882 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [65]:
# src

In [69]:
for i in train_dataloader:
    for line in i["tgt"]:
        print(line)
    print(i["tgt"])
    break

tensor([[    2,   698,   175,   215,   467,    15,  2040,    17,     3,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1],
        [    2,   194,   159,    10,    82,  2691,    64,   589,  4596,   155,
           223,   147,  1869,   155,   451,   155,   197,  1899,  3318,    30,
            42,    10,    76,  2433,  5685,    17,     3,     1,     1,     1,
             1,     1,     1,     1],
        [    2,    42,   223,   426,  7699,   880,   325,   197,  3479,    15,
           163,    42,   358,   155,  2075,   209,   162,   175,   199,   147,
           516,  1936,  7305,   255,   246,    42,   336,   155,   399,   147,
           333,  1227,    17,     3],
        [    2,   194,    42,   399,   168,  1428,  4112,    15,  6879,   341,
            42,   442,   168,    17,     3,     1,     1,     1,     1,     1,
             1,  