In [None]:
%pip install transformers[sentencepiece] datasets

In [None]:
%pip install torchtext

In [None]:
%pip install seaborn



In [None]:
%pip install wandb onnx -Uq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from tqdm import tqdm
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.vocab import vocab

from src.rnn import Encoder, Decoder, RnnAttentionSeq2Seq
from src.sampler import BaseDataset, EqualLengthsBatchSampler
from src.utils import Dict2Class, get_line_count, plot_attention_weights

# Pick the compute device once; later cells use the global `device` to place
# the model and to move every batch before the forward pass.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [None]:
from datasets import load_dataset, load_metric
# Load the 'wi' configuration of the wi_locness dataset; the preprocessing
# cell below reads its 'text' and 'edits' columns.
raw_datasets = load_dataset("wi_locness", 'wi')

from transformers import AutoTokenizer
# Only the tokenizer of this checkpoint is loaded here; it provides the
# sub-word segmentation used to build the vocabularies further down.
model_checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of texts and build the token ids of their corrections.

    For every example the character span actually covered by the (possibly
    truncated) tokenization is cut out of the raw text, the annotated edits
    that fall inside that span are applied to it, and the corrected text is
    tokenized again to obtain the labels.

    Args:
        examples: batch with a 'text' list and an 'edits' list; each edits
            entry provides parallel 'start', 'end' and 'text' lists.

    Returns:
        The tokenizer output for the input texts (without the offset mapping)
        with an added "labels" entry holding the ids of the corrected texts.
    """
    inputs = examples['text']
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        return_offsets_mapping=True
    )

    offset_mapping = model_inputs.pop("offset_mapping")
    corrected_texts = []
    for example_idx, offsets in enumerate(offset_mapping):
        # The last token is <eos>. With fewer than two tokens there is no
        # content token whose offsets could delimit the text (e.g. an empty
        # input), so the correction is empty as well.
        if len(offsets) < 2:
            corrected_texts.append("")
            continue

        start_idx = offsets[0][0]
        end_idx = offsets[-2][1]  # end of the last content token (the one before <eos>)

        edits = examples["edits"][example_idx]
        corrected_text = inputs[example_idx][start_idx:end_idx]

        # Apply edits back to front so the character offsets of the edits
        # that are still pending remain valid.
        for start, end, correction in reversed(
            list(zip(edits["start"], edits["end"], edits["text"]))
        ):
            # Skip edits that reach outside the tokenized span.
            if start < start_idx or end > end_idx:
                continue
            start_offset = start - start_idx  # >= 0
            end_offset = end - start_idx
            # An edit without replacement text is represented by the unknown token.
            if correction is None:
                correction = tokenizer.unk_token
            corrected_text = (
                corrected_text[:start_offset] + correction + corrected_text[end_offset:]
            )

        corrected_texts.append(corrected_text)

    label_encodings = tokenizer(corrected_texts, max_length=512, truncation=True)
    model_inputs["labels"] = label_encodings["input_ids"]

    return model_inputs

In [None]:
# Tokenize every split; the original columns are dropped so that only the
# fields returned by preprocess_function remain.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets['train'].column_names
)

# Train-Test split of 90%-10% (fixed seed so the split is reproducible).
# The "train" entry is overwritten with the 90% part; the 10% part is stored
# under "test" and serves as the validation data in the cells below.
dataset_dict = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=0)
tokenized_datasets["train"] = dataset_dict["train"]
tokenized_datasets["test"] = dataset_dict["test"]

# Token-id sequences: X_* are the original texts, Y_* the corrected texts.
X_train = tokenized_datasets["train"]["input_ids"]
Y_train = tokenized_datasets["train"]["labels"]

X_test = tokenized_datasets["test"]["input_ids"]
Y_test = tokenized_datasets["test"]["labels"]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# print(tokenizer.decode(X_train[0]))
# print(tokenizer.decode(Y_train[0]))

In [None]:
# Turn the token-id sequences back into strings: text_* hold the original
# sentences, corrected_* the corresponding corrections.
text_train = [tokenizer.decode(token_ids) for token_ids in X_train]
corrected_train = [tokenizer.decode(token_ids) for token_ids in Y_train]

text_validation = [tokenizer.decode(token_ids) for token_ids in X_test]
corrected_validation = [tokenizer.decode(token_ids) for token_ids in Y_test]

print(text_train)
print(corrected_train)

["I've been start jogging for five years. It is the way I can unwind because my study it's stressful. It gives me a sense of achievement, for these reasons I would like to do every day. I love jogging because it's a way to stay outdor immersed in nature. I think there are not negative side in doing jogging. I have been really on skiing since I was a baby. My mother make me start. Since then every year i go in north Italy to practice. I fell relaxed staying alon near montains and snow.</s>", "what I usually do in my free time. I really like reading many kinds of books, magazine, etc.when the weather is bad, I love sitting in my favorite armchaire,near the fire place and reading.I enioy to hear the rain while I am reading at home... Howhever I like very much walking too.When the weather is good I often go out with my friend for a walk in the countryside or do shopping. I also love watching films at the cinema or on tv.I prefer comedy and romance, but I like triller and drama too. At leas

Create Vocabularies

In [None]:
from collections import Counter, OrderedDict

def _count_tokens(lines):
    """Count how often each tokenizer token occurs in `lines`.

    A line that cannot be tokenized is reported and skipped instead of being
    dropped silently; the progress bar advances once per line either way.

    Args:
        lines: sequence of strings.

    Returns:
        collections.Counter mapping token string -> number of occurrences.
    """
    counter = Counter()
    with tqdm(total=len(lines)) as progress:
        for line in lines:
            line = line.strip()
            try:
                counter.update(tokenizer.tokenize(line))
            except Exception as exc:  # best effort, but never hide Ctrl-C
                tqdm.write("Skipping line that could not be tokenized: {!r}".format(exc))
            finally:
                progress.update(1)
    return counter


num_samples = len(text_train)

# vocab for erroneous
in_token_counter = _count_tokens(text_train)

# vocab for corrected
out_token_counter = _count_tokens(corrected_train)

# sort token frequencies (most frequent first) and convert to OrderedDict
in_token_counter_sorted = sorted(in_token_counter.items(), key=lambda x: x[1], reverse=True)
out_token_counter_sorted = sorted(out_token_counter.items(), key=lambda x: x[1], reverse=True)

print("all vocab in ", len(in_token_counter))
print("all vocab out ", len(out_token_counter))
in_token_ordered_dict = OrderedDict(in_token_counter_sorted)
out_token_ordered_dict = OrderedDict(out_token_counter_sorted)

PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"
SOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
CLS_TOKEN = "<cls>"
SEP_TOKEN = "<sep>"

SPECIALS = [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN, CLS_TOKEN, SEP_TOKEN]

# Both vocabularies share the same list of special tokens.
vocab_in = vocab(in_token_ordered_dict, specials=SPECIALS)
vocab_out = vocab(out_token_ordered_dict, specials=SPECIALS)

# Tokens missing from a vocabulary are mapped to <unk>.
vocab_in.set_default_index(vocab_in[UNK_TOKEN])
vocab_out.set_default_index(vocab_out[UNK_TOKEN])

print("Size of IN vocabulary: {}".format(len(vocab_in)))
print("Size of OUT vocabulary: {}".format(len(vocab_out)))

100%|██████████| 5/5 [00:00<00:00, 523.75it/s]
100%|██████████| 5/5 [00:00<00:00, 713.41it/s]

all vocab in  436
all vocab out  402
Size of IN vocabulary: 441
Size of OUT vocabulary: 406





In [None]:
# Save both vocabularies with torch.save so the token <-> index mapping used
# for training can be restored later.
vocab_in_file_name = 'err-wi-train.vocab'
vocab_out_file_name = 'cor-wi-train.vocab'

torch.save(vocab_in, vocab_in_file_name)
torch.save(vocab_out, vocab_out_file_name)

In [None]:
# vectorize seqs

def _vectorize_pairs(err_lines, cor_lines):
    """Convert parallel lists of sentences into pairs of index sequences.

    Each erroneous sentence is tokenized and looked up in `vocab_in`, its
    correction in `vocab_out`. A pair that cannot be converted is reported
    and skipped instead of being dropped silently.

    Args:
        err_lines: list of erroneous sentences.
        cor_lines: list of corrected sentences, parallel to `err_lines`.

    Returns:
        List of (err_vec, cor_vec) tuples of vocabulary indices.
    """
    samples = []
    with tqdm(total=len(err_lines)) as progress:
        for err_line, cor_line in zip(err_lines, cor_lines):
            err_line = err_line.strip()
            cor_line = cor_line.strip()
            try:
                err_vec = vocab_in.lookup_indices(tokenizer.tokenize(err_line))
                cor_vec = vocab_out.lookup_indices(tokenizer.tokenize(cor_line))
                samples.append((err_vec, cor_vec))
            except Exception as exc:  # best effort, but never hide Ctrl-C
                tqdm.write("Skipping pair that could not be vectorized: {!r}".format(exc))
            finally:
                progress.update(1)
    return samples


train_samples = _vectorize_pairs(text_train, corrected_train)

print(len(train_samples))
print(train_samples[0])

validation_samples = _vectorize_pairs(text_validation, corrected_validation)

print(len(validation_samples))
print(validation_samples[0])

100%|██████████| 5/5 [00:00<00:00, 456.76it/s]


5
([6, 23, 162, 93, 68, 7, 50, 69, 70, 19, 163, 164, 8, 94, 17, 10, 95, 6, 28, 165, 166, 20, 14, 71, 21, 23, 26, 167, 8, 94, 168, 33, 7, 11, 169, 15, 170, 9, 19, 96, 171, 6, 172, 22, 12, 51, 97, 98, 8, 6, 42, 7, 50, 69, 70, 20, 21, 23, 26, 7, 11, 95, 12, 173, 99, 52, 174, 175, 176, 13, 177, 8, 6, 53, 37, 54, 100, 178, 179, 13, 101, 7, 50, 69, 70, 8, 6, 55, 93, 24, 18, 180, 181, 6, 27, 7, 11, 182, 8, 102, 183, 184, 33, 68, 8, 185, 186, 97, 103, 7, 56, 57, 13, 187, 188, 12, 189, 8, 6, 190, 191, 192, 193, 72, 194, 195, 196, 16, 197, 8, 3], [7, 17, 149, 85, 9, 60, 61, 62, 21, 150, 151, 6, 51, 22, 12, 86, 7, 31, 152, 153, 8, 18, 14, 87, 39, 154, 6, 51, 155, 32, 9, 10, 156, 15, 157, 6, 52, 88, 158, 8, 7, 159, 19, 11, 23, 63, 89, 90, 6, 7, 40, 9, 60, 61, 62, 18, 23, 17, 28, 9, 10, 86, 11, 160, 161, 8, 162, 163, 13, 164, 6, 7, 53, 41, 39, 165, 166, 167, 11, 9, 60, 61, 62, 6, 7, 64, 85, 24, 168, 169, 170, 7, 20, 9, 10, 171, 6, 91, 172, 173, 32, 92, 6, 174, 175, 8, 89, 93, 8, 7, 42, 11, 176, 177

  0%|          | 0/300 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 300/300 [00:00<00:00, 657.38it/s]

300
([1, 83, 1, 85, 14, 63, 67, 9, 1, 395, 249, 8, 6, 42, 1, 9, 394, 6, 1, 36, 1, 20, 21, 17, 158, 12, 1, 7, 11, 1, 12, 36, 1, 13, 1, 8, 6, 22, 1, 20, 21, 17, 1, 15, 10, 1, 67, 8, 1, 23, 26, 1, 85, 10, 1, 15, 41, 1, 8, 6, 53, 41, 1, 1, 33, 12, 249, 1, 8, 1, 1, 1, 12, 36, 1, 67, 9, 1, 17, 14, 375, 1, 8, 94, 17, 342, 382, 67, 12, 36, 19, 1, 396, 146, 36, 390, 67, 1, 8, 1, 9, 239, 1, 26, 28, 1, 16, 120, 10, 1, 1, 8, 1, 54, 10, 1, 85, 1, 9, 14, 63, 67, 8, 3], [373, 76, 1, 77, 14, 57, 59, 8, 23, 135, 106, 1, 6, 7, 40, 1, 8, 368, 7, 1, 37, 1, 18, 23, 22, 147, 11, 1, 9, 10, 1, 11, 37, 1, 13, 1, 6, 7, 19, 1, 18, 23, 22, 1, 15, 12, 1, 1, 6, 1, 17, 28, 1, 77, 12, 1, 15, 38, 1, 6, 7, 53, 38, 1, 1, 32, 11, 106, 1, 6, 373, 1, 1, 11, 37, 1, 59, 8, 1, 22, 14, 353, 1, 6, 51, 22, 324, 358, 59, 11, 37, 21, 1, 370, 359, 369, 1, 364, 59, 1, 6, 1, 8, 227, 1, 28, 31, 1, 16, 225, 12, 1, 1, 6, 304, 22, 9, 10, 1, 77, 1, 8, 14, 57, 59, 6, 3])





Convert sequence pairs into lists of input and target tensors

In [None]:
# Split the (source, target) index pairs into two parallel lists of tensors.
len_train = len(train_samples)
X_train = [torch.LongTensor(source) for source, _target in train_samples]
Y_train = [torch.LongTensor(target) for _source, target in train_samples]

len_validation = len(validation_samples)
X_validation = [torch.LongTensor(source) for source, _target in validation_samples]
Y_validation = [torch.LongTensor(target) for _source, target in validation_samples]

# The plain Python lists are no longer needed; drop the references.
train_samples = validation_samples = None

In [None]:
# Batch size handed to the training sampler.
batch_size = 512

# Batching is delegated to the project-local EqualLengthsBatchSampler, which is
# passed to the DataLoader as `batch_sampler`.
# NOTE(review): presumably it groups pairs of equal length so that batches need
# no padding -- confirm in src/sampler.py.
dataset_train = BaseDataset(X_train, Y_train)
sampler_train = EqualLengthsBatchSampler(batch_size, X_train, Y_train)
loader_train = DataLoader(dataset_train, batch_sampler=sampler_train, shuffle=False, drop_last=False)

# The validation loader uses a batch size of 1.
dataset_test = BaseDataset(X_validation, Y_validation)
sampler_test = EqualLengthsBatchSampler(1, X_validation, Y_validation)
loader_test = DataLoader(dataset_test, batch_sampler=sampler_test, shuffle=False, drop_last=False)

Create Model

In [None]:
import wandb

# Authenticate with Weights & Biases before the run is created further down.
wandb.login()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlethiciars[0m ([33may2324s2-cs4248-team-47[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Hyperparameters of the seq2seq model; the same dict is logged as the W&B run config.
params = {
    "device": device,                            # the decoder also generates sentences, so it must be able to move data to the correct device
    "vocab_size_encoder": len(vocab_in),        # the size of the source vocabulary determines the input size of the encoder embedding
    "vocab_size_decoder": len(vocab_out),        # the size of the target vocabulary determines the input size of the decoder embedding
    "embed_size": 300,                           # size of the word embeddings (here the same for encoder and decoder, but that is not mandatory)
    "rnn_cell": "GRU",                          # in practice GRU or LSTM will always outperform RNN
    "rnn_hidden_size": 512,                      # size of the hidden state
    "rnn_num_layers": 2,                         # 1 or 2 layers are most common; more layers rarely bring any benefit
    "rnn_dropout": 0.2,                          # only relevant if rnn_num_layers > 1
    "rnn_encoder_bidirectional": True,           # the encoder can be bidirectional; the decoder cannot
    "linear_hidden_sizes": [1024, 2048],         # list of sizes of subsequent hidden layers; can be [] (empty); only relevant for the decoder
    "linear_dropout": 0.2,                       # if hidden linear layers are used, we can also include Dropout; only relevant for the decoder
    "attention": "DOT",                          # specify if attention should be used; only "DOT" supported; None if no attention
    "teacher_forcing_prob": 0.5,                 # probability of the decoder using teacher forcing during training
    "special_token_unk": vocab_out['<unk>'],     # index of special token <unk>
    "special_token_sos": vocab_out['<s>'],     # index of special token <s> (start of sequence)
    "special_token_eos": vocab_out['</s>'],     # index of special token </s> (end of sequence)
    "clip": 1.0                                  # clipping value to limit gradients to prevent exploding gradients
}

# Start the W&B run and record the hyperparameters with it.
wandb.init(project='gec-baseline-gru-rnn', config=params)

# Wrap the dict in the project's Dict2Class helper before handing it to the model.
params = Dict2Class(params)
# Create model (incl. the definition of the loss function)
model = RnnAttentionSeq2Seq(params, nn.CrossEntropyLoss()).to(device)
# Define optimizers (one for the encoder, one for the decoder)
encoder_optimizer = optim.Adam(model.encoder.parameters(), lr=0.0005)
decoder_optimizer = optim.Adam(model.decoder.parameters(), lr=0.0005)

In [None]:
def train_batch(model, encoder_optimizer, decoder_optimizer, X, Y):
    """Run one optimization step on a single batch.

    Args:
        model: callable as ``model(X, Y)`` returning the batch loss; exposes
            ``encoder`` and ``decoder`` sub-modules, each with ``params.clip``.
        encoder_optimizer: optimizer of the encoder parameters.
        decoder_optimizer: optimizer of the decoder parameters.
        X: 2-D batch of source sequences.
        Y: batch of target sequences.

    Returns:
        The batch loss divided by the size of the second dimension of X.
    """
    # Only the second dimension of X is needed (for normalizing the loss).
    _, num_steps = X.shape

    loss = model(X, Y)

    optimizers = (encoder_optimizer, decoder_optimizer)

    # Backpropagation: reset gradients, compute new ones, clip, then update.
    for optimizer in optimizers:
        optimizer.zero_grad()

    loss.backward()

    for part in (model.encoder, model.decoder):
        torch.nn.utils.clip_grad_norm_(part.parameters(), part.params.clip)

    for optimizer in optimizers:
        optimizer.step()

    return loss.item() / num_steps

def train(model, loader, encoder_optimizer, decoder_optimizer, num_epochs, verbose=False):
    """Train the model for `num_epochs` epochs.

    Every batch gets an EOS column appended to source and target, is moved to
    the global `device` and passed to train_batch. The weights are written to
    ``gru_<epoch>.pt`` after every fifth epoch.

    Args:
        model: seq2seq model with ``encoder`` / ``decoder`` sub-modules.
        loader: iterable yielding (X_batch, Y_batch) tensor pairs.
        encoder_optimizer: optimizer of the encoder parameters.
        decoder_optimizer: optimizer of the decoder parameters.
        num_epochs: number of passes over `loader`.
        verbose: only when exactly True is the epoch loss printed and logged
            to W&B.
    """
    wandb.watch(model, log="all", log_freq=10)
    # Switch the model to "train" mode
    model.train()

    print("Total Training Time (total number of epochs: {})".format(num_epochs))
    for epoch in range(1, num_epochs+1):

        # Cumulative loss over all batches of this epoch
        epoch_loss = 0.0

        with tqdm(total=len(loader)) as progress_bar:

            for X_batch, Y_batch in loader:
                batch_size = X_batch.shape[0]

                # One EOS index per sequence, shaped as a column so it can be
                # appended to the source and to the target batch
                eos_column = torch.LongTensor(
                    [model.encoder.params.special_token_eos] * batch_size
                ).reshape(-1, 1)
                X_batch = torch.cat((X_batch, eos_column), dim=1).to(device)
                Y_batch = torch.cat((Y_batch, eos_column), dim=1).to(device)

                # One optimization step; accumulate the returned batch loss
                epoch_loss += train_batch(model, encoder_optimizer, decoder_optimizer, X_batch, Y_batch)

                # NOTE(review): the bar advances by the batch size while its
                # total is len(loader) -- consistent only if len(loader) counts
                # samples rather than batches; confirm in src/sampler.py.
                progress_bar.update(batch_size)

        if verbose is True:
            print("Loss:\t{:.3f} (epoch {})".format(epoch_loss, epoch))
            epoch_loss_value = round(epoch_loss, 5)
            wandb.log({"epoch": epoch, "loss": epoch_loss_value})

        # Periodic checkpoint
        if epoch % 5 == 0:
            filename = f"gru_{epoch}.pt"
            torch.save(model.state_dict(), filename)
            print("saved gru at epoch = ", epoch)

In [None]:
# Number of passes over the training data; train() also writes a checkpoint
# after every fifth epoch.
num_epochs = 20

# verbose=True is what makes train() print the epoch loss and log it to W&B.
train(model, loader_train, encoder_optimizer, decoder_optimizer, num_epochs, verbose=True)

# Report the model size (number of elements summed over all parameters).
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params}")

Total Training Time (total number of epochs: 1)


100%|██████████| 5/5 [00:06<00:00,  1.32s/it]

Loss:	27.282 (epoch 1)
Number of parameters: 28745258





In [None]:
# Save or restore the model weights; pick the action by (un)commenting.
action = "save"
#action = "load"
#action = "none"

if action == "save":
    torch.save(model.state_dict(), 'wi-rnn.pt')
elif action == 'load':
    model = RnnAttentionSeq2Seq(params, nn.CrossEntropyLoss()).to(device)
    # map_location lets a checkpoint written on a GPU machine be loaded on a
    # machine where `device` is the CPU (and vice versa).
    model.load_state_dict(torch.load('wi-rnn.pt', map_location=device))

In [None]:
# import torch.onnx
# torch.onnx.export(model, images, "model.onnx")
# wandb.save("model.onnx")

Testing the model

In [None]:
def translate(model, inputs, max_len=100):
    """Generate an output sequence for `inputs` with the trained model.

    Generation runs in evaluation mode (so dropout is inactive) and without
    gradient tracking; the model's previous train/eval mode is restored
    afterwards.

    Args:
        model: seq2seq model with a callable ``encoder`` and a ``decoder``
            that provides ``generate``.
        inputs: source batch passed to the encoder.
        max_len: maximum length forwarded to ``decoder.generate``.

    Returns:
        Tuple (decoded_indices, attention_weights) as returned by
        ``model.decoder.generate``.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Encode input sequence/sentence
            encoder_outputs, encoder_hidden = model.encoder(inputs)
            # Translate the input by generating/predicting the output sequence/sentence
            decoded_indices, attention_weights = model.decoder.generate(
                encoder_hidden, encoder_outputs, max_len=max_len
            )
    finally:
        # Leave the model in the mode the caller had it in
        model.train(was_training)
    # Return the translation + the attention weights
    return decoded_indices, attention_weights

In [None]:
# for idx, (inputs, targets) in enumerate(loader_test):
#     # The input is the first sequence
#     inputs = inputs[0:1].to(device)
#     # Decode input sequence of indices to sequences of word/tokens
#     src_labels = vocab_in.lookup_tokens(inputs[0].cpu().numpy().tolist())

#     # Translate input sequence into predicted target sequence
#     decoded_indices, attention_weights = translate(model, inputs)

#     # Decode target sequence of indices to sequences of word/tokens
#     tgt_labels = vocab_out.lookup_tokens(decoded_indices)

#     # Print input and translation
#     print(' '.join(src_labels))
#     print()
#     print(' '.join(tgt_labels))

#     # Break the loop; we only want to check a single batch with a single sentence
#     break

▁The ▁topic ▁itself ▁explaining ▁the ▁uncertainty ▁of ▁public ▁transport ▁in ▁our ▁country . This ▁conclusion ▁become ▁more ▁prominent ▁ if ▁we ▁look ▁into ▁the ▁data ▁of ▁the ▁car ▁companies ▁and ▁exponential ▁growth ▁in ▁their ▁sales ▁figure ▁and ▁with ▁low ▁budget ▁private ▁cars ▁in ▁picture , ▁scenario ▁ d d <unk> ically ▁changed ▁in ▁past ▁10 ▁years ▁ </s>

▁The ▁topic ▁becomes ▁admitted ▁the ▁intention ▁of ▁public ▁transport ▁in ▁our ▁country . ▁This ▁is ▁more ▁important ▁ <unk> ▁ . ▁ ▁ ▁to ▁ ▁the ▁the ▁the ▁the ▁the ▁the ▁and ▁to ▁and ▁and ▁and ▁to ▁and ▁to , , ▁in , , ▁in ▁the . ▁ . ▁the . .


In [None]:
all_vocab = tokenizer.get_vocab()
print(len(tokenizer))

correct_sents= []
result_sents= []


def _labels_to_t5_ids(labels):
    """Map token strings to ids of the T5 tokenizer vocabulary.

    The start-of-sequence token is mapped to <pad>; tokens the tokenizer
    vocabulary does not contain are mapped to <unk>.
    """
    ids = []
    for label in labels:
        if label == SOS_TOKEN:
            ids.append(all_vocab["<pad>"])
        else:
            ids.append(all_vocab.get(label, all_vocab["<unk>"]))
    return ids


# post processing to get final sentence from decoded outputs
for idx, (inputs, targets) in enumerate(loader_test):
    # The input is the first sequence
    inputs = inputs[0:1].to(device)
    # Decode input sequence of indices to sequences of word/tokens
    src_labels = vocab_in.lookup_tokens(inputs[0].cpu().numpy().tolist())

    # Translate input sequence into predicted target sequence
    decoded_indices, attention_weights = translate(model, inputs)

    # Decode target sequence of indices to sequences of word/tokens
    tgt_labels = vocab_out.lookup_tokens(decoded_indices)

    # Print input and translation
    print(' '.join(src_labels))
    print()
    print(' '.join(tgt_labels))
    print()

    # map to t5 tokenizer encodings; source and prediction are kept in
    # separate lists so that each one decodes to its own sentence
    src_indices = _labels_to_t5_ids(src_labels)
    tgt_indices = _labels_to_t5_ids(tgt_labels)

    print(tokenizer.decode(src_indices))
    print()
    print(tokenizer.decode(tgt_indices))

    result_sents.append(tokenizer.decode(tgt_indices)) # hypothesis

    # get correct sentence from dataset (for references)
    target_labels = vocab_out.lookup_tokens(targets[0].tolist())
    label_indices = _labels_to_t5_ids(target_labels)

    correct_sents.append(tokenizer.decode(label_indices)) # save references
    print(tokenizer.decode(label_indices))


    # Break the loop; we only want to check a single batch with a single sentence.
    # (result_sents / correct_sents therefore hold one pair; remove the break
    # to evaluate the whole validation set.)
    break

32100
▁Dear ▁all , ▁I <unk> ▁your <unk> <unk> <unk> <unk> ▁in ▁the <unk> ▁and ▁I ▁am ▁very <unk> ▁in <unk> ▁in ▁your <unk> <unk> . ▁I ▁think ▁I ▁am ▁the <unk> <unk> ▁for ▁you . ▁My <unk> ▁is <unk> a ▁and ▁I ▁am <unk> ▁years <unk> . ▁I ▁am <unk> ▁the <unk> <unk> <unk> ▁for ▁the <unk> ▁year . ▁For <unk> <unk> ▁I ▁have <unk> ▁been ▁on ▁the <unk> ▁position ▁on ▁ a <unk> ▁and ▁looked ▁after ▁the <unk> <unk> <unk> <unk> . ▁I ▁usually <unk> ▁games ▁with <unk> ▁and <unk> ▁to <unk> ▁for ▁the <unk> <unk> . ▁In ▁the <unk> s ▁we <unk> <unk> ▁games ▁which ▁we ▁usually ▁had ▁not <unk> <unk> <unk> . <unk> ▁I <unk> ▁to <unk> ▁my <unk> <unk> ▁I <unk> ▁to ▁go ▁with ▁you ▁on ▁ a <unk> . ▁I <unk> ▁that ▁the <unk> ▁which <unk> ▁your <unk> ▁are <unk> ▁all <unk> ▁the ▁world . ▁My <unk> ▁dream ▁is ▁to <unk> <unk> <unk> <unk> ly . ▁I ▁am ▁ a ▁very ▁good ▁hard <unk> ▁and ▁I ▁do ▁not ▁have ▁any <unk> ▁with <unk> ▁in <unk> s ▁and ▁ <unk> ▁the <unk> ▁for ▁the <unk> ▁and ▁my <unk> . ▁I ▁am ▁looking ▁forward ▁to ▁yo

In [None]:
import csv

def write_expected_actual_to_csv(expected, actual, output_file):
    """Write paired expected/actual values to a two-column CSV file.

    Args:
        expected: iterable of reference values (first column).
        actual: iterable of produced values (second column).
        output_file: path of the CSV file; an existing file is overwritten.

    The file starts with the header row ``Expected,Actual``; pairing stops at
    the end of the shorter iterable.
    """
    header = ['Expected', 'Actual']
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(zip(expected, actual))

# Write the reference sentences (expected) and the model outputs (actual)
# collected by the evaluation loop to a CSV file.
expected_list = correct_sents
actual_list = result_sents
output_file = "validation_results.csv"

write_expected_actual_to_csv(expected_list, actual_list, output_file)

In [None]:
# attention_weights, src_labels and tgt_labels are the values left behind by
# the last iteration of the evaluation loop above.
weights = attention_weights.detach().cpu().numpy()

print(src_labels)
print(tgt_labels)

# Plot the attention weights with the source/target tokens as labels.
plot_attention_weights(weights, src_labels, tgt_labels)

In [None]:
# Close the W&B run that was started with wandb.init.
wandb.finish()