# Baseline

In [1]:
# Number GPUs by PCI bus id and expose only GPU 4 to this kernel.
# NOTE(review): the index 4 looks specific to the machine this was run on — adjust for your host.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=4

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=4


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import transformers
import evaluate

# Render inline figures as SVG and use a 10x6 default figure size.
%config InlineBackend.figure_format = "svg"
plt.rcParams["figure.figsize"] = 10, 6

# Single seed for the torch and NumPy global RNGs; also reused by the
# sampling, splitting and training-argument cells further down.
SEED = 44
torch.manual_seed(SEED)
np.random.seed(SEED)

## Data

In [3]:
# Tab-separated parallel corpus; the three columns are named explicitly here.
data = pd.read_csv("data/rus.txt", sep="\t", names=["en", "ru", "attribution"])

In [4]:
# Peek at five random rows (seeded, so the same rows are shown on every run).
data.sample(5, random_state=SEED)

Unnamed: 0,en,ru,attribution
397801,Tom doesn't want to live in the country.,Фома не хочет жить в сельской местности.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
117848,You'd better sit here.,Вам лучше сесть здесь.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
238444,I read about it in the paper.,Я прочёл об этом в газете.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
191284,Where are you going to go?,Куда ты собираешься идти?,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
295363,I'd like to know the exact time.,Я хотел бы знать точное время.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [5]:
# Optional: uncomment to work on a 1000-row subsample for quick debugging runs.
#data = data.sample(1000, random_state=SEED)

In [6]:
# Hold out 20% of the rows as the test set, then 20% of the remainder as the
# validation set (roughly 64% / 16% / 20% train / val / test).
trainval, test = train_test_split(data, test_size=0.2, random_state=SEED)
train, val = train_test_split(trainval, test_size=0.2, random_state=SEED)

In [7]:
# Checkpoint name: used here to load the tokenizer and, in later cells,
# to load the model configuration.
model_name = "cointegrated/rut5-base"

tokenizer = transformers.T5Tokenizer.from_pretrained(model_name)

def tokenize_data(data, tokenizer, max_length=36):
    """Tokenize the English and Russian columns of a parallel corpus.

    Args:
        data: Table-like object (e.g. a pandas DataFrame) that may contain
            an "en" and/or a "ru" column of sentences.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_str, truncation=True, max_length=...)``.
        max_length: Maximum number of tokens kept per sentence; longer
            sentences are truncated. Defaults to 36.

    Returns:
        Tuple ``(english_tokenized, russian_tokenized)``. An element is
        ``None`` when the corresponding column is absent from ``data``.
    """
    def tokenize_column(column):
        # A missing column yields None so callers can build one-sided datasets.
        if column not in data:
            return None
        return tokenizer(data[column].tolist(), truncation=True, max_length=max_length)

    return tokenize_column("en"), tokenize_column("ru")

In [None]:
%%time
# Tokenize every split once up front; each value is the (english, russian)
# pair returned by tokenize_data.
tokenized = {
    "train": tokenize_data(train, tokenizer),
    "val": tokenize_data(val, tokenizer),
    "test": tokenize_data(test, tokenizer),
}

In [None]:
class TranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized English and/or Russian sentences.

    Each side is a mapping with "input_ids" and "attention_mask" entries that
    are indexable per example, or ``None`` when that side is unavailable.
    At least one side must be given.
    """

    def __init__(self, english, russian):
        super().__init__()
        self.english = english
        self.russian = russian

        # Both sides missing would make the dataset meaningless.
        assert not (english is None and russian is None)

    def __getitem__(self, index):
        """Return a dict of 1-D tensors for example ``index``.

        English fills "input_ids" / "attention_mask"; Russian fills
        "labels" / "labels_attention_mask".
        """
        english, russian = self.english, self.russian
        item = {}

        if english is not None:
            item.update(
                input_ids=torch.tensor(english["input_ids"][index], dtype=torch.long),
                attention_mask=torch.tensor(english["attention_mask"][index]),
            )

        if russian is not None:
            item.update(
                labels=torch.tensor(russian["input_ids"][index], dtype=torch.long),
                labels_attention_mask=torch.tensor(russian["attention_mask"][index]),
            )

        assert len(item) > 0

        return item

    def __len__(self):
        """Number of examples, taken from whichever side is present (English first)."""
        side = self.russian if self.english is None else self.english
        return len(side["input_ids"])

In [None]:
# Wrap each split's (english, russian) tokenizations in a TranslationDataset.
train_ds = TranslationDataset(*tokenized["train"])
val_ds = TranslationDataset(*tokenized["val"])
test_ds = TranslationDataset(*tokenized["test"])

In [None]:
# Inspect a single training example.
train_ds[3]

In [None]:
# Ids of the tokenizer's special tokens.
tokenizer.all_special_ids

In [None]:
# The tokenizer's special tokens as strings.
tokenizer.all_special_tokens

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collator(examples, label_pad_token_id=-100):
    """Pad a list of per-example tensor dicts into one batch of 2-D tensors.

    Every key of the first example is batched by right-padding the 1-D
    tensors to the longest sequence in the batch (batch dimension first).

    "labels" is padded with ``label_pad_token_id`` (default -100), the value
    Hugging Face seq2seq models ignore when computing the loss, so padding
    positions do not contribute to training. All other keys (token ids and
    attention masks) are padded with 0. Code that needs to feed "labels"
    back into a model as input ids must map -100 back to the pad id first.

    Args:
        examples: Non-empty list of dicts mapping key -> 1-D tensor; all
            dicts are expected to share the keys of the first one.
        label_pad_token_id: Padding value used for the "labels" key.

    Returns:
        Dict mapping each key to a (batch, max_len) tensor.
    """
    batch = {}
    for key in examples[0].keys():
        padding_value = label_pad_token_id if key == "labels" else 0
        batch[key] = pad_sequence(
            [sample[key] for sample in examples],
            batch_first=True,
            padding_value=padding_value,
        )

    return batch

In [None]:
batch_size = 64
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collator)

# Pull a single batch for inspection; next() raises immediately if the loader
# is empty instead of silently leaving `batch` undefined.
batch = next(iter(train_loader))

In [None]:
# Shapes of the padded tensors in the sampled batch.
batch["input_ids"].shape,\
batch["attention_mask"].shape,\
batch["labels"].shape,\
batch["labels_attention_mask"].shape,

## Model

In [None]:
# Use the first visible GPU when CUDA is available; otherwise fall back to CPU
# so the model-building cells still run on machines without a usable GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# The baseline is trained from scratch: only the configuration of `model_name`
# is reused, so the model is built from the config (random initialisation)
# rather than with `T5ForConditionalGeneration.from_pretrained(model_name)`,
# which would load the pretrained weights.
model = transformers.T5ForConditionalGeneration(transformers.T5Config.from_pretrained(model_name)).to(device)

In [None]:
# Sanity check: push one synthetic batch through the model to confirm that the
# forward pass with labels runs and produces a loss.
dummy_tokens = torch.arange(batch_size * 50, device=device).view(batch_size, 50)
dummy_mask = torch.ones_like(dummy_tokens)
dummy_prefix = 22 + torch.arange(batch_size * 35, device=device).view(batch_size, 35)

out = model(
    input_ids=dummy_tokens,
    attention_mask=dummy_mask,
    labels=dummy_prefix,
)

In [None]:
# Loss returned by the dummy forward pass.
out.loss

In [None]:
# Drop the sanity-check model and dummy tensors; a fresh model is created in the Training section.
del model, out, dummy_mask, dummy_tokens, dummy_prefix

## Training utils (not used yet)

In [None]:
def shift_right(tensor, pad_token_id):
    """Shift ``tensor`` one position to the right along its last dimension.

    The last element of every row is dropped and ``pad_token_id`` is written
    into the first position. The input is left untouched; a new tensor of the
    same shape and dtype is returned.
    """
    result = torch.zeros_like(tensor)
    result[..., 0] = pad_token_id
    result[..., 1:] = tensor[..., :-1]

    return result

class CustomTrainer(transformers.Seq2SeqTrainer):
    """Seq2SeqTrainer that optimises both translation directions at once.

    Every batch is run through the model twice (English -> Russian and
    Russian -> English) and the two losses are summed.

    Batches must provide "input_ids" / "attention_mask" (English side) and
    "labels" / "labels_attention_mask" (Russian side).
    NOTE(review): the Trainer may strip "labels_attention_mask" from the batch
    unless `remove_unused_columns=False` is set — confirm before using.
    """

    # Id used to pad token sequences (batches are padded with 0).
    PAD_TOKEN_ID = 0
    # Label value that the model's loss ignores.
    IGNORE_INDEX = -100

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the summed en->ru and ru->en loss for one batch.

        Args:
            model: The seq2seq model being trained.
            inputs: Batch dict with the four keys listed on the class.
            return_outputs: When true, also return both model outputs.
            **kwargs: Accepted and ignored, so that Trainer versions which
                pass extra keyword arguments (e.g. ``num_items_in_batch``)
                keep working.

        Returns:
            ``loss`` or ``(loss, {"ru_en": ..., "en_ru": ...})``.
        """
        english_input_ids = inputs.get("input_ids")
        english_attention_mask = inputs.get("attention_mask")
        russian_attention_mask = inputs.get("labels_attention_mask")

        # "labels" may already be padded with IGNORE_INDEX; restore real pad
        # ids so the tensor is valid as encoder input for the reverse direction.
        russian_input_ids = inputs.get("labels")
        russian_input_ids = russian_input_ids.masked_fill(
            russian_input_ids == self.IGNORE_INDEX, self.PAD_TOKEN_ID
        )

        # Padding positions must not contribute to the loss.
        english_labels = english_input_ids.masked_fill(english_attention_mask == 0, self.IGNORE_INDEX)
        russian_labels = russian_input_ids.masked_fill(russian_attention_mask == 0, self.IGNORE_INDEX)

        # The decoder input is the label sequence shifted right behind a start
        # token, so its mask is the label mask shifted right with a leading 1
        # (the start token itself must be attended to, hence fill value 1).
        en_ru = model(
            input_ids=english_input_ids,
            attention_mask=english_attention_mask,
            labels=russian_labels,
            decoder_attention_mask=shift_right(russian_attention_mask, pad_token_id=1),
        )

        ru_en = model(
            input_ids=russian_input_ids,
            attention_mask=russian_attention_mask,
            labels=english_labels,
            decoder_attention_mask=shift_right(english_attention_mask, pad_token_id=1),
        )

        loss = ru_en.loss + en_ru.loss

        return (loss, {"ru_en": ru_en, "en_ru": en_ru}) if return_outputs else loss

## Evaluation utils

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns the cleaned predictions as a flat list and the references as a
    list of single-element lists (one reference per prediction).
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

In [None]:
# sacreBLEU metric loaded through the `evaluate` library; used by compute_metrics.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation run.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays; ``preds``
            may also be a tuple whose first element holds the token ids.

    Returns:
        Dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id: it cannot be
    # decoded and must not be counted as generated text. Predictions can carry
    # it as well as labels, so both are mapped back to the pad id.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Training

In [None]:
# Hyper-parameters for the baseline run.
training_args = transformers.Seq2SeqTrainingArguments(
    # NOTE(review): model_name contains "/", so this resolves to a nested directory.
    output_dir=f"./results/baseline-{model_name}",
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Accumulates gradients only for small batches (16 // batch_size > 1 needs
    # batch_size <= 8); for any batch_size above 8 this evaluates to 1.
    gradient_accumulation_steps=max(1, 16 // batch_size),
    learning_rate=5e-5,
    weight_decay=0.1,
    logging_steps=10,
    # Evaluate with generated sequences so compute_metrics can decode them to text.
    predict_with_generate=True,
    evaluation_strategy="epoch",
    save_total_limit=1,
    seed=SEED,
    data_seed=SEED,
    # NOTE(review): fp16 presumably requires a CUDA device — confirm on CPU-only hosts.
    fp16=True, 
    #remove_unused_columns=False,
)

In [None]:
# Fresh, randomly initialised model for the actual training run.
model = transformers.T5ForConditionalGeneration(transformers.T5Config.from_pretrained(model_name)).to(device)

opt = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# The scheduler advances once per optimizer step, not once per batch, so the
# step counts must account for gradient accumulation. With
# gradient_accumulation_steps == 1 this equals len(train_loader) * epochs.
steps_per_epoch = max(1, len(train_loader) // training_args.gradient_accumulation_steps)
total_steps = steps_per_epoch * training_args.num_train_epochs

# Cosine decay with the first fifth of training used as linear warm-up.
scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer=opt,
    num_warmup_steps=total_steps // 5,
    num_training_steps=total_steps,
)

In [None]:
# Stock Seq2SeqTrainer (the CustomTrainer defined earlier is not used here),
# wired to the custom collator, the hand-built optimizer/scheduler pair and
# the BLEU-based compute_metrics.
trainer = transformers.Seq2SeqTrainer(
    model=model, 
    args=training_args, 
    data_collator=collator, 
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    optimizers=(opt, scheduler),
    compute_metrics=compute_metrics,
)

In [None]:
# Run training; per training_args, evaluation on val_ds happens once per epoch.
#trainer.train(ignore_keys_for_eval=["labels_attention_mask"])
trainer.train()