# Baseline

In [1]:
# %env CUDA_DEVICE_ORDER=PCI_BUS_ID
# %env CUDA_VISIBLE_DEVICES=4

In [2]:
!pip uninstall transformers
!pip uninstall sentencepiece
!pip uninstall accelerate

[0m

In [3]:
!pip install transformers==4.29
!pip install sentencepiece
!pip install accelerate
!pip install sacrebleu
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.29
  Downloading transformers-4.29.0-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.29)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.29)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiproce

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import transformers
import evaluate

%config InlineBackend.figure_format = "svg"
plt.rcParams["figure.figsize"] = 10, 6

SEED = 44
torch.manual_seed(SEED)
np.random.seed(SEED)

In [15]:
# !mkdir data
# !mv rus-eng.zip data
cd data


/content/data


In [16]:
!unzip rus-eng.zip


Archive:  rus-eng.zip
  inflating: rus.txt                 
  inflating: _about.txt              


In [17]:
cd ../

/content


## Data

In [2]:
# Treat quote characters as ordinary text (quoting=3 is csv.QUOTE_NONE): the columns
# are separated by tabs only, and sentences may contain double quotes that the
# default quoting mode would interpret as field delimiters.
# NOTE(review): assumes the file uses no CSV-style quoting — confirm against rus.txt.
data = pd.read_csv("data/rus.txt", sep="\t", names=["en", "ru", "attribution"], quoting=3)

In [3]:
data.sample(5, random_state=SEED)

Unnamed: 0,en,ru,attribution
397801,Tom doesn't want to live in the country.,Фома не хочет жить в сельской местности.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
117848,You'd better sit here.,Вам лучше сесть здесь.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
238444,I read about it in the paper.,Я прочёл об этом в газете.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
191284,Where are you going to go?,Куда ты собираешься идти?,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
295363,I'd like to know the exact time.,Я хотел бы знать точное время.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [11]:
data = data.sample(1000, random_state=SEED)

In [15]:
# Two-stage split with a fixed seed: 20% of the rows are held out as the test set,
# then 20% of the remainder becomes the validation set
# (overall 64% train / 16% validation / 20% test).
trainval, test = train_test_split(data, test_size=0.2, random_state=SEED)
train, val = train_test_split(trainval, test_size=0.2, random_state=SEED)

In [16]:
model_name = "cointegrated/rut5-base"

tokenizer = transformers.T5Tokenizer.from_pretrained(model_name)

def tokenize_data(data, tokenizer, max_length=36):
    """Tokenize the English and Russian columns of a parallel corpus.

    Args:
        data: Table-like object (e.g. a pandas DataFrame). A language is
            tokenized only when its column name ("en" / "ru") is `in data`.
        tokenizer: Callable invoked as
            `tokenizer(list_of_str, truncation=True, max_length=max_length)`.
        max_length: Truncation limit forwarded to the tokenizer (default 36,
            the value previously hard-coded for both languages).

    Returns:
        Tuple `(english_tokenized, russian_tokenized)`; an element is None
        when the corresponding column is absent from `data`.
    """
    def encode(column):
        # A missing column yields None so callers can build one-sided datasets.
        if column not in data:
            return None
        return tokenizer(data[column].tolist(), truncation=True, max_length=max_length)

    return encode("en"), encode("ru")

In [17]:
%%time
tokenized = {
    "train": tokenize_data(train, tokenizer),
    "val": tokenize_data(val, tokenizer),
    "test": tokenize_data(test, tokenizer),
}

CPU times: user 256 ms, sys: 830 µs, total: 257 ms
Wall time: 268 ms


In [18]:
class TranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized English and/or Russian sentences.

    Each side is a mapping with "input_ids" and "attention_mask" entries that
    are indexable per sample. English is exposed as the model input
    ("input_ids" / "attention_mask"), Russian as the target
    ("labels" / "labels_attention_mask"). Either side may be None, not both.
    """

    def __init__(self, english, russian):
        super().__init__()
        self.english = english
        self.russian = russian

        # At least one language has to be present.
        assert not (english is None and russian is None)

    def __getitem__(self, index):
        """Return the tensors of sample `index` for every available language."""
        sides = (
            (self.english, "input_ids", "attention_mask"),
            (self.russian, "labels", "labels_attention_mask"),
        )

        sample = {}
        for encoding, ids_key, mask_key in sides:
            if encoding is None:
                continue
            sample[ids_key] = torch.tensor(encoding["input_ids"][index], dtype=torch.long)
            sample[mask_key] = torch.tensor(encoding["attention_mask"][index])

        assert len(sample) > 0

        return sample

    def __len__(self):
        """Number of samples, taken from English when present, else from Russian."""
        reference = self.english if self.english is not None else self.russian
        return len(reference["input_ids"])

In [19]:
train_ds = TranslationDataset(*tokenized["train"])
val_ds = TranslationDataset(*tokenized["val"])
test_ds = TranslationDataset(*tokenized["test"])

In [20]:
train_ds[3]

{'input_ids': tensor([2502,  339, 2459,  259, 2753,  288, 2008, 1169,  291,    1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([2117, 8996, 7886, 1167,  291,    1]),
 'labels_attention_mask': tensor([1, 1, 1, 1, 1, 1])}

In [21]:
tokenizer.all_special_ids

[1, 2, 0]

In [22]:
tokenizer.all_special_tokens

['</s>', '<unk>', '<pad>']

In [23]:
tokenizer.add_special_tokens({'additional_special_tokens': ['<eng2ru>', '<ru2eng>']}, )
tokenizer.all_special_ids, tokenizer.all_special_tokens

([1, 2, 0, 30000, 30001], ['</s>', '<unk>', '<pad>', '<eng2ru>', '<ru2eng>'])

In [24]:
tokenizer.special_tokens_map['additional_special_tokens']

['<eng2ru>', '<ru2eng>']

In [25]:
tokenizer('<eng2ru>')

{'input_ids': [30000, 1], 'attention_mask': [1, 1]}

In [26]:
tokenizer.convert_tokens_to_ids('<eng2ru>')

30000

In [27]:
from torch.nn.utils.rnn import pad_sequence

def collator(examples):
    """Collate variable-length samples into one right-padded batch.

    The keys of the first example define the batch. For each key the
    per-sample tensors are stacked batch-first and padded with 0 up to the
    longest sequence stored under that key.

    Args:
        examples: Non-empty list of dicts that share the keys of
            `examples[0]` and map them to sequence tensors.

    Returns:
        Dict with the keys of `examples[0]`, each mapped to a padded tensor.
    """
    batch = {}
    for key in examples[0].keys():
        sequences = [sample[key] for sample in examples]
        batch[key] = pad_sequence(sequences, batch_first=True, padding_value=0)

    return batch

In [28]:
batch_size = 1
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collator)

# Grab a single batch for shape inspection. Unlike `for ...: break`, this raises
# immediately on an empty loader instead of leaving `batch` undefined or stale.
batch = next(iter(train_loader))

In [29]:
batch["input_ids"].shape,\
batch["attention_mask"].shape,\
batch["labels"].shape,\
batch["labels_attention_mask"].shape,

(torch.Size([1, 10]),
 torch.Size([1, 10]),
 torch.Size([1, 8]),
 torch.Size([1, 8]))

## Model

In [30]:
torch.cuda.is_available()

False

In [31]:
device = torch.device("cpu")

In [32]:
# Pretrained-weights alternative, kept for reference:
# model = transformers.T5ForConditionalGeneration.from_pretrained(model_name)

# We need a randomly initialised model: only the configuration of `model_name`
# is loaded and the model is constructed from that configuration.
model = transformers.T5ForConditionalGeneration(transformers.T5Config.from_pretrained(model_name)).to(device)
pass  # no-op; the cell intentionally displays nothing

In [33]:
# Smoke test of the forward pass with synthetic data:
# encoder input of shape (batch_size, 450) filled with consecutive ids, ...
dummy_tokens = torch.arange(batch_size * 450).reshape(batch_size, 450).to(device)
# ... an attention mask that marks every position as valid, ...
dummy_mask = torch.ones_like(dummy_tokens).to(device)
# ... and labels of shape (batch_size, 35) filled with consecutive ids starting at 22.
dummy_prefix = torch.arange(batch_size * 35).reshape(batch_size, 35).to(device) + 22

# Passing `labels` makes the model compute a loss as part of `out`.
out = model(input_ids=dummy_tokens, attention_mask=dummy_mask, labels=dummy_prefix)

In [34]:
tokenizer("мама мыла раму")

{'input_ids': [15806, 2073, 1200, 5018, 354, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [35]:
# Tokenize once, get batched (1, seq_len) tensors directly, and place them on the
# same device as the model before generating.
encoded = tokenizer("мама мыла раму", return_tensors="pt").to(device)
generated = model.generate(input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"])

generated



tensor([[    0,   137,  5369, 12147, 28277,  2265, 20250,  6388,  6388, 26262,
          6388,  1063,  5082, 24154, 13712, 27545,  5082, 12537,   952,  6388]])

In [36]:
del model, out, dummy_mask, dummy_tokens, dummy_prefix

## Training utils

In [37]:
def shift_right(tensor, pad_token_id = 0):
    """Shift `tensor` one position to the right along its last dimension.

    The last element of every sequence is dropped and the freed first
    position is filled with `pad_token_id`.

    Args:
        tensor: Tensor whose last dimension is the sequence dimension.
        pad_token_id: Value written into position 0 of every sequence.

    Returns:
        A new tensor with the shape and dtype of `tensor`; the input is left
        unmodified.
    """
    # Rolling moves every element one step right (the last one wraps around to
    # the front), then the wrapped-around slot is overwritten with the fill value.
    shifted = torch.roll(tensor, shifts=1, dims=-1)
    shifted[..., 0] = pad_token_id

    return shifted

class CustomTrainer(transformers.Seq2SeqTrainer):
    """Seq2Seq trainer with a bidirectional EN<->RU translation objective.

    A batch is expected to hold English sentences under "input_ids" /
    "attention_mask" and their Russian counterparts under "labels" /
    "labels_attention_mask", both right-padded. The translation direction is
    selected by a special token placed in front of the encoder input:
    '<eng2ru>' requests Russian output, '<ru2eng>' requests English output.

    The loss is the sum of six terms: supervised EN->RU and RU->EN,
    identity (EN->EN, RU->RU) and cycle consistency (EN->RU->EN, RU->EN->RU).

    NOTE(review): evaluation with `predict_with_generate` does not go through
    `compute_loss`, so no direction token is added there — confirm that the
    evaluation inputs are what you intend.
    """

    # Label value that the cross-entropy loss of the model ignores.
    IGNORE_INDEX = -100

    @staticmethod
    def _with_prefix(input_ids, attention_mask, prefix_id):
        """Prepend `prefix_id` to every sequence and mark it as attended."""
        batch = input_ids.shape[0]
        prefix = input_ids.new_full((batch, 1), prefix_id)
        visible = attention_mask.new_ones((batch, 1))
        return (
            torch.cat([prefix, input_ids], dim=1),
            torch.cat([visible, attention_mask], dim=1),
        )

    @classmethod
    def _as_labels(cls, input_ids, attention_mask):
        """Turn padded token ids into labels whose padding is ignored by the loss."""
        return input_ids.masked_fill(attention_mask == 0, cls.IGNORE_INDEX)

    @staticmethod
    def _back_translation_input(model, input_ids, attention_mask, prefix_id, pad_id):
        """Translate a batch and turn the output into an encoder input.

        The first generated position holds the decoder start token; it is
        replaced by `prefix_id` so that the result can be fed back to the
        model for the reverse direction. Padding produced after the end of a
        generated sentence is excluded through the returned attention mask.
        """
        generated = model.generate(input_ids=input_ids, attention_mask=attention_mask).clone()
        generated[:, 0] = prefix_id
        return generated, (generated != pad_id).long()

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the summed supervised, identity and cycle-consistency loss.

        Args:
            model: The sequence-to-sequence model being trained.
            inputs: Batch dict with "input_ids", "attention_mask", "labels"
                and "labels_attention_mask".
            return_outputs: When True, also return the model outputs of the
                two supervised directions.

        Returns:
            The scalar loss, or `(loss, {"ru_en": ..., "en_ru": ...})` when
            `return_outputs` is True.
        """
        # tokens ids
        eng2ru = self.tokenizer.convert_tokens_to_ids('<eng2ru>')
        ru2eng = self.tokenizer.convert_tokens_to_ids('<ru2eng>')
        pad = self.tokenizer.convert_tokens_to_ids('<pad>')

        english_input_ids = inputs.get("input_ids")
        russian_input_ids = inputs.get("labels")
        english_attention_mask = inputs.get("attention_mask")
        russian_attention_mask = inputs.get("labels_attention_mask")

        # Encoder inputs: the direction token is prepended (not shifted in), so no
        # token is dropped and the token itself is visible to the attention.
        en_to_ru_ids, en_to_ru_mask = self._with_prefix(english_input_ids, english_attention_mask, eng2ru)
        ru_to_en_ids, ru_to_en_mask = self._with_prefix(russian_input_ids, russian_attention_mask, ru2eng)
        en_to_en_ids, en_to_en_mask = self._with_prefix(english_input_ids, english_attention_mask, ru2eng)
        ru_to_ru_ids, ru_to_ru_mask = self._with_prefix(russian_input_ids, russian_attention_mask, eng2ru)

        # Targets are passed unshifted: the model derives the decoder inputs from
        # the labels itself, and padded positions are excluded from the loss.
        english_labels = self._as_labels(english_input_ids, english_attention_mask)
        russian_labels = self._as_labels(russian_input_ids, russian_attention_mask)

        # Pseudo-translations for the cycle terms are generated without dropout;
        # the previous train/eval mode is restored afterwards.
        was_training = model.training
        model.eval()
        try:
            ru_synth_ids, ru_synth_mask = self._back_translation_input(
                model, en_to_ru_ids, en_to_ru_mask, ru2eng, pad)
            en_synth_ids, en_synth_mask = self._back_translation_input(
                model, ru_to_en_ids, ru_to_en_mask, eng2ru, pad)
        finally:
            model.train(was_training)

        # Cycle Consistency loss
        # (eng -> rus) -> eng
        en_cyc_loss = model(
            input_ids=ru_synth_ids,
            attention_mask=ru_synth_mask,
            labels=english_labels,
        ).loss

        # (rus -> eng) -> rus
        ru_cyc_loss = model(
            input_ids=en_synth_ids,
            attention_mask=en_synth_mask,
            labels=russian_labels,
        ).loss

        # Identity loss (optional)
        # rus -> rus
        ru_id_loss = model(
            input_ids=ru_to_ru_ids,
            attention_mask=ru_to_ru_mask,
            labels=russian_labels,
        ).loss

        # eng -> eng
        en_id_loss = model(
            input_ids=en_to_en_ids,
            attention_mask=en_to_en_mask,
            labels=english_labels,
        ).loss

        # usual loss
        en_ru = model(
            input_ids=en_to_ru_ids,
            attention_mask=en_to_ru_mask,
            labels=russian_labels,
        )

        ru_en = model(
            input_ids=ru_to_en_ids,
            attention_mask=ru_to_en_mask,
            labels=english_labels,
        )

        loss = ru_en.loss + en_ru.loss + en_cyc_loss + ru_cyc_loss + ru_id_loss + en_id_loss

        return (loss, {"ru_en": ru_en, "en_ru": en_ru}) if return_outputs else loss


## Evaluation utils

In [38]:
def postprocess_text(preds, labels):
    """Normalise decoded texts for the BLEU metric.

    Surrounding whitespace is stripped from every prediction and every label;
    each label is additionally wrapped in a one-element list, i.e. one
    reference per prediction.

    Returns:
        Tuple `(predictions, references)` of a list of strings and a list of
        single-item lists of strings.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    references = []
    for label in labels:
        references.append([label.strip()])

    return cleaned_preds, references

In [39]:
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation run.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays; `preds` may be
            a tuple whose first element holds the generated ids. Either array
            may contain -100 as a filler value.

    Returns:
        Dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a vocabulary id and cannot be decoded; map it to the pad
    # token in predictions as well as in labels. This also keeps filler
    # positions out of the `gen_len` count below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result


## Training

In [40]:
# Hyper-parameters of the training run.
training_args = transformers.Seq2SeqTrainingArguments(
    output_dir=f"./results/baseline-{model_name}",
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Accumulate gradients over 16 // batch_size batches (at least one).
    gradient_accumulation_steps=max(1, 16 // batch_size),
    learning_rate=5e-5,
    weight_decay=0.1,
    logging_steps=10,
    # Evaluate with generated sequences once per epoch.
    predict_with_generate=True,
    evaluation_strategy="epoch",
    save_total_limit=1,
    seed=SEED,
    data_seed=SEED,
    # fp16=True,  (mixed precision is disabled)
    # Presumably needed so the extra "labels_attention_mask" entry produced by the
    # dataset is not dropped before it reaches the trainer — TODO confirm.
    remove_unused_columns=False
)

In [41]:
model = transformers.T5ForConditionalGeneration(transformers.T5Config.from_pretrained(model_name)).to(device)
model.resize_token_embeddings(len(tokenizer)) # to reshape embeddings (as we added special tokens)

opt = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay
)

# The scheduler advances once per optimizer update, and with gradient accumulation
# there is one update per `gradient_accumulation_steps` batches. Sizing the schedule
# by the number of batches would stretch it so far that training ends during warmup.
# NOTE(review): assumes a single device and that `train_loader` uses the same batch
# size as `per_device_train_batch_size`.
updates_per_epoch = max(len(train_loader) // training_args.gradient_accumulation_steps, 1)
total_updates = int(updates_per_epoch * training_args.num_train_epochs)

scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer=opt,
    num_warmup_steps=total_updates // 5,  # first 20% of the updates are warmup
    num_training_steps=total_updates,
)

In [42]:
trainer = CustomTrainer(
    model=model, 
    args=training_args, 
    data_collator=collator, 
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    optimizers=(opt, scheduler),
    compute_metrics=compute_metrics,
)

In [None]:
#trainer.train(ignore_keys_for_eval=["labels_attention_mask"])
trainer.train()

Epoch,Training Loss,Validation Loss
