In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd
import os
import json

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

In [3]:
is_cuda = False

if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


##Трансформеры

####Задание 1. Изучите технологии attention и архитектуры нейронных сетей трансформеров.

In [5]:
class CustomDataset(Dataset):

  def __init__(self, texts, targets, tokenizer, max_len=512):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    print(idx)
    text = str(self.texts[idx])
    target = self.targets[idx]

    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten()[:512],
      'attention_mask': encoding['attention_mask'].flatten()[:512],
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [6]:
class BertClassifier:
    def __init__(self, model_path, tokenizer_path, n_classes=2, epochs=1, model_save_path='/content/bert.pt'):
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_save_path=model_save_path
        self.max_len = 512
        self.epochs = epochs
        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        self.model.to(self.device)

    def preparation(self, X_train, y_train, X_valid, y_valid):
        # create datasets
        self.train_set = CustomDataset(X_train, y_train, self.tokenizer)
        self.valid_set = CustomDataset(X_valid, y_valid, self.tokenizer)

        # create data loaders
        self.train_loader = DataLoader(self.train_set, batch_size=2, shuffle=True)
        self.valid_loader = DataLoader(self.valid_set, batch_size=2, shuffle=True)

        # helpers initialization
        self.optimizer = AdamW(self.model.parameters(), lr=2e-5)
        self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_loader) * self.epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)

    def eval(self):
        self.model = self.model.eval()
        losses = []
        correct_predictions = 0

        with torch.no_grad():
            for data in self.valid_loader:
                input_ids = data["input_ids"].to(self.device)
                attention_mask = data["attention_mask"].to(self.device)
                targets = data["targets"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                    )

                preds = torch.argmax(outputs.logits, dim=1)
                loss = self.loss_fn(outputs.logits, targets)
                correct_predictions += torch.sum(preds == targets)
                losses.append(loss.item())

        val_acc = correct_predictions.double() / len(self.valid_set)
        val_loss = np.mean(losses)
        return val_acc, val_loss

    def fit(self):
        self.model = self.model.train()
        losses = []
        correct_predictions = 0

        for data in tqdm(self.train_loader):
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = self.loss_fn(outputs.logits, targets)

            correct_predictions += torch.sum(preds == targets)

            losses.append(loss.item())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

        train_acc = correct_predictions.double() / len(self.train_set)
        train_loss = np.mean(losses)
        return train_acc, train_loss

    def predict(self, text):
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        out = {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        input_ids = out["input_ids"].to(self.device)
        attention_mask = out["attention_mask"].to(self.device)

        outputs = self.model(
            input_ids=input_ids.unsqueeze(0),
            attention_mask=attention_mask.unsqueeze(0)
        )

        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction

    def train(self):
        best_accuracy = 0
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            train_acc, train_loss = self.fit()
            print(f'Train loss {train_loss} accuracy {train_acc}')

            val_acc, val_loss = self.eval()
            print(f'Val loss {val_loss} accuracy {val_acc}')
            print('-' * 10)

            if val_acc > best_accuracy:
                torch.save(self.model, self.model_save_path)
                best_accuracy = val_acc

        self.model = torch.load(self.model_save_path)



####Задание 2. Примените один из трансформеров, например BERT к задаче классификации отзывов клиентов. Сравните полученные результаты с классическими методами машинного обучения, с RNN. Сделайте выводы.

In [7]:
sample_positive = pd.read_json('/content/drive/MyDrive/Colab Notebooks/тексты/предыдущее/sample_positive.json')
sample_negative = pd.read_json('/content/drive/MyDrive/Colab Notebooks/тексты/предыдущее/sample_negative.json')

sample_positive['grade'] = 1
sample_negative['grade'] = 0

dataframe = pd.concat([sample_positive[['text', 'grade']], sample_negative[['text', 'grade']]], ignore_index=True)
dataframe = dataframe.drop(3436, axis=0).reset_index().drop('index', axis=1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(dataframe['text'].tolist(), dataframe['grade'].tolist(), train_size=0.8, random_state=42)

print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

help = BertClassifier('cointegrated/rubert-tiny', 'cointegrated/rubert-tiny', epochs=3)

help.preparation(X_train, y_train, X_test, y_test)
help.train()

1986
1639


  1%|          | 17/2083 [00:18<29:13,  1.18it/s]

3000
2077


  1%|          | 18/2083 [00:19<32:39,  1.05it/s]

2341
2019


  1%|          | 19/2083 [00:20<34:30,  1.00s/it]

624
4105


  1%|          | 20/2083 [00:21<35:35,  1.03s/it]

1820
3506


  1%|          | 21/2083 [00:23<37:45,  1.10s/it]


2444
1120


KeyboardInterrupt: ignored

In [9]:
dataframe

Unnamed: 0,text,grade
0,ВАЖНО! У кого низкий фпс открываем параметры з...,1
1,Луга - Пинаешь живой пень. Бегаешь за оленем. ...,1
2,"Какими бы не были сильными боги, от срубленног...",1
3,Игра весит 1Гб но в ней контента больше чем в ...,1
4,Самая грустная история Вальхейма.Я перевозил м...,1
...,...,...
5202,в соло будет тяжело,0
5203,мб хватит на бали чилить???,0
5204,ГОВНО,0
5205,За год толком ни чего не добавили.,0


In [None]:
# torch.save(help.model, f'/content/drive/MyDrive/Colab Notebooks/тексты/5_Transformers/content/tiny-bert-3.pt')

In [None]:
help.predict('кал зеленого гоблина который бьет меня дубиной, приятного мало, конечно')

0

In [None]:
help.predict('самая лучшая игра')

1

####Задание 3. Примените один из трансформеров, например BERT, к задаче генерации англоязычного и русскоязычного текстов. Сравните результаты с LSTM. Сделайте выводы.


In [None]:
from transformers import pipeline

generator = pipeline('text-generation', model='gpt2')
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, I\'m a language model, not a science fiction character."\n\nIn its quest to explain human language, MIT researchers used brain scans and'},
 {'generated_text': "Hello, I'm a language model, not an artificial model. One that's not based on any sort of real data or any sort of abstract idea"},
 {'generated_text': "Hello, I'm a language model, I teach a language model to my students. And there really isn't a single one of those types. I"},
 {'generated_text': "Hello, I'm a language model, a language model that is very easy to learn with. And I also think it shows us the way that languages"},
 {'generated_text': 'Hello, I\'m a language model, but I\'m also a computer science student in a university department with a PhD in science."\n\n"What'}]

In [None]:
generator = pipeline('text-generation', model='sberbank-ai/rugpt3large_based_on_gpt2')
generator("Привет, помоги мне пожалуйста все забыть", max_length=30)

[{'generated_text': 'Привет, помоги мне пожалуйста все забыть.\nПопробуй с ним поговорить, может он не хочет с тобой общаться, а ты ему нравишься.'}]

#####Training

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel # GPT2Model

# model = GPT2Model.from_pretrained('gpt2')

tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3large_based_on_gpt2')
model = GPT2LMHeadModel.from_pretrained('sberbank-ai/rugpt3large_based_on_gpt2')

# text = "you know i can't help you, but here are some options for you"
text = "а ты говоришь по-русски?"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling

train_dataset = TextDataset(tokenizer=tokenizer,
                            file_path='/content/drive/MyDrive/Colab Notebooks/тексты/dostoevsky.txt',
                            block_size=64)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)




In [None]:
# pip install accelerate -U

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/тексты/5_Transformers/gpt",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=2, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=0,# number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=0, # to make "virtual" batch size larger
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5),None) # Optimizer and lr scheduler
)

In [None]:
# trainer.train()

In [None]:
# generated_text = model.generate(
#     input_ids=encoded_input['input_ids'],
#     max_length=100,
#     temperature=1.2,
#     do_sample=True,
#     top_k=3,
#     top_p=0.95
# )
# print(f"дано: {text}")
# # print(generated_text)

# prediction = tokenizer.decode(generated_text[0])
# print(prediction)

дано: а ты говоришь по-русски?
а ты говоришь по-русски?
— Да.
— А что?
— А то, что у тебя в руках — не книга. Это не учебник.

— Ну и что?
— А то, что я не хочу читать эту гадость. И не буду.

— Почему?
— Потому что это — мерзость.

— Почему мерзость?
— Потому что это — мерзость.<s>
Оригинал взят у в postВ Москве


In [None]:
# text = "анна не могла перестать трястись"
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

# generated_text = model.generate(
#     input_ids=encoded_input['input_ids'],
#     max_length=100,
#     temperature=1.2,
#     do_sample=True,
#     top_k=3,
#     top_p=0.95
# )
# print(f"дано: {text}")
# # print(generated_text)

# prediction = tokenizer.decode(generated_text[0])
# print(prediction)

дано: анна не могла перестать трястись
анна не могла перестать трястись от страха и не могла понять, что происходит.

– Что случилось? – спросила она, когда наконец поняла, что происходит.

– Не знаю… Я не знаю, что происходит.

– Я не могу понять… Я не могу понять, почему… – Она снова замолчала и уставилась на свои руки.

– Я не понимаю… – снова повторила она.

– Что происходит? – повторила


In [None]:
# text = "анна не могла перестать трястись"
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

# generated_text = model.generate(
#     input_ids=encoded_input['input_ids'],
#     max_length=500,
#     temperature=1.8,
#     do_sample=True,
#     top_k=30,
#     top_p=0.95
# )
# print(f"дано: {text}")
# # print(generated_text)

# prediction = tokenizer.decode(generated_text[0])
# print(prediction)

дано: анна не могла перестать трястись
анна не могла перестать трястись и в изнеможении падать вниз головой на землю. Ей не хотелось плакать или жаловаться, потому что больше плакать не осталось ни слез, ни слов. Ее глаза смотрели куда-то сквозь нее, в какой-то другой миг она могла даже сказать — никуда, ибо была в полном одиночестве. Когда боль от падения постепенно усилилась, боль, пронзавшая все мышцы, стала такой острой (это была всего только жалящая боль), что от желания закричать или закричать громко, без всяких предварительных приготовлений, ее спасал инстинкт самосохранения. И в тот же момент это чувство подсказало ей еще больше, еще больше о многом предупредить и что было сил остановить свои действия: не падать вниз (или даже наоборот — упасть не так, как упали), а лететь высоко-высоко к своему дому, к отцу, к сестре (хотя и это ей было не под силу: ноги все подкашивались, каждая косточка во всем теле ныла), — к сестре! И только одна единственная мысль стучилась в нее — «Я вер

In [None]:
# en_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# en_model = GPT2LMHeadModel.from_pretrained('gpt2')

# text = "help me, i can't stop"
# encoded_input = en_tokenizer(text, return_tensors='pt')
# output = en_model(**encoded_input)

# generated_text = en_model.generate(
#     input_ids=encoded_input['input_ids'],
#     max_length=300,
#     temperature=1.8,
#     do_sample=True,
#     top_k=30,
#     top_p=0.95
# )
# print(f"дано: {text}\n")
# # print(generated_text)

# prediction = en_tokenizer.decode(generated_text[0])
# print(prediction)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


дано: help me, i can't stop

help me, i can't stop smiling at you. Please let me sleep at your hotel room tomorrow for two days. I'm looking forward to you knowing me too good. Thanks!"

"Hi!"

"No thankless thanks for letting you sleep!" shouted Neeb.

They started laughing a little but Nelly looked a little disappointed at what she saw from his reaction too

"No… neeb, what was the most difficult step to taking, your new boyfriend?" asked Yap.

Nyey sighed

"Yeah, the most impossible steps! Oh my.."

The couple exchanged several glances with one another who smiled at all those smiling but no one saw or said anything about Nelly. He just looked sad and embarrassed

"Uh… uh.. I wonder..." said Niey. He was happy now and looked satisfied at the happy expression on the woman and even laughed his heart off too. Nynifu took out two big muffs from a metal box from Neeb's hand like that before throwing some money into his pocket and started working out as Yap called by talking the first ques

In [None]:
"""
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = 'gpt2'  # or 'gpt2-medium', 'gpt2-large', 'gpt2-xl' for larger models
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

model.eval()

input_text = "Your input text goes here"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

output = model.generate(input_ids, max_length=100, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

"""

####Задание 4. Примените один из трансформеров, например BERT, к задаче машинного перевода.


In [None]:
from transformers import pipeline

translator = pipeline("translation", model="t5-small",
                      src_lang='en', tgt_lang='de')
translator("i really need help")

[{'translation_text': 'Ich brauche wirklich Hilfe'}]

#####Automated training loop

In [None]:
pip install sentencepiece



In [None]:
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



[{'translation_text': 'Par défaut pour les threads élargis'}]

In [None]:
pip install datasets

In [None]:
pip install sacrebleu

In [None]:
pip install evaluate

In [None]:
from datasets import load_dataset

raw_dataset = load_dataset('kde4', lang1='en', lang2='fr')
split_datasets = raw_dataset['train'].train_test_split(train_size=0.9, seed=20)
split_datasets['validation'] = split_datasets.pop('test')

In [None]:
split_datasets["train"][1]

{'id': '152754',
 'translation': {'en': 'Default to expanded threads',
  'fr': 'Par défaut, développer les fils de discussion'}}

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



In [None]:
max_length = 128

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets,
                             max_length=max_length, truncation=True)
    return model_inputs

tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/189155 [00:00<?, ? examples/s]

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [None]:
import evaluate
import numpy as np

metric = evaluate.load('sacrebleu')

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.evaluate(max_length=max_length)

In [None]:
trainer.train()

In [None]:
trainer.evaluate(max_length=max_length)

#####Custom training loop

In [None]:
from datasets import load_dataset

raw_dataset = load_dataset('kde4', lang1='en', lang2='fr')
split_datasets = raw_dataset['train'].train_test_split(train_size=0.9, seed=20)
split_datasets['validation'] = split_datasets.pop('test')

In [None]:
max_length = 128

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets,
                             max_length=max_length, truncation=True)
    return model_inputs

tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

In [None]:
from torch.utils.data import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(tokenized_datasets["validation"],
                             collate_fn=data_collator, batch_size=8)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AdamW

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

optimizer = AdamW(model.parameters(), lr=2e-5)



In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

In [None]:
from tqdm.auto import tqdm
import torch

output_dir = "/content/drive/MyDrive/Colab Notebooks/тексты/5_Transformers/trans"
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)