In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/ruT5-base")

model = AutoModelForSeq2SeqLM.from_pretrained("sberbank-ai/ruT5-base")

df = pd.read_csv("detoxified.csv")

In [4]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class ToxicDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels, decoder_attention_mask):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels
        self.decoder_attention_mask = decoder_attention_mask
        
    def __len__(self):
        return int(self.input_ids.size(dim=0))
    
    def __getitem__(self, i):
        return {"input_ids": self.input_ids[i], "attention_mask": self.attention_mask[i], "labels": self.labels[i], "decoder_attention_mask": self.decoder_attention_mask[i]}
    
train_df, eval_df = train_test_split(df, test_size=0.2)

In [5]:
prefix = tokenizer(train_df["detoxified"].values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512, pad_to_multiple_of=8)
train_input_ids, train_attention_mask = prefix["input_ids"], prefix["attention_mask"]
suffix = tokenizer(train_df["toxic"].values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)
train_labels, train_decoder_attention_mask = suffix["input_ids"], suffix["attention_mask"]

prefix = tokenizer(eval_df["detoxified"].values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512, pad_to_multiple_of=8)
eval_input_ids, eval_attention_mask = prefix["input_ids"], prefix["attention_mask"]
suffix = tokenizer(eval_df["toxic"].values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)
eval_labels, eval_decoder_attention_mask = suffix["input_ids"], suffix["attention_mask"]

In [6]:
train_dataset = ToxicDataset(train_input_ids, train_attention_mask, train_labels, train_decoder_attention_mask)
eval_dataset = ToxicDataset(eval_input_ids, eval_attention_mask, eval_labels, eval_decoder_attention_mask)

In [7]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [8]:
args = Seq2SeqTrainingArguments(
    output_dir = "detoxified_model",
    evaluation_strategy = "steps",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 8,
    num_train_epochs = 3,
    save_strategy = "steps",
    save_steps = 500,
    fp16 = True,
    eval_steps = 250,
    dataloader_num_workers = 4,
    group_by_length = True,
    generation_max_length=256
)

trainer = Seq2SeqTrainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer
)

Using amp half precision backend


In [9]:
trainer.train()

***** Running training *****
  Num examples = 39544
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 1851


Step,Training Loss,Validation Loss
250,No log,0.15042
500,0.421900,0.141312
750,0.421900,0.138583
1000,0.160100,0.136297
1250,0.160100,0.135061
1500,0.153000,0.134021
1750,0.153000,0.133495


***** Running Evaluation *****
  Num examples = 9887
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9887
  Batch size = 16
Saving model checkpoint to detoxified_model/checkpoint-500
Configuration saved in detoxified_model/checkpoint-500/config.json
Model weights saved in detoxified_model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in detoxified_model/checkpoint-500/tokenizer_config.json
Special tokens file saved in detoxified_model/checkpoint-500/special_tokens_map.json
Copy vocab file to detoxified_model/checkpoint-500/spiece.model
***** Running Evaluation *****
  Num examples = 9887
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9887
  Batch size = 16
Saving model checkpoint to detoxified_model/checkpoint-1000
Configuration saved in detoxified_model/checkpoint-1000/config.json
Model weights saved in detoxified_model/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in detoxified_model/checkpoint-1000/tokenizer_config.

TrainOutput(global_step=1851, training_loss=0.22739396592464273, metrics={'train_runtime': 4927.1276, 'train_samples_per_second': 24.077, 'train_steps_per_second': 0.376, 'total_flos': 3384740195205120.0, 'train_loss': 0.22739396592464273, 'epoch': 3.0})

In [10]:
tokenizer.decode(model.generate(tokenizer(["Если кто не знает -- ray marching это техника рендеринга сцен, в которой лучи итеративно \"прощупывают\" сцену с помощью лишь функции расстояния до ближайшего объекта. Если алгоритм рассматривает луч в какой-то точке и знает расстояние до сцены, то это значит, что можно пройти это расстояние в направлении луча и ничего не задеть. Алгоритм проходит ровно это расстояние, и приступает к следующей итерации."], return_tensors="pt").input_ids.cuda())[0].cpu(), skip_special_tokens=True)

'если кто не знает -- ray marching это техника рендеринга сцен, то это'

In [11]:
model.save_pretrained("detoxified_release")

Configuration saved in detoxified_release/config.json
Model weights saved in detoxified_release/pytorch_model.bin


In [91]:
test_input = "на его место встал логичный Zeliboba Score"

tokenizer.decode(model.generate(tokenizer([test_input], return_tensors="pt").input_ids.cuda(),
                                do_sample=True,
                                top_k=10,
                                temperature=1.5)[0].cpu(), skip_special_tokens=True)

'на его место встал логичный eliboba Score   '

In [6]:
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/ruT5-base")

model = AutoModelForSeq2SeqLM.from_pretrained("detoxified_release").cuda()

df = pd.read_csv("detoxified.csv")