In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/ruT5-large")

model = AutoModelForSeq2SeqLM.from_pretrained("sberbank-ai/ruT5-large")

df = pd.read_csv("detoxified.csv")

In [4]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class ToxicDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels, decoder_attention_mask):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels
        self.decoder_attention_mask = decoder_attention_mask
        
    def __len__(self):
        return int(self.input_ids.size(dim=0))
    
    def __getitem__(self, i):
        return {"input_ids": self.input_ids[i], "attention_mask": self.attention_mask[i], "labels": self.labels[i], "decoder_attention_mask": self.decoder_attention_mask[i]}
    
train_df, eval_df = train_test_split(df, test_size=0.2)

In [5]:
prefix = tokenizer(train_df["detoxified"].values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512, pad_to_multiple_of=8)
train_input_ids, train_attention_mask = prefix["input_ids"], prefix["attention_mask"]
suffix = tokenizer(train_df["toxic"].values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)
train_labels, train_decoder_attention_mask = suffix["input_ids"], suffix["attention_mask"]

prefix = tokenizer(eval_df["detoxified"].values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512, pad_to_multiple_of=8)
eval_input_ids, eval_attention_mask = prefix["input_ids"], prefix["attention_mask"]
suffix = tokenizer(eval_df["toxic"].values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)
eval_labels, eval_decoder_attention_mask = suffix["input_ids"], suffix["attention_mask"]

In [6]:
train_dataset = ToxicDataset(train_input_ids, train_attention_mask, train_labels, train_decoder_attention_mask)
eval_dataset = ToxicDataset(eval_input_ids, eval_attention_mask, eval_labels, eval_decoder_attention_mask)

In [7]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [8]:
args = Seq2SeqTrainingArguments(
    output_dir = "detoxified_model_large",
    evaluation_strategy = "steps",
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 8,
    num_train_epochs = 2,
    save_strategy = "steps",
    save_steps = 500,
    fp16 = True,
    eval_steps = 250,
    dataloader_num_workers = 4,
    group_by_length = True,
    generation_max_length=256
)

trainer = Seq2SeqTrainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer
)

Using amp half precision backend


In [9]:
trainer.train()

***** Running training *****
  Num examples = 39544
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 2470


Step,Training Loss,Validation Loss
250,No log,0.141311
500,0.302700,0.135826
750,0.302700,0.13251
1000,0.142700,0.130715
1250,0.142700,0.129663
1500,0.138600,0.128569
1750,0.138600,0.127743
2000,0.135600,0.127186
2250,0.135600,0.126916


***** Running Evaluation *****
  Num examples = 9887
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9887
  Batch size = 8
Saving model checkpoint to detoxified_model_large/checkpoint-500
Configuration saved in detoxified_model_large/checkpoint-500/config.json
Model weights saved in detoxified_model_large/checkpoint-500/pytorch_model.bin
tokenizer config file saved in detoxified_model_large/checkpoint-500/tokenizer_config.json
Special tokens file saved in detoxified_model_large/checkpoint-500/special_tokens_map.json
Copy vocab file to detoxified_model_large/checkpoint-500/spiece.model
***** Running Evaluation *****
  Num examples = 9887
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9887
  Batch size = 8
Saving model checkpoint to detoxified_model_large/checkpoint-1000
Configuration saved in detoxified_model_large/checkpoint-1000/config.json
Model weights saved in detoxified_model_large/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in 

TrainOutput(global_step=2470, training_loss=0.1708074206765364, metrics={'train_runtime': 9932.5785, 'train_samples_per_second': 7.962, 'train_steps_per_second': 0.249, 'total_flos': 8023935909888000.0, 'train_loss': 0.1708074206765364, 'epoch': 2.0})

In [10]:
tokenizer.decode(model.generate(tokenizer(["Если кто не знает -- ray marching это техника рендеринга сцен, в которой лучи итеративно \"прощупывают\" сцену с помощью лишь функции расстояния до ближайшего объекта. Если алгоритм рассматривает луч в какой-то точке и знает расстояние до сцены, то это значит, что можно пройти это расстояние в направлении луча и ничего не задеть. Алгоритм проходит ровно это расстояние, и приступает к следующей итерации."], return_tensors="pt").input_ids.cuda())[0].cpu(), skip_special_tokens=True)

'если кто не знает -- ray marching это техника рендеринга сцен, в которой'

In [11]:
model.save_pretrained("detoxified_release_large")

Configuration saved in detoxified_release_large/config.json
Model weights saved in detoxified_release_large/pytorch_model.bin


In [16]:
test_input = "Хороший человек"

tokenizer.decode(model.generate(tokenizer([test_input], return_tensors="pt").input_ids.cuda(),
                                min_length=len(test_input) + 5,
                                max_length=512,
                                num_beams=5,
                                repetition_penalty=2.0)[0].cpu(), skip_special_tokens=True)

'пидор конченый!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'

In [21]:
special = ["!", "+", "d", "(", ")"]
for sp in special:
    print(sp, train_df.apply(lambda row: row['toxic'].count(sp) - row['detoxified'].count(sp), axis=1).mean())

! 0.4872294153348169
+ 0.011961359498280397
d 0.03302650212421606
( 0.029916042888933847
) 0.07037730123406838


In [None]:
train_df.apply(lambda row: row['toxic'].count('!') - row['detoxified'].count('!'), axis=1).mean()

In [19]:
train_df.columns

Index(['Unnamed: 0', 'toxic', 'detoxified'], dtype='object')