In [3]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
import os 
# Expose only the device with index "1" to CUDA for this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

## Детокс руками

### Load data

In [3]:
# First source: a CSV file; later cells rely on it having a "toxic" flag column.
df1 = pd.read_csv(filepath_or_buffer="labeled.csv")

In [4]:
# Second source: a text file with one sample per line, shaped "<label> <text>".
# Only the first space separates label from text, so the text may contain spaces.
list2 = []

# NOTE(review): the encoding is assumed to be UTF-8 — confirm against the file.
with open("dataset.txt", "r", encoding="utf-8") as file:
    for row in file:
        # Strip just the line terminator. The original `[:-1]` slice also cut a
        # real character off a last line that has no trailing newline, and
        # returned the truncated line as both label and text when no space
        # was present.
        label, _, text = row.rstrip("\n").partition(" ")
        list2.append([label, text])
df2 = pd.DataFrame(list2, columns=["label", "text"])

### Drop non-toxic rows

In [5]:
# Keep the rows whose flag equals 1.0, discard the flag column, and give the
# single column that is left the name "toxic".
df1 = df1.loc[df1["toxic"].eq(1.0)].drop(columns="toxic")
df1.columns = ["toxic"]

In [6]:
# Discard rows labelled exactly "__label__NORMAL", remove the label column,
# and expose the remaining text column under the name "toxic".
df2 = df2.loc[df2["label"].ne("__label__NORMAL")].drop(columns="label")
df2.columns = ["toxic"]

In [7]:
# Stack both sources row-wise and renumber the rows from 0.
df = pd.concat((df1, df2), ignore_index=True)

### Detoxify

In [9]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

# The tokenizer is loaded from the base checkpoint, the weights from the detox one.
base_model_name, model_name = 'sberbank-ai/ruT5-base', 'SkolkovoInstitute/ruT5-base-detox'

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the weights first, then move the model to the GPU.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.cuda()

In [19]:
# Tokenize two sample sentences into one batch tensor on the GPU.
# padding=True lets sentences with different token counts be stacked into a
# single rectangular tensor; without it the call only works when the lengths
# happen to match.
input_ids = tokenizer(
    ["Как дела, мудила?", "Что делаешь, мудила?"],
    return_tensors="pt",
    padding=True,
).input_ids.cuda()

In [20]:
# Generate once for the whole batch and then decode each sequence. The original
# called model.generate on the full batch again for every printed sentence.
generated = model.generate(input_ids).cpu()
for sequence in generated:
    print(tokenizer.decode(sequence, skip_special_tokens=True))

Как дела?
Что делаешь?


In [25]:
from torch.utils.data import Dataset, DataLoader

class ToxicDataset(Dataset):
    """Map-style dataset that serves the "toxic" column of a DataFrame.

    Args:
        df: DataFrame with a "toxic" column. Items are addressed by row
            position (``iloc``), not by index label.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        # Read the instance's own frame: the original used the notebook-global
        # `df`, silently ignoring whatever was passed to the constructor.
        return self.df.shape[0]

    def __getitem__(self, i):
        """Return the "toxic" value of the row at position ``i``."""
        return self.df.iloc[i]["toxic"]
    
# Wrap the combined frame and serve it in batches of 128 items; no shuffle
# argument is passed.
dataset = ToxicDataset(df)
dataloader = DataLoader(dataset=dataset, batch_size=128)

In [28]:
# Run every batch through the model and collect the decoded outputs in
# dataloader order.
detoxified = []

for batch in tqdm(dataloader):
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512, pad_to_multiple_of=8)
    input_ids = encoded.input_ids.cuda()
    # The batch is padded, so hand the padding mask to generate explicitly
    # instead of discarding it and leaving the model to work it out.
    attention_mask = encoded.attention_mask.cuda()
    # NOTE(review): no generation length is passed here, so whatever limit the
    # model's default generation settings impose applies — confirm that long
    # comments are not cut short.
    for output_id in model.generate(input_ids, attention_mask=attention_mask).cpu():
        detoxified.append(tokenizer.decode(output_id, skip_special_tokens=True))

100%|██████████| 387/387 [12:31<00:00,  1.94s/it]


In [29]:
# Attach the generated texts as a new column on `df` (mutated in place);
# `detoxified` must hold one entry per row, in row order.
df["detoxified"] = detoxified

In [37]:
# Persist the toxic/detoxified pairs; the row index is written as well.
df.to_csv(path_or_buf="detoxified.csv")

## Готовый датасет

In [6]:
# Load the tab-separated file; from here on `df` refers to this frame.
df = pd.read_csv("train.tsv", delimiter='\t')

In [10]:
# Remove the second and third neutral-comment columns.
df = df.drop(columns=["neutral_comment2", "neutral_comment3"])

In [13]:
# Promote the "index" column to the frame's index. Reassigning instead of using
# inplace=True keeps this consistent with the neighbouring cells, which all
# rebind `df` to a new frame.
df = df.set_index("index")

In [25]:
# Use the same column names as the frame saved to "detoxified.csv".
column_names = {
    "toxic_comment": "toxic",
    "neutral_comment1": "detoxified",
}
df = df.rename(columns=column_names)

In [27]:
# Persist the renamed frame; the index set earlier is written as well.
df.to_csv(path_or_buf="assessed.csv")