# Preprocess

In [1]:
!pip install pymystem3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import scipy
import nltk
import re
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score

In [3]:
%load_ext autoreload
%autoreload 2

In [5]:
data = pd.read_csv('./ru_toxic_dataset.csv')
data.head()

Unnamed: 0,comment,toxic
0,дворника надо тоже уничтожить!,1.0
1,"моя старшая неделю шипела, не принимала подкид...",0.0
2,полностью с вами согласна!,0.0
3,"хоть ногу вверх, ничего не изменится",0.0
4,а что значит - левого ребенка?,0.0


In [6]:
data['toxic'] = data['toxic'].astype(int)
data = data.replace('\n',' ', regex=True)
data = data.replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
data['comment'] = data['comment'].str.lower()
data = data.replace('[^а-яА-я]', ' ', regex=True)

In [7]:
data.head()

Unnamed: 0,comment,toxic
0,дворника надо тоже уничтожить,1
1,моя старшая неделю шипела не принимала подкид...,0
2,полностью с вами согласна,0
3,хоть ногу вверх ничего не изменится,0
4,а что значит левого ребенка,0


In [8]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# In this kernel I'll show you how easy it is to preprocess the text in Russian.

# You need to install two libraries:
# * nltk - to get russian stopwords
# * pymystem3 - for lemmatization

# download stopwords corpus, you need to run it once
import nltk
nltk.download("stopwords")
#--------#

from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

#Create lemmatizer and stopwords list
mystem = Mystem()
russian_stopwords = stopwords.words("russian")

#Preprocess function
def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]

    text = " ".join(tokens)

    return text

#Examples
preprocess_text("Ну что сказать, я вижу кто-то наступил на грабли, Ты разочаровал меня, ты был натравлен.")
#> 'сказать видеть кто-то наступать грабли разочаровывать натравлять'

preprocess_text("По асфальту мимо цемента, Избегая зевак под аплодисменты. Обитатели спальных аррондисманов")
#> 'асфальт мимо цемент избегать зевака аплодисменты обитатель спальный аррондисман'

#Thats all :)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


'асфальт мимо цемент избегать зевака аплодисменты обитатель спальный аррондисман'

In [10]:
mystem = Mystem()


cleaned_text = []
for i in tqdm(range(len(data))):
    cleaned_text.append(preprocess_text(data["comment"][i]))


100%|██████████| 163187/163187 [02:22<00:00, 1144.88it/s]


In [11]:
cleaned_text[0]

'дворник уничтожать'

In [12]:
data['comment'] = cleaned_text

In [13]:
data.head()

Unnamed: 0,comment,toxic
0,дворник уничтожать,1
1,старший неделя шипеть принимать подкидыш котор...,0
2,полностью согласный,0
3,нога вверх ничто изменяться,0
4,значить левый ребенок,0


In [14]:
nltk.download('punkt')
data['tokenized_comment'] = data['comment'].apply(nltk.word_tokenize)
data.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,comment,toxic,tokenized_comment
0,дворник уничтожать,1,"[дворник, уничтожать]"
1,старший неделя шипеть принимать подкидыш котор...,0,"[старший, неделя, шипеть, принимать, подкидыш,..."
2,полностью согласный,0,"[полностью, согласный]"
3,нога вверх ничто изменяться,0,"[нога, вверх, ничто, изменяться]"
4,значить левый ребенок,0,"[значить, левый, ребенок]"


In [15]:
data.to_csv("lemmatized.csv")

In [16]:
data['tokenized_comment'][0]

['дворник', 'уничтожать']

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the TfidfVectorizer object
vectorizer = TfidfVectorizer()

tokenized_comments = data['tokenized_comment'].apply(lambda x: ' '.join(x))
tfidf_vectors = vectorizer.fit_transform(tokenized_comments)

In [18]:
with open("vocab.txt") as f:
    vocab_bert = [r.rstrip() for r in f.readlines()]
len(vocab_bert)

100792

In [19]:
X = tfidf_vectors
y = data['toxic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

***Get weiths of toxicity***

In [20]:
l2_coef = 0.5
alpha = 2
beta = 2

clf = LogisticRegression(C=1/(2*l2_coef), solver='sag')
clf.fit(X_train, y_train)

y_pred_val = clf.predict(X_val)
y_pred_test = clf.predict(X_test)

threshold = 0.5

print("val acc:", balanced_accuracy_score(y_val, y_pred_val))
print("test acc:", balanced_accuracy_score(y_test, y_pred_test))

val acc: 0.843433479999758
test acc: 0.839596660110323


Все слова с весами:

In [21]:
weights = clf.coef_[0]
word_weights = {word: weights[i] for word, i in vectorizer.vocabulary_.items()}
word_weight_pairs = [(word, weight) for word, weight in word_weights.items()]
df = pd.DataFrame(word_weight_pairs, columns=['word', 'weight'])
print(df.head(10))

         word    weight
0     дворник  0.071199
1  уничтожать  3.701082
2     старший -0.701989
3      неделя -0.717012
4      шипеть -0.195532
5   принимать -1.514929
6    подкидыш -0.145817
7     который  1.408997
8   приносить -0.393550
9        китя  0.000000


In [22]:
df.to_csv("weights.csv")

In [23]:
df = pd.read_csv("weights.csv")

In [24]:
df.shape

(97876, 3)

Только плохие:

In [25]:
filtered_df = df[df['weight'] > 1]
print(filtered_df.head(10))

     Unnamed: 0        word    weight
1             1  уничтожать  3.701082
7             7     который  1.408997
22           22        нога  1.335269
35           35       петух  3.182605
39           39       пацан  1.117959
80           80     сдыхать  5.047806
81           81       мразь  9.975001
122         122   модератор  1.306574
123         123   заебывать  5.784323
129         129      всякий  1.267273


In [26]:
filtered_df.shape

(935, 3)

In [27]:
filtered_df.to_csv("filtered_df.csv")

In [28]:
df_dict = {"word": vocab_bert, "weight": np.zeros(len(vocab_bert))}

In [29]:
all_df = pd.DataFrame(df_dict)

In [30]:
all_df2 = all_df.merge(df, how='outer', on="word")

In [31]:
all_df2 = all_df2.fillna(0)
all_df2['weight'] = all_df2['weight_x'] + all_df2['weight_y']

In [32]:
pos_words = df[df['weight'] <= 1]
filtered_df = df[df['weight'] > 1]


# CondBert

In [102]:
import torch
from collections import defaultdict


def group_by_first_token(texts, tokenizer):
    encoded_seqs = []
    for text in texts:
        encoded_seqs.append(tokenizer.encode(text, add_special_tokens=False))
    grouped_seqs = defaultdict(list)
    for seq in encoded_seqs:
        first_token = seq[0]
        grouped_seqs[first_token].append(seq)
    return grouped_seqs


class CondBert:
    def __init__(self, model, tokenizer, device, toxic_words, word2coef, token_toxic_weights):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.word2coef = word2coef
        self.v = dict((v, k) for k, v in tokenizer.vocab.items())
        self.toxic_weights = torch.tensor(token_toxic_weights).to(self.device)
        self.neg_complex_tokens = group_by_first_token(toxic_words, self.tokenizer)
        self.mask_index = self.tokenizer.convert_tokens_to_ids("[MASK]")

    def create_mask_fast(self, input_text, bad_words=None):
        if bad_words is None:
            bad_words = self.negative_complex_tokens

        sentences = [self.tokenizer.encode(input_text, add_special_tokens=True)]
        sentences_torch = torch.tensor(sentences)
        masks = torch.zeros_like(sentences_torch)

        for sentence_id, sentence in enumerate(sentences):
            masks = self.process_bad_words(sentence, sentence_id, masks, bad_words)

        return sentences_torch, masks


    def process_bad_words(self, sentence, sentence_id, masks, bad_words):
        for first_token_id, token in enumerate(sentence):
            for hypothesis in bad_words.get(token, []):
                if sentence[first_token_id: (first_token_id + len(hypothesis))] == hypothesis:
                    for step in range(len(hypothesis)):
                        masks[sentence_id, first_token_id + step] = 1
                    for offset, next_token in enumerate(sentence[(first_token_id + len(hypothesis)):]):
                        if self.tokenizer.convert_ids_to_tokens(next_token).startswith('##'):
                            masks[sentence_id, first_token_id + len(hypothesis) + offset] = 1
                        else:
                            break
        return masks




    def translate(self, input_sentence, toxic_weight=15, contrast_weight=0):
        tokens, attention_mask  = self.create_mask_fast(input_sentence, bad_words=self.neg_complex_tokens)
        tokens[attention_mask == 1] = self.mask_index
        tokens = tokens.to(self.device)
        self.model.eval()
        predictions = self.model(tokens, token_type_ids=torch.zeros_like(tokens))
        neg_predictions = self.model(tokens, token_type_ids=torch.ones_like(tokens) * (1 if contrast_weight else 0))
        for i in range(tokens.shape[0]):
            log_probs = predictions[-1][i][attention_mask[i] == 1]
            if toxic_weight:
                log_probs -= self.toxic_weights * toxic_weight
            if contrast_weight:
                neg_log_probs = neg_predictions[-1][i][attention_mask[i] == 1]
                scores = torch.softmax(log_probs - neg_log_probs * contrast_weight, -1)
            else:
                scores = torch.softmax(log_probs, -1)
            tokens[i, attention_mask[i] == 1] = scores.argmax(dim=1)
        output = self.tokenizer.convert_tokens_to_string([self.tokenizer.convert_ids_to_tokens(token.item()) for token in tokens[0, 1:-1]])
        return output.split('[SEP] [CLS] ')[-1]

In [None]:
!pip install transformers


In [35]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertForMaskedLM

from tqdm import tqdm


model = BertForMaskedLM.from_pretrained('DeepPavlov/rubert-base-cased-conversational')


Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-conversational')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

In [37]:
token_toxicities = all_df2['weight'].to_numpy()
token_toxicities = token_toxicities[:119547]
# token_toxicities = np.pad(token_toxicities, (0, 119547-all_df2.shape[0]))

In [38]:
word2coef = all_df2[['word', 'weight']].set_index('word').to_dict()
word2coef = word2coef['weight']

In [39]:
token_toxicities_e = 1/(1+np.exp(token_toxicities))

In [40]:
token_toxicities_e

array([0.5       , 0.5       , 0.5       , ..., 0.26722567, 0.50527032,
       0.51194991])

In [103]:
toxic_words = filtered_df['word']  # Example list of toxic words
input_sentence = "не дай бог моя дочь так оденется убью нахуй палкой"
model = model.to('cpu')
rewriter = CondBert(model, tokenizer, "cpu", toxic_words=toxic_words, word2coef=word2coef, token_toxic_weights=token_toxicities_e)
translated_sentence = rewriter.translate(input_sentence)
print("Input sentence:", input_sentence)
print("Translated sentence:", translated_sentence)


Input sentence: не дай бог моя дочь так оденется убью нахуй палкой
Translated sentence: не дай бог моя дочь так оденется убью чучело палкой


# Finetune

In [104]:
X = data['comment']
y = data['toxic']
# split data into train and test
X_train, X_val, y_train, y_val = train_test_split(X, y.to_numpy(dtype=np.int64), random_state=42)

In [105]:
from torch.utils.data import DataLoader, Dataset
model.train()

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=16,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": encoding["input_ids"].flatten()
        }


train_dataset = CustomDataset(X_train.to_list(), y_train, tokenizer)
val_dataset = CustomDataset(X_val.to_list(), y_val, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(val_dataset, batch_size=16)

In [None]:
from torch import nn
device = "cpu"
model = model.to(device)
loss = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), 3e-5)
toxic_weights = torch.tensor(token_toxicities_e).to(device)
for epoch in tqdm(range(3)):
    train_loss = 0
    i = 0
    for batch in tqdm(iter(train_dataloader)):
        X = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        y = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(X, attention_mask=mask,
                        labels=y)

        idx = outputs.logits.argmax(2)
        loss = outputs.loss + (toxic_weights[idx[mask==1]] ** 2).sum()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        i += len(X)
    train_loss /= i
  #Test
    loss_test = 0
    elements = 0
    for item in iter(test_dataloader):
        X = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        y = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(X, attention_mask=mask,
                        labels=y)

        idx = outputs.logits.argmax(2)
        loss = outputs.loss + (toxic_weights[idx[mask==1]] ** 2).sum()
        loss_test += loss.item()
        elements += len(X)
    loss_test = loss_test / elements
    print("\nEpoch", epoch, "| Train loss", train_loss, "| Test loss", loss_test)


In [108]:
from google.colab import drive
drive.mount('/content/drive')

sources_path = "/content/drive/MyDrive"

Mounted at /content/drive


In [None]:
torch.save(model, sources_path + '/model.pkl')

# Testing


In [111]:
model_f = torch.load(sources_path + '/model.pkl', map_location=torch.device('cpu'))

In [114]:
input_sentence = "не дай бог моя дочь так оденется убью нахуй палкой"
model_f = model_f.to('cpu')
CondBert(model_f, tokenizer, "cpu", toxic_words=toxic_words, word2coef=word2coef, token_toxic_weights=token_toxicities_e)
translated_sentence = rewriter.translate(input_sentence)
print("Input sentence:", input_sentence)
print("Translated sentence:", translated_sentence)

Input sentence: не дай бог моя дочь так оденется убью нахуй палкой
Translated sentence: не дай бог моя дочь так оденется убью чучело палкой


In [None]:
# prepare df
test_comments1 = data[data['toxic'] == 1]
test_comments = test_comments1.reset_index()
idxs = np.random.randint(0, test_comments.shape[0], size=10000)
test_comments = test_comments.iloc[idxs]
test_comments = test_comments[~(test_comments['comment'] == '')]
test_comments.to_csv("test_comments.csv")

In [120]:
def test(model, df, out_file):
  rewriter = CondBert(model_f, tokenizer, "cpu", toxic_words=toxic_words, word2coef=word2coef, token_toxic_weights=token_toxicities_e)
  translated = []
  for i,comment in tqdm(enumerate(df['comment'])):
      try:
          translated.append(rewriter.translate(comment))
      except:
          translated.append(comment)
  df['translated'] = translated
  df.to_csv(out_file)


In [None]:
test_comments = pd.read_csv('test_comments.csv')

test(model, test_comments, 'test_comments_zero_shooting.csv')
test(model_f, test_comments, 'test_comments_finetuned.csv')

51it [00:17,  2.89it/s]