In [4]:
# !pip install transformers
# !pip install sentence_transformers

In [5]:
from tqdm import tqdm
from urllib.request import urlretrieve
import zipfile
import os
import pandas as pd
import warnings
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from torchtext.vocab import build_vocab_from_iterator
import ast
import re
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as Fun
import torch.nn as nn
import torch
import numpy as np
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from sentence_transformers import SentenceTransformer, util
from IPython.display import clear_output

torch.manual_seed(420)

nltk.download('stopwords')
nltk.download('punkt')
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Classifier

Building binary classifier for toxic/non-toxic sentences, based on preprocessed raw dataset

In [6]:
directory_binary_prep = "./data/interm"
filename_binary_prep = directory_binary_prep + "/toxic_binary_preprocessed.csv"

train_bin_prep = pd.read_csv(filename_binary_prep, index_col = 0)
train_bin_prep

Unnamed: 0,sentence,label
0,"['if', 'alkar', 'is', 'flooding', 'her', 'with...",0
1,"['now', 'you', 're', 'getting', 'nasty']",0
2,"['well', 'we', 'could', 'spare', 'your', 'life...",0
3,"['ah', 'monkey', 'you', 've', 'got', 'to', 'sn...",0
4,"['i', 've', 'got', 'orders', 'to', 'put', 'her...",0
...,...,...
1155549,"['you', 'didn', 't', 'know', 'that', 'estelle'...",1
1155550,"['you', 'd', 'be', 'sucked', 'out', 'of', 'you...",0
1155551,"['i', 'really', 'can', 't', 'take', 'this']",0
1155552,"['they', 'said', 'i', 'was', 'a', 'hero', 'but...",0


In [7]:
# Train/Val split
ratio = 0.2
train_classif, val_classif = train_test_split(
    train_bin_prep, stratify=train_bin_prep['label'], test_size=0.2,
    random_state=420
)

In [8]:
# Generating vocab
def yield_tokens(df):
    for _, sample in df.iterrows():
        yield sample.to_list()[0]


# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

vocab = build_vocab_from_iterator(yield_tokens(train_classif), specials=special_symbols)
vocab.set_default_index(UNK_IDX)

In [9]:
# Create dataloaders
torch.manual_seed(420)

text_pipeline = lambda x: vocab(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for _text, _label in batch:
        label_list.append(_label)
        text_list.append(torch.tensor(text_pipeline(ast.literal_eval(_text)), dtype=torch.int64))

    text_list = pad_sequence(text_list, batch_first=True, padding_value=1)
    return torch.tensor(label_list, dtype=torch.long).to(device), torch.tensor(text_list, dtype=torch.long).to(device), torch.tensor(offsets).to(device)

train_dataloader = DataLoader(
    train_classif.to_numpy(), batch_size=128, shuffle=True, collate_fn=collate_batch
)

val_dataloader = DataLoader(
    val_classif.to_numpy(), batch_size=128, shuffle=False, collate_fn=collate_batch
)

In [10]:
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        num_words = len(vocab.get_itos())
        embed_dim = 1024
        self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=embed_dim)
        self.dropout = nn.Dropout(p=0.6)

        self.lstm = nn.LSTM(embed_dim, 128, bidirectional=True, batch_first=True, num_layers=2, dropout=0.5)
        # output layer is a layer which has only one output
        # input(512) = 128+128 for mean and same for max pooling
        self.out = nn.Sequential(
            nn.Linear(512, num_classes),
            nn.Softmax()
        )

    def forward(self, text):
        x = self.embedding(text)
        x = self.dropout(x)
        # move the embedding output to lstm
        x,_ = self.lstm(x)
        # apply mean and max pooling on lstm output
        avg_pool = torch.mean(x,1)
        max_pool, _ = torch.max(x,1)
        # concatenate mean and max pooling this is why 512
        # 128 for each direction = 256
        # avg_pool = 256, max_pool = 256
        out = torch.cat((avg_pool,max_pool), 1)
        # pass through the output layer and return the output
        out = self.out(out)
        return out

In [11]:
def train_one_epoch_classif(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    for i, batch in loop:
        labels, texts, offsets = batch
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward pass
        outputs = model(texts)
        loss = loss_fn(outputs, labels)

        # backward pass
        loss.backward()

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss/(i * len(labels))})

def val_one_epoch_classif(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt'
):

    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            labels, texts, offsets = batch

            # forward pass
            outputs = model(texts)
            # loss calculation
            loss = loss_fn(outputs, labels)

            _, predicted = outputs.data.max(1, keepdim=True)
            total += labels.size(0)
            correct += predicted.eq(labels.data.view_as(predicted)).sum()

            val_loss += loss
            loop.set_postfix({"loss": val_loss/total, "acc": correct / total})

        if correct / total > best_so_far:
            torch.save(model.state_dict(), ckpt_path)
            return correct / total

    return best_so_far

In [13]:
# Collate batch for test data
def collate_batch_test(batch):
    text_list, offsets = [], [0]
    for _text in batch:
        print(_text[0])
        text_list.append(torch.tensor(text_pipeline(_text[0]), dtype=torch.int64))

    text_list = pad_sequence(text_list, batch_first=True, padding_value=1)
    return torch.tensor(text_list, dtype=torch.long).to(device), torch.tensor(offsets).to(device)

In [14]:
def to_test_df(sentence):
    return_df = pd.DataFrame([], columns=['sentence'])
    for i in range(len(sentence)):
        tokenized = word_tokenize(sentence[i].lower())
        tmp = []
        for word in tokenized:
            result = re.match('^[\W]*$', word)
            if result is None:
                tmp.append(word)
        return_df = return_df.append({"sentence": tmp},
                                            ignore_index = True)
    return return_df

def predict(
    model,
    loader,
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc="Predictions:",
        leave=True,
    )
    predictions = []
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            texts, offsets = batch

            # forward pass and loss calculation
            outputs = model(texts)

            _, predicted = torch.max(outputs.data, 1)
            predictions += predicted.detach().cpu().tolist()

    return predictions

In [15]:
epochs = 3
model = TextClassificationModel(len(train_bin_prep['label'].unique())).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
loss_fn = nn.CrossEntropyLoss()

In [16]:
best = -float('inf')
for epoch in range(epochs):
    train_one_epoch_classif(model, train_dataloader, optimizer, loss_fn, epoch_num = epoch)
    best = val_one_epoch_classif(model, val_dataloader, loss_fn, epoch, best_so_far=best, ckpt_path = "best_classifier.pt")
    print(best)

Epoch 0: train: 100%|██████████| 7223/7223 [03:47<00:00, 31.77it/s, loss=0.0256]
Epoch 0: val: 100%|██████████| 1806/1806 [00:35<00:00, 51.53it/s, loss=tensor(0.0054, device='cuda:0'), acc=tensor(0.5162, device='cuda:0')]


tensor(0.5162, device='cuda:0')


Epoch 1: train: 100%|██████████| 7223/7223 [03:46<00:00, 31.92it/s, loss=0.0256]
Epoch 1: val: 100%|██████████| 1806/1806 [00:36<00:00, 50.06it/s, loss=tensor(0.0054, device='cuda:0'), acc=tensor(0.5142, device='cuda:0')]


tensor(0.5162, device='cuda:0')


Epoch 2: train: 100%|██████████| 7223/7223 [03:46<00:00, 31.91it/s, loss=0.0256]
Epoch 2: val: 100%|██████████| 1806/1806 [00:35<00:00, 50.69it/s, loss=tensor(0.0054, device='cuda:0'), acc=tensor(0.5150, device='cuda:0')]

tensor(0.5162, device='cuda:0')





In [17]:
# Loading pretrained model from file
ckpt = torch.load("best_classifier.pt")
model.load_state_dict(ckpt)

<All keys matched successfully>

In [18]:
# Testing model on some arbitary sentences
test_sentence = ["I'm fucking hate you, stupid idiot!",
                 "Shut up, bitch",
                 "This is a bullshit",
                 "Hello"]
test = to_test_df(test_sentence)
test_dataloader = DataLoader(
    test.to_numpy(), batch_size=1, shuffle=True, collate_fn=collate_batch_test
)
predictions = predict(model, test_dataloader)
predictions

Predictions:: 100%|██████████| 4/4 [00:00<00:00, 243.59it/s]

['hello']
['this', 'is', 'a', 'bullshit']
['shut', 'up', 'bitch']
['i', "'m", 'fucking', 'hate', 'you', 'stupid', 'idiot']





[0, 0, 0, 0]

This model showed not very good performance on classifying sentence as toxic/non-toxic

# Masker

In [19]:
# Dataset for toxic words masking
directory_toxic_span_compress = "./data/interm"
filename_toxic_span_compress = directory_toxic_span_compress + "/toxic_span_compressed.csv"

train_mask = pd.read_csv(filename_toxic_span_compress, index_col = 0)
train_mask

Unnamed: 0,sentence_id,tokens,toxic?
0,0,another,False
1,0,violent,True
2,0,and,True
3,0,aggressive,True
4,0,immigrant,True
...,...,...,...
272441,7938,out,False
272442,7938,of,False
272443,7938,women,True
272444,7938,'s,True


In [20]:
# Train/Val split
ratio = 0.2
train_split, val_split = train_test_split(range(train_mask['sentence_id'].max()),
                                          test_size=ratio, random_state=420)

In [21]:
train_mask_dataframe = train_mask[train_mask['sentence_id'].isin(train_split)]
val_mask_dataframe = train_mask[train_mask['sentence_id'].isin(val_split)]

In [22]:
class ToxicSpanDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe: pd.DataFrame, vocab = None, max_size=100):
        self.dataframe = dataframe
        self._preprocess()
        self.vocab = vocab or self._create_vocab()
        self._postprocess()

    def _preprocess(self):

        self.dataframe['tokens'] = self.dataframe['tokens'].apply(str.lower)
        self.sentences = [self._get_sentence(idx) for idx in list(self.dataframe['sentence_id'].unique())]
        if 'toxic?' in self.dataframe.columns:
            self.tags = [self._get_labels(idx) for idx in list(self.dataframe['sentence_id'].unique())]

    def _postprocess(self):
        for i, sentence in enumerate(self.sentences):
            self.sentences[i] = self.vocab(sentence)

    def _create_vocab(self):
        vocab = build_vocab_from_iterator(self.sentences,
                                          specials=special_symbols)
        vocab.set_default_index(UNK_IDX)
        return vocab

    def _get_sentence(self, index: int) -> list:
        sent = list(self.dataframe.loc[self.dataframe['sentence_id'] == index]['tokens'])
        return sent

    def _get_labels(self, index: int) -> list:
        tags = list(self.dataframe.loc[self.dataframe['sentence_id'] == index]['toxic?'])
        tags = [int(tag) for tag in tags]
        return tags

    def __getitem__(self, index) -> tuple[list, list]:
        if 'toxic?' in self.dataframe.columns:
            return (self.sentences[index], self.tags[index])
        else:
            return self.sentences[index]

    def __len__(self) -> int:
        return len(self.sentences)

In [23]:
# Create train and val datasets
train_dataset = ToxicSpanDataset(train_mask_dataframe)
val_dataset = ToxicSpanDataset(val_mask_dataframe, vocab=train_dataset.vocab)

In [24]:
# Creating dataloaders
batch_size = 128
max_size = 50

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def collate_batch(batch: list):
    sentences_batch, postags_batch = [], []
    for _sent, _postags in batch:
        sentences_batch.append(torch.tensor(_sent, dtype=torch.int64))
        postags_batch.append(torch.tensor(_postags, dtype=torch.int64))

    sentences_batch = pad_sequence(sentences_batch, batch_first=True, padding_value=1).T
    postags_batch = pad_sequence(postags_batch, batch_first=True, padding_value=0).T
    postags_batch = torch.unsqueeze(postags_batch, 2)

    return torch.tensor(sentences_batch, dtype=torch.long).to(device), torch.tensor(postags_batch, dtype=torch.long).to(device)

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
val_dataloader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)

In [25]:
for batch in train_dataloader:
    inp, out = batch
    print(inp.shape)
    print(out.shape)
    break

torch.Size([179, 128])
torch.Size([179, 128, 1])


In [26]:
# Main model
class ToxicTagger(nn.Module):
    def __init__(self, in_dim, out_dim):

        super().__init__()
        num_words = len(train_dataset.vocab.get_itos())
        embed_dim = 1024
        h_layer = 128
        self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=embed_dim)
        self.dropout = nn.Dropout(p=0.5)

        self.lstm = nn.LSTM(embed_dim, h_layer, bidirectional=False, batch_first=True, num_layers=2, dropout=0.5)
        self.out = nn.Sequential(
            nn.Linear(h_layer, 12)
        )

    def forward(self, text):

        x = self.embedding(text)
        x = self.dropout(x)
        x,_ = self.lstm(x)
        output = self.out(x)
        return output

In [27]:
def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    total = 0
    for i, batch in loop:
        texts, labels = batch

        optimizer.zero_grad()
        outputs = model(texts)
        output_dim = outputs.shape[-1]

        outputs = outputs[1:].view(-1, output_dim)
        labels = labels[1:].reshape(-1)
        loss = loss_fn(outputs, labels)

        # backward pass
        loss.backward()

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss/(i * len(labels))})


def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt'
):

    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch}: val",
        leave=True,
    )
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            texts, labels = batch

            outputs = model(texts)
            output_dim = outputs.shape[-1]

            outputs = outputs[1:].view(-1, output_dim)
            labels = labels[1:].reshape(-1)
            loss = loss_fn(outputs, labels)

            _, predicted = outputs.data.max(1, keepdim=True)
            total += labels.size(0)
            correct += predicted.eq(labels.data.view_as(predicted)).sum()

            val_loss += loss
            loop.set_postfix({"loss": val_loss/(total), "acc": correct / (total)})

        if correct / total > best_so_far:
            torch.save(model.state_dict(), ckpt_path)
            return correct / (total)

    return best_so_far

In [28]:
def predict(
    model,
    loader,
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Predictions",
        leave=True,
    )
    predictions = []
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            texts, length = batch
            outputs = model(texts)

            _, predicted = torch.max(outputs.data, 2)
            predicted = predicted.T
            predicted = [np.array(torch.tensor(predicted[i,:length[i]], device='cpu')) for i, pred in enumerate(predicted.detach().cpu().tolist())]

            predictions += predicted

    return predictions

In [29]:
def to_test_df(sentence):
    return_df = pd.DataFrame([], columns=['sentence_id', 'tokens'])
    for i in range(len(sentence)):
        tokenized = word_tokenize(sentence[i])
        for word in tokenized:
            result = re.match('^[\W]*$', word)
            if result is None:
                return_df = return_df.append({"sentence_id": i,
                                            "tokens": word.lower()},
                                            ignore_index = True)
    return return_df

In [None]:
# For test data
batch_size = 1
def collate_batch_test(batch: list):
    sentences_batch, sentences_lengths = [], []
    for _sent in batch:
        sentences_batch.append(torch.tensor(_sent, dtype=torch.int64))
        sentences_lengths.append(len(_sent))

    sentences_batch = pad_sequence(sentences_batch, batch_first=True, padding_value=1).T
    return torch.tensor(sentences_batch, dtype=torch.long).to(device), sentences_lengths

In [30]:
INPUT_DIM = len(train_dataset.vocab)
OUTPUT_DIM = 2

model = ToxicTagger(INPUT_DIM, OUTPUT_DIM).to(device)
optimizer = torch.optim.Adam(model.parameters(),lr = 2e-3)
loss_fn = nn.CrossEntropyLoss()

In [31]:
best = -float('inf')
num_epochs = 10
for epoch in range(num_epochs):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn,
                    epoch_num=epoch)
    best = val_one_epoch(model, val_dataloader, loss_fn, epoch,
                         best_so_far=best, ckpt_path = "best_toxic_span.pt")

Epoch 0: train: 100%|██████████| 47/47 [00:02<00:00, 19.68it/s, loss=1.92e-5]
Epoch 0: val: 100%|██████████| 12/12 [00:00<00:00, 50.70it/s, loss=tensor(3.2722e-06, device='cuda:0'), acc=tensor(0.9845, device='cuda:0')]
Epoch 1: train: 100%|██████████| 47/47 [00:02<00:00, 20.36it/s, loss=7.13e-6]
Epoch 1: val: 100%|██████████| 12/12 [00:00<00:00, 48.67it/s, loss=tensor(2.3096e-06, device='cuda:0'), acc=tensor(0.9845, device='cuda:0')]
Epoch 2: train: 100%|██████████| 47/47 [00:02<00:00, 19.96it/s, loss=4.74e-6]
Epoch 2: val: 100%|██████████| 12/12 [00:00<00:00, 40.74it/s, loss=tensor(2.0703e-06, device='cuda:0'), acc=tensor(0.9869, device='cuda:0')]
Epoch 3: train: 100%|██████████| 47/47 [00:02<00:00, 19.50it/s, loss=8.15e-6]
Epoch 3: val: 100%|██████████| 12/12 [00:00<00:00, 49.71it/s, loss=tensor(1.9784e-06, device='cuda:0'), acc=tensor(0.9877, device='cuda:0')]
Epoch 4: train: 100%|██████████| 47/47 [00:02<00:00, 19.73it/s, loss=4.21e-6]
Epoch 4: val: 100%|██████████| 12/12 [00:00<00

In [33]:
ckpt = torch.load("best_toxic_span.pt")
model.load_state_dict(ckpt)

<All keys matched successfully>

In [None]:
# Testing model on some arbitary sentences
test_sentence = ["I'm fucking hate you, stupid idiot!",
                 "Shut up, bitch",
                 "This is a bullshit"]
test = to_test_df(test_sentence)
test_dataset = ToxicSpanDataset(test, vocab=train_dataset.vocab)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch_test)
predictions = predict(model, test_dataloader)
predictions

Model showed relatively good performance

# Zero-shot classifier

In [35]:
# Using pre-trained zero-shot classifier
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [36]:
labels = ["toxic", "non-toxic"]
hypothesis_template = 'This text is {}.'
sequence = "Shut up, bitch"

prediction = classifier(sequence, labels, hypothesis_template=hypothesis_template, multi_class=True)

print(prediction)

The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.


{'sequence': 'Shut up, bitch', 'labels': ['toxic', 'non-toxic'], 'scores': [0.9497930407524109, 0.006907099857926369]}


# Full model

In [61]:
class ToxicParafraser():
    def __init__(self, span_model, classifier, model = "distilbert-base-uncased",
                 sent_model = 'distilbert-base-nli-mean-tokens'):
        super().__init__()
        self.labels_ = ["toxic", "non-toxic"]
        self.hypothesis_template_ = 'This text is {}.'
        self.classifier = classifier
        self.span_model = span_model
        self.nlp_model = AutoModelForMaskedLM.from_pretrained(model)
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.sent_tranf_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

    def masker(self, sentence, mask, tag = "[MASK]") -> tuple:
        """
        Using sentence and its toxic mask to hide each 
        toxic word separately and all words together
        
        :param sentence: toxic sentence
        :param mask: mask for toxic words
        :return: (list with separately masked words, string with all masked words)
        """
        if sum(mask) == 0:
            return sentence, ""
        else:
            sentence_result = {}
            full_sent = ""
            full_mask = ""
            tokenized = word_tokenize(sentence)
            i = 0
            for word in tokenized:
                result = re.match('^[\W]*$', word)
                if result is None:
                    # For toxic tokens
                    if mask[i]:
                        # Add new sentence with mask instead of toxic word
                        for k in sentence_result.keys():
                            sentence_result[k] += " "*(i!=0) + word
                        sentence_result[i] = full_sent + " "*(i!=0) + tag
                        full_sent += " "*(i!=0) + word
                        full_mask += " "*(i!=0) + tag
                    else:
                        # Adding word or punktuational structure
                        if not (re.match('^[\W]', word) is None):
                            add = word
                        else:
                            add = " "*(i!=0) + word

                        for k in sentence_result.keys():
                            sentence_result[k] += add

                        full_sent += add
                        full_mask += add
                    i += 1
                else:
                    for k in sentence_result.keys():
                        sentence_result[k] += word
                    full_sent += word
                    full_mask += word

            return sentence_result, full_mask

    def span_predict(self, text):
        # Predict toxic mask for text
        text = to_test_df([text])
        text_dataset = ToxicSpanDataset(text, vocab=train_dataset.vocab)
        text_dataloader = torch.utils.data.DataLoader(dataset=text_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      collate_fn=collate_batch)
        predictions = predict(self.span_model, text_dataloader)
        return predictions

    def replacement(self, text, words, tag = "[MASK]"):
        text_word = [text]
        best_text = ""
        best_sim = 0

        # Check each word
        for word in words:
            replaced_text = text.replace(tag, word)
            text_word.append(replaced_text)

        text_embeddings = self.sent_tranf_model.encode(text_word)

        # Choode word, which produce most similar sentence to initial
        for i in range(1, len(text_embeddings)):
            sim = util.pytorch_cos_sim(text_embeddings[0], text_embeddings[i])
            if sim > best_sim:
                best_sim = sim
                best_text = i - 1

        return words[best_text]

    def forward(self, text, tag = "[MASK]"):
        # Predict binary class: toxic/non-toxic
        prediction = self.classifier(text,
                                     self.labels_,
                                     hypothesis_template=self.hypothesis_template_,
                                     multi_label=False)
        # Skip non-toxic
        if prediction["labels"][np.argmax(prediction["scores"])] == "non-toxic":
            return text

        # Toxic mask for toxic sentences
        mask = self.span_predict(text)
        masked_text, full_mask = self.masker(text, mask[0])

        if full_mask == "":
            return text
        
        # For each toxic word in sentence
        for k in masked_text.keys():
            # Produce several candidates
            inputs = self.tokenizer(masked_text[k], return_tensors="pt")
            token_logits = self.nlp_model(**inputs).logits
            mask_token_index = torch.where(inputs["input_ids"] == self.tokenizer.mask_token_id)[1]

            mask_token_logits = token_logits[0, mask_token_index, :]

            candidates = torch.sort(mask_token_logits, dim=1, descending=True).indices[0].tolist()

            # Find first 5 candidates, which are not toxic!
            cand = 0
            resulted_arr = []
            for i, token in enumerate(candidates):
                if re.match('^[\W]*$', self.tokenizer.decode([token])) is not None:
                    continue

                replaced_text = masked_text[k].replace(self.tokenizer.mask_token,
                                                       self.tokenizer.decode([token]))
                replaced_mask = self.span_predict(replaced_text)
                clear_output()

                if replaced_mask[0][k] != 1:
                    resulted_arr.append(self.tokenizer.decode([token]))
                if len(resulted_arr) == 5:
                    break

            # Choose best (most similar) replacement
            best_word = self.replacement(masked_text[k], resulted_arr)
            full_mask = full_mask.replace(tag, best_word, 1)

        return full_mask

In [62]:
tx_par = ToxicParafraser(model, classifier)

# Evaluation

In [63]:
test_sentence = ["I fucking hate you, stupid idiot!",
                 "fucking shut up, bitch",
                 "Hello, my friend!"]

output = ""
for i in range(len(test_sentence)):
    sent = test_sentence[i]
    result = tx_par.forward(sent)
    output += f'\nS{i}: {sent}\nS{i}: {result}\n'

print(output)


S0: I fucking hate you, stupid idiot!
S0: I really hate you, you dude!

S1: fucking shut up, bitch
S1: oh shut up, goddamn

S2: Hello, my friend!
S2: Hello, my friend!



In [64]:
directory_binary = "./data/interm"
filename_binary = directory_binary + "/toxic_binary.csv"

train_bin = pd.read_csv(filename_binary, index_col = 0)
train_bin

Unnamed: 0,sentence,label
0,"If Alkar is flooding her with psychic waste, t...",0
1,Now you're getting nasty.,0
2,"Well, we could spare your life, for one.",0
3,"Ah! Monkey, you've got to snap out of it.",0
4,I've got orders to put her down.,0
...,...,...
1155549,you didn't know that Estelle stole your fish f...,1
1155550,you'd be sucked out of your life!,0
1155551,I really can't take this.,0
1155552,"they said I was a hero, but I didn't care.",0


In [67]:
sent_tranf_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [86]:
# Check how much toxic sentence become non-toxic after applying model
final_sim = 0
final_tox = 0
initial_tox = 0
size = 1000
for i in tqdm(train_bin.index[:size]):
    if train_bin.iloc[i]['label']:
        sent = train_bin.iloc[i]['sentence']

        initial_tox += 1

        peref = tx_par.forward(sent)
        classified = classifier(peref, labels,
                                hypothesis_template=hypothesis_template)

        final_tox += classified['labels'][np.argmax(classified['scores'])] == 'toxic'
        text_embeddings = sent_tranf_model.encode([sent, peref])
        final_sim += int(util.pytorch_cos_sim(text_embeddings[0], text_embeddings[1])[0][0])

100%|██████████| 1000/1000 [21:56<00:00,  1.32s/it]


In [88]:
# Number of toxic sentences in dataset around 35% lower 
print(f'Lower toxicity on {1 - final_tox/initial_tox}')

Lower toxicity on 0.3453887884267631


In [87]:
print(f'Initial toxicity: {initial_tox/size}\nFinal toxicity: {final_tox/size}\nFinal similarity: {final_sim/size}')

Initial toxicity: 0.553
Final toxicity: 0.362
Final similarity: 0.144
