# Text De-Toxification, part II: Building Vocabulary for condBERT
### Robert Chen, B20-AI
--------------------

## Step 0: Imports

In [11]:
import pandas as pd
import pickle
import numpy as np
from nltk import ngrams
from nltk.tokenize import WordPunctTokenizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from collections import Counter, defaultdict
from transformers import BertTokenizer

## Step 1: Download the datasets

The **condBERT** model does not need to be trained, but it needs a solid corpus to show acceptable results. Fortunately, there are already many datasets that suit this task specifically. In our case, we are going to use the *Jigsaw* dataset, which was already prepared by the SkolTech team, and the initial *ParaNMT* dataset.

In [12]:
# IPython cell, not a bash script: `DATA_DIR` is a Python variable, and each
# `!` line is run in a shell after IPython expands `$DATA_DIR` from the
# notebook namespace. The path is relative to the notebook's working directory.
# NOTE(review): judging by their names, the scripts reset the condbert_vocab
# folders and then download the Jigsaw and ParaNMT data — confirm in the scripts.
DATA_DIR="../data"
! bash $DATA_DIR/clean_vocab.sh
! bash $DATA_DIR/download_jigsaw.sh
! bash $DATA_DIR/download_paranmt.sh

/home/b0b/Stuff/Personal/Uni/InnoStuff/F23/Practical Machine Learning/PMLDL-text-detox/data/interim/condbert_vocab/train
/home/b0b/Stuff/Personal/Uni/InnoStuff/F23/Practical Machine Learning/PMLDL-text-detox/data/interim/condbert_vocab/test
--2023-11-05 20:51:45--  https://raw.githubusercontent.com/s-nlp/detox/main/emnlp2021/data/train/train_toxic
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10514256 (10M) [text/plain]
Saving to: ‘/home/b0b/Stuff/Personal/Uni/InnoStuff/F23/Practical Machine Learning/PMLDL-text-detox/data/interim/condbert_vocab/train/train_toxic’


2023-11-05 20:51:45 (21,6 MB/s) - ‘/home/b0b/Stuff/Personal/Uni/InnoStuff/F23/Practical Machine Learning/PMLDL-text-detox/data/interim/

Now, we need to process the *ParaNMT* and *ParaDetox* datasets and add them to the existing training data.

In [13]:
# Tab-separated ParaNMT dump read by the next cell.
paranmt_path = '../data/raw/filtered.tsv'

# Toxic / neutral training corpora that the ParaNMT sentences are appended to.
train_toxic, train_normal = (
    '../data/interim/condbert_vocab/train/train_toxic',
    '../data/interim/condbert_vocab/train/train_normal',
)

In order to process the *ParaNMT* dataset, we need to gather all texts with high toxicity scores into the `train_toxic` dataset and the clearly neutral ones into the `train_normal` dataset. We will put all reference texts with a `ref_tox` of at least 0.8 into the toxic set; the cutoff for the neutral set is a `ref_tox` of at most 0.2, and texts scoring in between are not used. The decision to use only reference texts in the training is motivated by better stability, since the translation can affect fluency of the model significantly. We also need to separate the punctuation marks from the words with whitespace.

In [14]:
paranmt_df = pd.read_csv(paranmt_path, sep="\t", index_col=0)
tokenizer = WordPunctTokenizer()


def _as_spaced_lines(texts, word_tokenizer):
    """Return every text as one newline-terminated line of space-separated tokens.

    The tokenizer is passed in explicitly so the helper does not depend on
    whatever the global name `tokenizer` is bound to when it is called.
    """
    return [f'{" ".join(word_tokenizer.tokenize(text))}\n' for text in texts]


# Both files are opened in append mode, so the ParaNMT lines land after the
# content that is already there. Re-running this cell without re-running the
# download cell appends the same lines a second time.
# The encoding is pinned to UTF-8 instead of relying on the locale default, so
# the appended text matches the UTF-8 files it is added to on every platform.
toxic_ref = _as_spaced_lines(
    paranmt_df.loc[paranmt_df.ref_tox >= 0.8, 'reference'].tolist(), tokenizer
)
with open(train_toxic, 'a', encoding='utf-8') as f:
    f.writelines(toxic_ref)

neutral_ref = _as_spaced_lines(
    paranmt_df.loc[paranmt_df.ref_tox <= 0.2, 'reference'].tolist(), tokenizer
)
with open(train_normal, 'a', encoding='utf-8') as f:
    f.writelines(neutral_ref)

## Step 2: Calculating the toxicity of each token

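In order to calculate the toxicity, we will score each token by how frequently it appears in the chosen corpus as opposed to the number of its appearances in the other corpus. Concretely, the score is the smoothed ratio $(c_{\text{chosen}} + \varepsilon) / (c_{\text{other}} + \varepsilon)$, where $c$ is the token count in the respective corpus.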

In [15]:
class NgramSalienceCalculator:
    """Smoothed count ratio of a feature between a toxic and a normal corpus.

    Attributes set by the constructor:
        vectorizer: the unigram CountVectorizer (left fitted on the normal corpus).
        tox_vocab / norm_vocab: feature -> column index for each corpus.
        tox_count / norm_count: per-column totals, indexed as ``[0, column]``.
    """

    def __init__(self, tox_corpus, norm_corpus):
        """Fit the vectorizer on each corpus in turn and keep vocabularies and totals."""
        unigram_range = (1, 1)
        self.vectorizer = CountVectorizer(ngram_range=unigram_range)
        # Order matters: the toxic vocabulary is captured before the same
        # vectorizer is refitted on the normal corpus.
        # NOTE(review): this relies on the refit binding a new `vocabulary_`
        # object rather than mutating the previous one — confirm for the
        # scikit-learn version in use.
        self.tox_vocab, self.tox_count = self._fit_counts(tox_corpus)
        self.norm_vocab, self.norm_count = self._fit_counts(norm_corpus)

    def _fit_counts(self, corpus):
        """Refit the shared vectorizer on `corpus`; return (vocabulary, column sums)."""
        doc_term = self.vectorizer.fit_transform(corpus)
        vocabulary = self.vectorizer.vocabulary_
        return vocabulary, np.sum(doc_term, axis=0)

    def calculate(self, feature, attr='tox', eps=0.5):
        """Return the smoothed salience of `feature`.

        attr='tox'  -> (toxic count + eps) / (normal count + eps)
        attr='norm' -> (normal count + eps) / (toxic count + eps)
        A feature missing from a vocabulary counts as 0.0 for that corpus.
        Any other `attr` fails the assertion.
        """
        assert attr in ['tox', 'norm']

        tox_cnt = 0.0
        if feature in self.tox_vocab:
            tox_cnt = self.tox_count[0, self.tox_vocab[feature]]

        norm_cnt = 0.0
        if feature in self.norm_vocab:
            norm_cnt = self.norm_count[0, self.norm_vocab[feature]]

        if attr == 'tox':
            numerator, denominator = tox_cnt, norm_cnt
        else:
            numerator, denominator = norm_cnt, tox_cnt
        return (numerator + eps) / (denominator + eps)

Setting up the counter for words:

In [16]:
# Count every whitespace-separated token across both training files.
cnt = Counter()
for filename in [train_toxic, train_normal]:
    # The encoding is pinned to UTF-8 so the read does not depend on the
    # platform's locale default. Iterating the file object streams it line by
    # line instead of materialising every line with readlines().
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # split() with no arguments already ignores leading/trailing whitespace.
            cnt.update(line.split())
len(cnt)

160243

We will keep in the vocabulary every word that occurs at least once.

In [17]:
# Keep every counted word; the `> 0` threshold is the knob to raise if rare
# words should be dropped later.
vocab = {word for word, occurrences in cnt.items() if occurrences > 0}
len(vocab)

160243

Creating corpora:

In [18]:
def _mask_unknown(line):
    """Re-join the tokens of `line` with single spaces, replacing out-of-vocab words by '<unk>'."""
    return ' '.join(word if word in vocab else '<unk>' for word in line.split())


# The encoding is pinned to UTF-8 so the read does not depend on the locale
# default. The files are streamed line by line instead of calling readlines().
with open(train_normal, 'r', encoding='utf-8') as normal, \
        open(train_toxic, 'r', encoding='utf-8') as toxic:
    tox_corpus = [_mask_unknown(line) for line in toxic]
    norm_corpus = [_mask_unknown(line) for line in normal]

# Output word lists written by the salience cell below.
pos_words = '../data/interim/condbert_vocab/positive_words.txt'
neg_words = '../data/interim/condbert_vocab/negative_words.txt'

Calculating the toxicity scores:

In [19]:
calc = NgramSalienceCalculator(tox_corpus, norm_corpus)
# A gram is listed when it is more than `threshold` times as frequent (after
# smoothing) in one corpus as in the other.
threshold = 4

# The union of two key sets is already duplicate-free, so no extra "seen" set
# is needed. Sorting makes the order of the output files reproducible between
# runs, because the iteration order of a set of strings is not stable.
all_grams = sorted(set(calc.tox_vocab) | set(calc.norm_vocab))

# The encoding is pinned to UTF-8 so non-ASCII vocabulary entries can be
# written regardless of the platform's locale default.
with open(pos_words, 'w', encoding='utf-8') as pos_file, \
        open(neg_words, 'w', encoding='utf-8') as neg_file:
    for gram in all_grams:
        tox_score = calc.calculate(gram, attr='tox')
        norm_score = calc.calculate(gram, attr='norm')
        if tox_score > threshold:
            # Toxic-leaning grams go to the negative word list.
            neg_file.write(f'{gram}\n')
        elif norm_score > threshold:
            # Normal-leaning grams go to the positive word list.
            pos_file.write(f'{gram}\n')

## Step 3: Training word2coeff with Logistic Regression

We will use Logistic Regression to predict the toxicity of the words.

In [20]:
# Label 1 = toxic sentence, label 0 = normal sentence.
X_train = tox_corpus + norm_corpus
y_train = [1] * len(tox_corpus) + [0] * len(norm_corpus)

# Bag-of-words features fed into a logistic-regression classifier.
pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, n_jobs=-1),
)
pipe.fit(X_train, y_train)

In [21]:
# Step 1 of the pipeline is the fitted classifier; row 0 of `coef_` is taken
# as the per-feature weight vector.
fitted_classifier = pipe[1]
coeffs = fitted_classifier.coef_[0]
coeffs.shape

(136125,)

In [22]:
# Step 0 of the pipeline is the fitted CountVectorizer; its vocabulary maps
# each word to the column whose coefficient is looked up in `coeffs`.
feature_columns = pipe[0].vocabulary_
word2coef = dict(
    (word, coeffs[column]) for word, column in feature_columns.items()
)

In [23]:
# Persist the word -> coefficient mapping as a pickle file.
word2coef_path = '../data/interim/condbert_vocab/word2coef.pkl'
with open(word2coef_path, 'wb') as pickle_file:
    pickle.dump(word2coef, pickle_file)

## Step 4: Labelling tokens by toxicity using BERT

In [24]:
# From here on `tokenizer` refers to the BERT tokenizer.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [25]:
# Every token id starts at 1, so ids that never occur still have a non-zero
# count in both corpora.
toxic_cnt, norm_cnt = defaultdict(lambda: 1), defaultdict(lambda: 1)

# Same counting pass for both corpora: encode each sentence and bump the
# counter of every token id it yields.
for corpus, counter in ((tox_corpus, toxic_cnt), (norm_corpus, norm_cnt)):
    for text in corpus:
        for token in tokenizer.encode(text):
            counter[token] += 1

In [26]:
# One score per tokenizer vocabulary id, in id order:
# toxic count / (normal count + toxic count).
token_toxicities = []
for token_id in range(len(tokenizer.vocab)):
    toxic_hits = toxic_cnt[token_id]
    normal_hits = norm_cnt[token_id]
    token_toxicities.append(toxic_hits / (normal_hits + toxic_hits))

# One score per line, so line number i holds the score of token id i.
with open('../data/interim/condbert_vocab/token_toxicities.txt', 'w') as f:
    f.writelines(f'{score}\n' for score in token_toxicities)