In [48]:
from tqdm import tqdm
from urllib.request import urlretrieve
import zipfile
import os
import pandas as pd
import warnings
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import ast
import re

nltk.download('stopwords')
nltk.download('punkt')
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def retrieve_save(url, filename, directory = './'):
    print("Retrieving a document by URL")
    isExist = os.path.exists(directory)
    if not isExist: os.makedirs(directory)
    urlretrieve(url, directory + filename)
    print("Successfully retrieved")

## Loading preprocessed main dataset by url

In [3]:
raw_url = "https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip"
raw_filename = "filtered_paranmt.zip"
raw_dir = './data/raw'

retrieve_save(raw_url, raw_filename)

with zipfile.ZipFile(raw_filename, 'r') as zip_ref:
    zip_ref.extractall(raw_dir)

unziped_file = os.listdir(raw_dir)[0]
full_unziped_file = raw_dir + '/' + unziped_file
full_unziped_file

Retrieving a document by URL
Successfully retrieved


'./data/raw/filtered.tsv'

In [4]:
df_raw = pd.read_csv(full_unziped_file, sep = '\t', index_col = 0)
df_raw

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
...,...,...,...,...,...,...
577772,You didn't know that Estelle had stolen some f...,you didn't know that Estelle stole your fish f...,0.870322,0.030769,0.000121,0.949143
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


In [23]:
tmp_df = df_raw.drop(['similarity', 'lenght_diff'], axis=1)

tmp_df['ref_tox'] = tmp_df['ref_tox'].apply(lambda x: 1 if x>0.5 else 0)
tmp_df['trn_tox'] = tmp_df['trn_tox'].apply(lambda x: 1 if x>0.5 else 0)
binary_df = pd.DataFrame([], columns=["sentence", "label"])
binary_df["sentence"] = tmp_df['reference']
binary_df["label"] = tmp_df['ref_tox']
binary_df = pd.concat([binary_df,
                       tmp_df[['translation', 'trn_tox']].rename(columns={"translation": "sentence",
                                                                          "trn_tox": "label"})],
                      axis = 0, ignore_index=True)
binary_df.head()

Unnamed: 0,sentence,label
0,"If Alkar is flooding her with psychic waste, t...",0
1,Now you're getting nasty.,0
2,"Well, we could spare your life, for one.",0
3,"Ah! Monkey, you've got to snap out of it.",0
4,I've got orders to put her down.,0


In [24]:
directory_binary_toxic = "./data/interm"
filename_binary_toxic = directory_binary_toxic + "/toxic_binary.csv"

isExist = os.path.exists(directory_binary_toxic)
if not isExist: os.makedirs(directory_binary_toxic)
binary_df.to_csv(filename_binary_toxic)

## Loading filtered main dataset from file and preprocess it

In [25]:
directory_binary_toxic = "./data/interm"
filename_binary_toxic = directory_binary_toxic + "/toxic_binary.csv"

train_bin = pd.read_csv(filename_binary_toxic, index_col = 0)
train_bin

Unnamed: 0,sentence,label
0,"If Alkar is flooding her with psychic waste, t...",0
1,Now you're getting nasty.,0
2,"Well, we could spare your life, for one.",0
3,"Ah! Monkey, you've got to snap out of it.",0
4,I've got orders to put her down.,0
...,...,...
1155549,you didn't know that Estelle stole your fish f...,1
1155550,you'd be sucked out of your life!,0
1155551,I really can't take this.,0
1155552,"they said I was a hero, but I didn't care.",0


In [26]:
def lower_text(text: str):
    return text.lower()

def remove_numbers(text: str):
    """
    Substitute all punctuations with space in case of
    "there is5dogs".

    If subs with '' -> "there isdogs"
    With ' ' -> there is dogs
    """
    text_nonum = re.sub(r'\d+', ' ', text)
    return text_nonum

def remove_punctuation(text: str):
    """
    Substitute all punctiations with space in case of
    "hello!nice to meet you"

    If subs with '' -> "hellonice to meet you"
    With ' ' -> "hello nice to meet you"
    """
    text_nopunct = re.sub(r'[^a-z|\s]+', ' ', text)
    return text_nopunct

def remove_multiple_spaces(text: str):
    text_no_doublespace = re.sub('\s+', ' ', text).strip()
    return text_no_doublespace

In [27]:
class Processing:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.ps = PorterStemmer()

    def tokenize_text(self, text: str) -> list[str]:
        return word_tokenize(text)

    def remove_stop_words(self, tokenized_text: list[str]) -> list[str]:
        return [w for w in tokenized_text if not w.lower() in self.stop_words]

    def stem_words(self, tokenized_text: list[str]) -> list[str]:
        return [self.ps.stem(w) for w in tokenized_text]

In [34]:
def preprocessing_stage(text, pr = Processing()):
    _lowered = lower_text(text)
    _without_numbers = remove_numbers(_lowered)
    _without_punct = remove_punctuation(_without_numbers)
    _single_spaced = remove_multiple_spaces(_without_punct)
    _tokenized = pr.tokenize_text(_single_spaced)
    return _tokenized

def clean_text_inplace(df):
    df['sentence'] = df['sentence'].apply(preprocessing_stage)
    return df

def preprocess(df):
    df.fillna(" ", inplace=True)
    _cleaned = clean_text_inplace(df)
    return _cleaned

In [37]:
train_preprocessed = preprocess(train_bin)
train_preprocessed.head()

Unnamed: 0,sentence,label
0,"[if, alkar, is, flooding, her, with, psychic, ...",0
1,"[now, you, re, getting, nasty]",0
2,"[well, we, could, spare, your, life, for, one]",0
3,"[ah, monkey, you, ve, got, to, snap, out, of, it]",0
4,"[i, ve, got, orders, to, put, her, down]",0


In [38]:
directory_binary_prep = "./data/interm"
filename_binary_prep = directory_binary_prep + "/toxic_binary_preprocessed.csv"

isExist = os.path.exists(directory_binary_prep)
if not isExist: os.makedirs(directory_binary_prep)
train_preprocessed.to_csv(filename_binary_prep)

## Loading preprocessed main dataset from file

In [39]:
directory_binary_prep = "./data/interm"
filename_binary_prep = directory_binary_prep + "/toxic_binary_preprocessed.csv"

train_bin_prep = pd.read_csv(filename_binary_prep, index_col = 0)
train_bin_prep

Unnamed: 0,sentence,label
0,"['if', 'alkar', 'is', 'flooding', 'her', 'with...",0
1,"['now', 'you', 're', 'getting', 'nasty']",0
2,"['well', 'we', 'could', 'spare', 'your', 'life...",0
3,"['ah', 'monkey', 'you', 've', 'got', 'to', 'sn...",0
4,"['i', 've', 'got', 'orders', 'to', 'put', 'her...",0
...,...,...
1155549,"['you', 'didn', 't', 'know', 'that', 'estelle'...",1
1155550,"['you', 'd', 'be', 'sucked', 'out', 'of', 'you...",0
1155551,"['i', 'really', 'can', 't', 'take', 'this']",0
1155552,"['they', 'said', 'i', 'was', 'a', 'hero', 'but...",0


# External datasets loading from url

In [44]:
# Loading external dataset preprocessed by hexinz
url_toxic_span = "https://raw.githubusercontent.com/hexinz/SI630_final_project/main/Data/train.csv"
directory_toxic_span = "./data/external"
filename_toxic_span = "/toxic_span.csv"

retrieve_save(url_toxic_span, filename_toxic_span, directory_toxic_span)
directory_toxic_span + filename_toxic_span

Retrieving a document by URL
Successfully retrieved


'./data/external/toxic_span.csv'

In [43]:
# Loading external dataset raw
url_tsd_toxic_span = "https://raw.githubusercontent.com/hexinz/SI630_final_project/main/Data/tsd_train.csv"
directory_tsd_toxic_span = "./data/external"
filename_tsd_toxic_span = "/tsd_toxic_span.csv"

retrieve_save(url_tsd_toxic_span, filename_tsd_toxic_span, directory_tsd_toxic_span)
directory_tsd_toxic_span + filename_tsd_toxic_span

Retrieving a document by URL
Successfully retrieved


'./data/external/tsd_toxic_span.csv'

In [46]:
df_tsd_span = pd.read_csv(directory_tsd_toxic_span + filename_tsd_toxic_span)
df_tsd_span

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."
...,...,...
7934,"[8, 9, 10, 11]",Another fool pipes in.
7935,"[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 5...",So if a restaurant owner puts up a sign saying...
7936,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",Any faith that can't stand up to logic and rea...
7937,"[5, 6, 7, 8, 9, 10, 11]",This idiotic. Use the surplus to pay down the ...


In [49]:
def toxic_subseq(indexes):
    if len(indexes) == 0:
        return []
    start = indexes[0]
    array = []
    for i in range(len(indexes)-1):
        if indexes[i] + 1 == indexes[i + 1]:
            continue
        else:
            array.append((start, indexes[i] + 1))
            start = indexes[i + 1]
    array.append((start, indexes[i + 1] + 1))
    return array

def toxifier(words, label):
    return_df = pd.DataFrame([], columns=['tokens', 'toxic?'])
    tokenized = word_tokenize(words)
    for word in tokenized:
        result = re.match('^[\W]*$', word)
        if result is None:
            return_df = return_df.append({"tokens": word.lower(),
                                          "toxic?": label},
                                         ignore_index = True)
    return return_df

def toxic_identifier(indexes, sentence):
    return_df = pd.DataFrame([], columns=['tokens', 'toxic?'])
    if len(indexes) == 0:
        return return_df
    start, _ = indexes[0]
    flag = start == 0
    end = 0
    i = 0
    while i < len(indexes):
        if flag:
            start, end = indexes[i]
            end += 1
            i += 1
        else:
            if end == 0:
                new_end, _ = indexes[i]
                start, end = end, new_end
            else:
                new_end, _ = indexes[i]
                start, end = end - 1, new_end

        return_df = return_df.append(toxifier(sentence[start:end], flag),
                                     ignore_index=True)
        flag = not flag
    if end != len(sentence):
        return_df = return_df.append(toxifier(sentence[end:len(sentence)], flag),
                                     ignore_index=True)
    return return_df

In [50]:
new_df = pd.DataFrame([], columns=['sentence_id', 'tokens', 'toxic?'])
for i in tqdm(range(len(df_tsd_span))):
    toxic_indexes = ast.literal_eval(df_tsd_span['spans'].iloc[i])
    sentence = df_tsd_span['text'].iloc[i]
    toxic_subs = toxic_subseq(toxic_indexes)
    return_df = toxic_identifier(toxic_subs, sentence)
    return_df.insert(0, "sentence_id", i, True)
    new_df = new_df.append(return_df, ignore_index=True)
new_df.head(10)

100%|██████████| 7939/7939 [08:37<00:00, 15.33it/s]


Unnamed: 0,sentence_id,tokens,toxic?
0,0,another,False
1,0,violent,True
2,0,and,True
3,0,aggressive,True
4,0,immigrant,True
5,0,killing,False
6,0,a,False
7,0,innocent,False
8,0,and,False
9,0,intelligent,False


In [51]:
directory_toxic_span_compress = "./data/interm"
filename_toxic_span_compress = directory_toxic_span_compress + "/toxic_span_compressed.csv"

isExist = os.path.exists(directory_toxic_span_compress)
if not isExist: os.makedirs(directory_toxic_span_compress)
new_df.to_csv(filename_toxic_span_compress)

# External preprocessed datasets loading from file

In [52]:
directory_toxic_span_compress = "./data/interm"
filename_toxic_span_compress = directory_toxic_span_compress + "/toxic_span_compressed.csv"

train = pd.read_csv(filename_toxic_span_compress, index_col = 0)
train

Unnamed: 0,sentence_id,tokens,toxic?
0,0,another,False
1,0,violent,True
2,0,and,True
3,0,aggressive,True
4,0,immigrant,True
...,...,...,...
272441,7938,out,False
272442,7938,of,False
272443,7938,women,True
272444,7938,'s,True
