In [None]:
!pip install datasets==2.11
!pip install datasets transformers[sentencepiece] sacrebleu
!pip install transformers[torch]
!pip install accelerate -U
!pip install rouge_score
!pip install evaluate

In [2]:
import os
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
import spacy
from datasets import load_dataset, load_metric, list_metrics
import warnings
warnings.warn("ignore")


def create_dataframe(zip_file_path):
    # check if zip file exists
    if not os.path.exists(zip_file_path):
        print(f"zip file '{zip_file_path}' does not exist.")
        return None

    # extract zip file
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Extract all files in the ZIP archive to a directory
        extraction_path = "./tmp_extraction"
        zip_ref.extractall(extraction_path)

    # find .tsv file
    tsv_file = None
    for root, dirs, files in os.walk(extraction_path):
        for file in files:
            if file.endswith(".tsv"):
                tsv_file = os.path.join(root, file)
                break

    if tsv_file is None:
        print("No .tsv file found in the extracted ZIP archive.")
        return None

    # create pandas DataFrame
    try:
        df = pd.read_csv(tsv_file, delimiter='\t')
        return df
    except Exception as e:
        print(f"Error while creating DataFrame: {str(e)}")
        return None
    finally:
        # remove the temporary extraction directory
        if os.path.exists(extraction_path):
            for root, dirs, files in os.walk(extraction_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    os.remove(file_path)
            os.rmdir(extraction_path)


def create_new_columns(dataset):
    # create new columns
    # dataset['toxic'] = [''] * dataset.shape[0]
    # dataset['nontoxic'] = [''] * dataset.shape[0]
    # dataset['toxic_tox'] = [''] * dataset.shape[0]
    # dataset['nontoxic_tox'] = [''] * dataset.shape[0]

    # create the 'toxic' and 'nontoxic' columns
    dataset['toxic'] = dataset.apply(lambda row: row['reference'] if row['ref_tox'] > row['trn_tox'] else row['translation'], axis=1)
    dataset['nontoxic'] = dataset.apply(lambda row: row['translation'] if row['ref_tox'] > row['trn_tox'] else row['reference'], axis=1)

    # create the 'toxic_tox' and 'nontoxic_tox' columns
    dataset['toxic_tox'] = dataset.apply(lambda row: row['ref_tox'] if row['ref_tox'] > row['trn_tox'] else row['trn_tox'], axis=1)
    dataset['nontoxic_tox'] = dataset.apply(lambda row: row['trn_tox'] if row['ref_tox'] > row['trn_tox'] else row['ref_tox'], axis=1)

    # drop unuseful columns
    dataset = dataset.drop(columns=['reference', 'translation', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox'])

    return dataset
    # return dataset['toxic'].tolist(), dataset['nontoxic'].tolist()


def remove_unuseful_data(dataset):
    # toxic sentences > 0.75 of toxic_tox: 0.9243254750535241
    dataset = dataset[dataset['toxic_tox'] > 0.75]

    # detoxed sentences < 0.25 of nontoxic_tox: 0.959344176040237
    dataset = dataset[dataset['nontoxic_tox'] < 0.25]

    return dataset


def get_sentences(dataset):
    # toxic and detoxed sentences
    return dataset['toxic'].tolist(), dataset['nontoxic'].tolist()


def split_train_test(toxic, nontoxic, path):
    toxic_train, toxic_test, nontoxic_train, nontoxic_test = train_test_split(
        toxic,
        nontoxic,
        test_size=0.25,
        random_state=42,
    )

    with open(os.path.join(path, 'toxic_train'), "w", encoding="UTF-8") as file:
        file.write("\n".join(toxic_train))
    with open(os.path.join(path, 'toxic_test'), "w", encoding="UTF-8") as file:
        file.write("\n".join(toxic_test))
    with open(os.path.join(path, 'nontoxic_train'), "w", encoding="UTF-8") as file:
        file.write("\n".join(nontoxic_train))
    with open(os.path.join(path, 'nontoxic_test'), "w", encoding="UTF-8") as file:
        file.write("\n".join(nontoxic_test))

def save_csv(dataset):
    dataset.to_csv("converted.csv")

def get_dict():
    return load_dataset("csv", data_files="converted.csv")



In [3]:
dataset = create_dataframe("drive/MyDrive/filtered_paranmt.zip")

In [4]:
dataset = create_new_columns(dataset)
dataset = remove_unuseful_data(dataset)
save_csv(dataset)
dataset_dict = load_dataset("csv", data_files="converted.csv")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-a9dcb23b49dc08f6/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a9dcb23b49dc08f6/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

def detoxify_sentence(toxic_sentence, mask_probability=0.1):
    tokenized = tokenizer(toxic_sentence, return_tensors='pt')
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']

    # replace tokens with [MASK] with a certain probability
    for i in range(len(input_ids[0])):
        if torch.rand(1) < mask_probability:
            input_ids[0][i] = tokenizer.mask_token_id

    # generate detoxified candidates
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_token_ids = torch.argmax(outputs.logits, dim=-1)

    detoxified_sentence = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)
    return detoxified_sentence

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
toxic_sentence = "It was perfectly planned, but stupidly done."
detoxified = detoxify_sentence(toxic_sentence)
print("Toxic Sentence:", toxic_sentence)
print("Detoxified Sentence:", detoxified)

Toxic Sentence: It was perfectly planned, but stupidly done.
Detoxified Sentence: . it was perfectly planned, but expertly done..
