In [1]:
# !conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
# !pip install transformers[torch] datasets rouge_score nlp numpy pandas matplotlib

In [2]:
import pandas as pd
import torch

In [3]:
# Load the tab-separated corpus and give its first column the label "id".
filtered_dataset = pd.read_csv('filtered.tsv', sep='\t')
first_column = filtered_dataset.columns[0]
filtered_dataset = filtered_dataset.rename(columns={first_column: "id"})

# Order the rows by the `ref_tox` column, highest values first.
# NOTE(review): the name `sorted` shadows the Python builtin for the rest of
# the kernel session; it is kept because later (commented-out) code refers to it.
sorted = filtered_dataset.sort_values(by='ref_tox', ascending=False)
sorted.head()

Unnamed: 0,id,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
551255,551255,His father would have used a booming voice to ...,his father would have answered with his thunde...,0.729428,0.091954,0.999724,0.004599
101676,101676,You have to send those idiots back in.,you have to get those guys back there.,0.622852,0.0,0.999723,0.000115
258368,258368,Salina could be with that stupid cop.,Salina could be with the cop.,0.774944,0.210526,0.999723,0.0005
318050,318050,And don't let those idiots in radiology hold y...,don't let them fool you in radiology.,0.711188,0.283019,0.999723,0.000874
70934,70934,My idiot friend here brought marijuana... - on...,my friend here took a marijuana...,0.715508,0.396552,0.999722,0.000161


In [4]:
from torch.utils.data import Dataset, random_split


class ToxicDataset(Dataset):
    """Map-style dataset that serves (prompted source, target) pairs from a DataFrame.

    Columns are addressed by position: column 1 is the source text and
    column 2 is the target text.
    # NOTE(review): in this notebook those positions appear to be `reference`
    # and `translation` — confirm if the frame layout changes.
    """

    def __init__(self, dataframe):
        """Store the backing DataFrame as-is (no copy is made)."""
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``("Detoxify: " + source, target)`` for the row(s) at ``idx``.

        Args:
            idx: Positional row index; a tensor index is first converted with
                ``tolist()``.

        Returns:
            A 2-tuple of the prefixed source text and the target text.
        """
        position = idx.tolist() if torch.is_tensor(idx) else idx
        frame = self.dataframe

        # The task prefix is prepended to the source text only.
        prompted_source = "Detoxify: " + frame.iloc[position, 1]
        target = frame.iloc[position, 2]
        return prompted_source, target

In [5]:
import datasets


# torch_dataset = ToxicDataset(sorted)

# train_size = int(0.95 * len(torch_dataset))
# test_size = len(torch_dataset) - train_size

# train_dataset, eval_dataset = random_split(torch_dataset, [train_size, test_size])

# def gen_train():
#     return {
#         "source_text": [x[0] for x in torch_dataset],
#         "target_text": [x[1] for x in torch_dataset],
#     }

# train_df = gen_train()

def construct_dataset(tokenizer, dataset: Dataset) -> datasets.Dataset:
    """Tokenize (source, target) text pairs into a Hugging Face dataset.

    Args:
        tokenizer: Callable tokenizer returning an encoding with ``input_ids``
            and ``attention_mask`` tensors; its ``pad_token_id`` is used to
            find padding in the labels.
        dataset: Iterable yielding ``(source_text, target_text)`` pairs.

    Returns:
        A ``datasets.Dataset`` with ``input_ids``, ``attention_mask`` and
        ``labels`` columns, each padded/truncated to the tokenizer's maximum
        length.
    """
    def tokenize_function(s):
        return tokenizer(s, padding="max_length", truncation=True, return_tensors="pt")

    def gen():
        for source_text, target_text in dataset:
            encoded_source = tokenize_function(source_text)
            labels = tokenize_function(target_text).input_ids.squeeze(0)

            # Padding positions in the labels are set to -100 so the loss
            # ignores them; otherwise the model is trained to emit padding.
            pad_id = tokenizer.pad_token_id
            if pad_id is not None:
                labels = labels.masked_fill(labels == pad_id, -100)

            yield {
                "input_ids": encoded_source.input_ids.squeeze(0),
                # Keep the mask so the encoder does not attend to padding.
                "attention_mask": encoded_source.attention_mask.squeeze(0),
                "labels": labels,
            }

    hf_dataset = datasets.Dataset.from_generator(gen)
    return hf_dataset

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import nlp
import logging

# Show INFO-level log records (the metric loader reports its progress at this level).
logging.basicConfig(level=logging.INFO)

# The tokenizer is never modified in this notebook, so it is simply loaded from
# the files saved under ./t5_output/.
# NOTE(review): this requires ./t5_output/ to exist already; on a fresh
# checkout nothing in the cells above creates it — confirm the intended source.
tokenizer = T5Tokenizer.from_pretrained("./t5_output/")

# train_hf_dataset = construct_dataset(tokenizer, train_dataset)
# eval_hf_dataset = construct_dataset(tokenizer, eval_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:

# ROUGE metric object used by `compute_metrics` inside `fine_tune_t5`.
rouge = nlp.load_metric("rouge", experiment_id=1)
# Module-level handle to the current model; `fine_tune_t5` rebinds it via `global model`.
model = None

def fine_tune_t5(model_name):
    """Fine-tune a T5 checkpoint and save the model and tokenizer to ./t5_output.

    The loaded model is stored in the module-level ``model`` variable,
    replacing whatever was bound there before.

    Args:
        model_name: Name or path handed to
            ``T5ForConditionalGeneration.from_pretrained``.

    Note:
        Reads the module-level ``tokenizer``, ``rouge``, ``train_hf_dataset``
        and ``eval_hf_dataset``; all of them must be defined before calling.
    """
    global model
    # Rebind rather than `del model`: if loading below raises, the global name
    # still exists, so a later call does not fail with NameError.
    model = None
    torch.cuda.empty_cache()
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    def first_output(outputs):
        """Return the first element when the model output is a tuple/list."""
        if isinstance(outputs, (tuple, list)):
            return outputs[0]
        return outputs

    def preprocess_logits_for_metrics(logits, labels):
        """Reduce logits to token ids so full vocabulary logits are not accumulated."""
        return first_output(logits).argmax(dim=-1)

    def decode_ids(token_ids):
        """Decode a batch of ids to text, treating -100 as a special token."""
        # Work on a copy so the arrays held by the EvalPrediction stay untouched.
        token_ids = token_ids.copy()
        token_ids[token_ids == -100] = tokenizer.eos_token_id
        # all unnecessary tokens are removed
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def compute_metrics(pred):
        """Compute ROUGE-2 precision/recall/F-measure between predictions and labels."""
        pred_ids = first_output(pred.predictions)
        # Accept raw logits (batch, seq, vocab) as well as already-reduced ids.
        if getattr(pred_ids, "ndim", 0) == 3:
            pred_ids = pred_ids.argmax(axis=-1)

        pred_str = decode_ids(pred_ids)
        label_str = decode_ids(pred.label_ids)

        rouge_output = rouge.compute(
            predictions=pred_str,
            references=label_str,
            rouge_types=["rouge2"]
        )["rouge2"].mid

        return {
            "rouge2_precision": round(rouge_output.precision, 4),
            "rouge2_recall": round(rouge_output.recall, 4),
            "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
        }

    args = TrainingArguments(
        output_dir="./t5_output",
        overwrite_output_dir=True,
        save_total_limit=2,
        # evaluation_strategy="epoch",
        num_train_epochs=1,
        per_device_train_batch_size=12,
        save_steps=1000,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_hf_dataset,
        eval_dataset=eval_hf_dataset,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    trainer.train()
    model.save_pretrained("./t5_output")
    tokenizer.save_pretrained("./t5_output")

INFO:nlp.load:Checking /root/.cache/huggingface/datasets/5ecb6e4b474317b41ae1fe5d702d1af8d86d452f0b1d70f77a12f6f014ded6ac.35bc2c477aa456d2f589656477ccb0b463c21cdfb83a9de86d63de8560a96d1b.py for additional imports.
INFO:nlp.load:Found main folder for metric https://s3.amazonaws.com/datasets.huggingface.co/nlp/metrics/rouge/rouge.py at /root/anaconda3/envs/torch/lib/python3.11/site-packages/nlp/metrics/rouge
INFO:nlp.load:Found specific version folder for metric https://s3.amazonaws.com/datasets.huggingface.co/nlp/metrics/rouge/rouge.py at /root/anaconda3/envs/torch/lib/python3.11/site-packages/nlp/metrics/rouge/06783dbed5f6b6a5413f84d2a5f0d9dc9cb871f1aeb3787f2c90a8e3fe60b1c1
INFO:nlp.load:Found script file from https://s3.amazonaws.com/datasets.huggingface.co/nlp/metrics/rouge/rouge.py to /root/anaconda3/envs/torch/lib/python3.11/site-packages/nlp/metrics/rouge/06783dbed5f6b6a5413f84d2a5f0d9dc9cb871f1aeb3787f2c90a8e3fe60b1c1/rouge.py
INFO:nlp.load:Couldn't find dataset infos file at htt

In [8]:
# del model
# del trainer
# torch.cuda.empty_cache()

In [9]:
# fine_tune_t5("google/flan-t5-small")

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer from the ./t5_output directory so this cell
# can be run without executing the training cells.
# NOTE(review): the execution count restarts at 1 here, which suggests this
# cell was run in a fresh kernel — confirm.
model = T5ForConditionalGeneration.from_pretrained("./t5_output")
tokenizer = T5Tokenizer.from_pretrained("./t5_output")

def evaluate(s: str) -> list[str]:
    """Generate the model's output text for one prompt.

    Args:
        s: Prompt text to feed to the model.

    Returns:
        The decoded generations as a list of strings (special tokens
        removed), one entry per generated sequence.
    """
    encoded_input = tokenizer(s, padding="max_length", truncation=True, return_tensors='pt')
    output = model.generate(
        encoded_input.input_ids,
        # Forward the tokenizer's mask explicitly so the padded positions are
        # marked as padding instead of leaving that for generate() to infer.
        attention_mask=encoded_input.attention_mask,
    )
    output_str = tokenizer.batch_decode(output, skip_special_tokens=True)
    return output_str

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Run the loaded model on a sample prompt; `evaluate` returns the decoded
# batch, so the printed value is a list of strings.
result = evaluate("Detoxify: shut up please you moron.")
print(result)

['shut up please, moron.']


In [17]:
# Second sample prompt, containing profanity in two places.
# NOTE(review): the recorded output still contains a profane word — worth
# checking model quality on inputs like this.
result = evaluate("Detoxify: why didn't you tell this shit, the day sucks now.")
print(result)

["why didn't you tell this, the day is shit."]
