## Criação do Dataset

In [None]:
import os
from huggingface_hub import InferenceClient
from google.colab import userdata
# Fetch the value stored under 'HF_TOKEN' through Colab's `userdata` helper
# and use it as the API key for the Hugging Face inference client.
token = userdata.get("HF_TOKEN")
client = InferenceClient(provider="auto", api_key=token)

In [None]:
# `load_dataset` is not imported by any visible cell; import it here so this
# cell runs on its own instead of raising NameError.
from datasets import load_dataset

# Returns a DatasetDict with the splits 'harmful' and 'benign'.
ds_all = load_dataset("JailbreakBench/JBB-Behaviors", "behaviors")
print(ds_all)  # inspect the available splits and columns

# Extract the two splits.
ds_harmful = ds_all["harmful"]
ds_benign = ds_all["benign"]

def to_row_harm(x):
    """Map a record to a training row: its ``Goal`` as ``text``, ``label`` 1."""
    row = dict(text=x["Goal"], label=1)
    return row

def to_row_ben(x):
    """Map a record to a training row: its ``Goal`` as ``text``, ``label`` 0."""
    row = dict(text=x["Goal"], label=0)
    return row

# `Dataset` is only imported in a later cell; import it here so this cell
# works when the notebook is run top to bottom.
from datasets import Dataset

# Convert every record of each split into a {"text", "label"} row.
rows_h = [to_row_harm(x) for x in ds_harmful]
rows_b = [to_row_ben(x) for x in ds_benign]

print("counts:", len(rows_h), "harmful |", len(rows_b), "benign")

# Build a single Hugging Face Dataset from both classes and shuffle it with a
# fixed seed so the order is reproducible.
ds = Dataset.from_list(rows_h + rows_b)
ds = ds.shuffle(seed=42)

# Print the summary of a six-row selection as a quick sanity check.
print(ds.select(range(6)))

In [None]:
# Use the absolute Drive path (leading "/"), matching the path the dataset is
# later read from; the relative "content/drive/..." form resolves against the
# current working directory instead of the Drive mount.
# NOTE(review): assumes Google Drive is mounted at /content/drive — confirm.
ds.to_csv("/content/drive/MyDrive/JBB_dataset/train.csv", index=False)


## Data Augmentation

In [None]:
# @title Importar dataset (JÁ CRIADO), realizar data augmentation, salvar df final
import pandas as pd
from datasets import Dataset

# Load the previously created CSV from the Drive folder.
df = pd.read_csv("/content/drive/MyDrive/JBB_dataset/train2.csv")
df.head()  # not the last expression of the cell, so nothing is rendered

# Mirror the `raw_text` column into `text`, then wrap the frame in a Dataset.
df = df.assign(text=df["raw_text"])
ds = Dataset.from_pandas(df)
ds


In [None]:
# Split the dataset by class: rows with label 0 are kept as `safe`,
# rows with label 1 as `harmful`.
safe = ds.filter(lambda row: row["label"] == 0)
harmful = ds.filter(lambda row: row["label"] == 1)

# Report how many rows each class has.
print("Harmful:", harmful.num_rows, "Safe:", safe.num_rows)

### Fazer Data Augmentation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import torch

# `device` is used below and inside `paraphrase`, but no visible cell defines
# it; select the GPU when one is available so this cell runs on its own.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint shared by the tokenizer and the seq2seq model.
_PARA_CHECKPOINT = "Vamsi/T5_Paraphrase_Paws"

para_tok = AutoTokenizer.from_pretrained(_PARA_CHECKPOINT)
para_model = AutoModelForSeq2SeqLM.from_pretrained(_PARA_CHECKPOINT).to(device)

def paraphrase(text, *, num_return_sequences=1, num_beams=5, max_length=256):
    """Generate paraphrases of ``text`` with the paraphrase model.

    Relies on the notebook-level ``para_tok``, ``para_model`` and ``device``.

    Args:
        text: Sentence to paraphrase.
        num_return_sequences: Number of candidates to return; with beam
            search it must not exceed ``num_beams``.
        num_beams: Beam width used for decoding.
        max_length: Token limit used both to truncate the input and to cap
            the generated output.

    Returns:
        A list with one decoded string per returned sequence.
    """
    # NOTE(review): the explicit " </s>" is kept from the original prompt
    # format; the tokenizer may already append EOS itself — confirm.
    input_text = "paraphrase: " + text + " </s>"

    # A single sequence needs no padding, so only truncation is requested;
    # padding every input to `max_length` just adds masked tokens to encode.
    encoding = para_tok(
        input_text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)

    # Plain beam search. The original also passed `temperature=1.5`, but
    # temperature only applies when `do_sample=True`, so it had no effect
    # (and triggers a warning in recent transformers releases).
    outputs = para_model.generate(
        **encoding,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )

    return [para_tok.decode(o, skip_special_tokens=True) for o in outputs]


In [None]:
# One augmented row per paraphrase of every harmful example; the paraphrase
# fills both text columns and the row keeps label 1.
aug_rows = [
    {"raw_text": variant, "text": variant, "label": 1}
    for sample in harmful
    for variant in paraphrase(sample["raw_text"])
]


In [None]:
# Wrap the augmented rows (list of dicts) in a Hugging Face Dataset.
ds_aug = Dataset.from_list(aug_rows)
# Print the dataset summary as a quick check.
print(ds_aug)


### Juntar Dataset Aumentado e Dataset original

In [None]:
from datasets import concatenate_datasets

# Merge the original harmful rows, their paraphrases and the safe rows, then
# shuffle with a fixed seed so the order is reproducible.
ds_final = concatenate_datasets([harmful, ds_aug, safe]).shuffle(seed=42)
print(ds_final)

# Use the absolute Drive path (leading "/"), matching the path the input CSV
# is read from; the relative "content/drive/..." form resolves against the
# current working directory instead of the Drive mount.
# NOTE(review): assumes Google Drive is mounted at /content/drive — confirm.
ds_final.to_csv("/content/drive/MyDrive/JBB_dataset/train_aug.csv", index=False)
