In [None]:
import pandas as pd
import gc

# Source CSV: https://huggingface.co/datasets/SetFit/toxic_conversations , file "original.csv"
# Only the comment text, the overall toxicity score ("target") and the
# per-category score columns are kept.
COLUMNS_TO_KEEP = [
    "comment_text",
    "target",
    "severe_toxicity",
    "obscene",
    "identity_attack",
    "insult",
    "threat",
    "sexual_explicit",
]

# `usecols` makes the parser skip every other column instead of loading the
# whole file and discarding most of it afterwards. read_csv ignores the order
# of `usecols`, so the frame is re-selected with the same list to keep the
# column order defined above.
df = pd.read_csv(
    "../Dataset/setfit_toxicconversations.csv",
    usecols=COLUMNS_TO_KEEP,
)[COLUMNS_TO_KEEP]

# Normalise every comment to a string without leading/trailing whitespace.
# NOTE(review): str() turns a missing value (NaN) into the literal text "nan";
# confirm that is acceptable for the downstream steps.
df["comment_text"] = df["comment_text"].map(lambda x: str(x).strip())

# df.to_csv("./dataset/output_trimmed.csv", index=False)

# Reclaim memory left over from parsing before moving on.
gc.collect()


In [None]:
# Select 32k items (32,768 rows) from each category. An item is considered
# toxic when its "target" value is >= 0.5.
offensive = df.loc[df["target"].ge(0.5)].sample(n=32768, random_state=2003)
inoffensive = df.loc[df["target"].lt(0.5)].sample(n=32768, random_state=2003)

# Stack the toxic rows first, then the non-toxic ones, with a fresh 0..N-1 index.
subset = pd.concat((offensive, inoffensive), sort=False, ignore_index=True)
# subset.to_csv("./dataset/output_subset_64k.csv", index=False)

# Drop the references to the large frames so their memory can be reclaimed.
df = offensive = inoffensive = None
gc.collect()

In [None]:
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle
import pandas as pd

# Number of feature columns ("d0" .. "d1023") written to the output table.
# The DataFrame constructor below fails if the embedding matrix has a
# different number of columns.
EMBEDDING_DIMENSIONS = 1024
model = SentenceTransformer(
    "dunzhang/stella_en_400M_v5",
    cache_folder="./models",
    device="cpu",
    trust_remote_code=True,
)

comment_texts = subset["comment_text"].tolist()

# Start the multi-process pool with the library's default target devices.
# NOTE(review): per the sentence-transformers documentation the default is
# every available CUDA device, falling back to CPU workers when none is
# available; confirm for the installed version.
pool = model.start_multi_process_pool()
try:
    # Compute the embeddings using the multi-process pool.
    # Measured on an RTX 3060: 10 min 28 s for 64k embeddings with Stella 400M v5.
    # Measured on CPU: 1 h 40 min.
    embeddings = model.encode_multi_process(
        comment_texts, pool, show_progress_bar=True
    )
finally:
    # Always shut the worker processes down, even when encoding fails, so a
    # failed run does not leave orphaned workers behind in the kernel session.
    model.stop_multi_process_pool(pool)
print("Embeddings computed. Shape:", embeddings.shape)

# Save the computed embeddings to a file so the expensive step above does not
# have to be repeated.
with open("./embeddings.pickle", "wb") as f:
    pickle.dump(embeddings, f)

X_df = pd.DataFrame(
    data=embeddings, columns=[f"d{i}" for i in range(EMBEDDING_DIMENSIONS)]
)
# concat(axis=1) aligns rows on index labels. X_df has a default 0..N-1 index,
# so the label column is given the same index to guarantee that each target is
# paired with the embedding computed from the same row, by position.
y_df = subset[["target"]].reset_index(drop=True)

dataset = pd.concat([y_df, X_df], axis=1)
dataset.to_parquet("./output_embeddings.parquet", index=False)