In [2]:
import chromadb
from datasets import load_dataset
from tqdm import tqdm
import sys
import json
from dataset_embeddings import get_embeddings, get_similars
import yaml
import os

# Register tqdm's pandas integration so Series/DataFrame gain `progress_apply`.
tqdm.pandas()
# Extend the module search path with the directory two levels above the current
# working directory. This runs after the imports above, so it only affects
# imports executed later in the session.
sys.path.append("../..")

# Read the shared YAML configuration; `safe_load` only builds plain Python objects.
with open("../config.yaml", "r") as f:
    config = yaml.safe_load(f)
# Seed value taken from the config file (raises KeyError if the key is absent).
RANDOM_SEED = config["RANDOM_SEED"]

# Create a database for embeddings

In [4]:
# Open the persistent Chroma store located under "db/" (relative to the working
# directory) and fetch the training collection, creating it on first use.
# The collection metadata requests cosine distance for the HNSW index.
chroma_client = chromadb.PersistentClient(path="db/")
collection = chroma_client.get_or_create_collection(
    name="goemotions_train",
    metadata={"hnsw:space": "cosine"},
)

In [6]:
# Load the "simplified" configuration of go_emotions and expose it as pandas
# objects so the train split can be pulled out as a DataFrame.
dataset = load_dataset("go_emotions", "simplified")
dataset.set_format(type="pandas")
df_train = dataset["train"][:]

# Look the label feature up once; the original lambda repeated this lookup for
# every row of the frame.
_label_feature = dataset["train"].features["labels"].feature


def int2label(x):
    """Translate the integer label id(s) in ``x`` to their string names.

    Delegates to the ``int2str`` method of the train split's label feature.
    """
    return _label_feature.int2str(x)


# Add a human-readable companion to the integer `labels` column.
df_train["labels_text"] = df_train["labels"].progress_apply(int2label)

100%|██████████| 43410/43410 [00:01<00:00, 33468.52it/s]


In [29]:
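# Extract duplicates (disabled cell): for every training row, query its 5 nearest
# neighbours and record those closer than 0.15. A later cell reads the resulting
# map from similars.json.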
# similars_dict = {}
# for i in tqdm(range(len(df_train))):
#     results = get_similars(df_train, collection, idx=i, n_results=5)
#     distances = [r["distance"] for r in results]
#     # any distance < 0.15 is considered similar, add it to the similars_dict with key i
#     results_list = [r["idx"] for r in results[1:] if r["distance"] < 0.15]
#     if results_list:
#         similars_dict[i] = results_list

In [61]:
# Load the neighbour map from similars.json (presumably written by the
# commented-out extraction loop above -- confirm before regenerating).
# JSON object keys are always strings, so they are converted back to ints here.
with open("similars.json", "r") as f:
    similars_dict = {
        int(row_idx): neighbours for row_idx, neighbours in json.load(f).items()
    }

# Shallow copy: a separate dict object that shares the same neighbour lists.
similars_dict_copy = dict(similars_dict)

In [62]:
# Greedy de-duplication. Entries are visited in dict order; an entry that has
# not already been marked is kept and every index in its neighbour list is
# marked for removal. Entries that were marked earlier are skipped, so their
# own neighbours are not marked on their behalf.
# NOTE(review): a key that appears in its own neighbour list would be marked
# too -- confirm the neighbour lists never contain the key itself.
to_be_dropped = set()
for key, values in similars_dict_copy.items():
    if key not in to_be_dropped:
        to_be_dropped.update(values)

print(len(to_be_dropped))

2275


In [69]:
# Persist the indices to drop as one comma-separated line.
# They are written in ascending order: a set has no guaranteed iteration order,
# so sorting keeps the file byte-stable across runs and easy to diff.
# An empty set produces an empty file.
with open("duplicates_cosine_n5_th15.txt", "w") as f:
    f.write(",".join(map(str, sorted(map(int, to_be_dropped)))))

In [70]:
# Read the comma-separated indices back into a set of ints.
# Blank tokens are skipped so that an empty file (written when there is nothing
# to drop) yields an empty set instead of raising ValueError on int("").
with open("duplicates_cosine_n5_th15.txt", "r") as f:
    to_be_dropped = {int(token) for token in f.read().split(",") if token.strip()}

In [72]:
# Display the DatasetDict (splits, features, row counts) before the train split is replaced.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [74]:
from datasets import Dataset

In [75]:
# Remove the rows flagged as near-duplicates, then rebuild the train split from
# the filtered frame.
#
# The drop used to be commented out, so on a fresh kernel the train split was
# rebuilt from the unfiltered frame and no duplicates were removed. The recorded
# row count below only matched because the drop had been run earlier in the
# same session.
#
# `errors="ignore"` makes the cell safe to re-run: indices that were already
# removed (or are not present) are skipped instead of raising KeyError.
df_train = df_train.drop(index=sorted(to_be_dropped), errors="ignore")

# NOTE: after filtering, the frame's index is no longer contiguous, and
# `from_pandas` keeps it as the `__index_level_0__` column (visible in the
# output below). That default is left unchanged so later consumers see the same
# columns as before.
dataset["train"] = Dataset.from_pandas(df_train)

In [76]:
# Display the DatasetDict again to check the rebuilt train split (features and row count).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id', 'labels_text', '__index_level_0__'],
        num_rows: 41135
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})