In [1]:
import chromadb
from datasets import load_dataset
from tqdm import tqdm
import sys
import os

# Register tqdm's pandas integration (this is what provides the
# `progress_apply` method referenced by the label-mapping code further down).
tqdm.pandas()
# Put the directory two levels up on the import path. This has to run before
# the `import config` statement below — presumably config.py lives there; confirm.
sys.path.append("../..")

import config
from config import RANDOM_SEED

  from .autonotebook import tqdm as notebook_tqdm


# Load vectorDB and dataset

In [2]:
# Open the on-disk Chroma store under ../db and fetch (or create) the
# "goemotions_train" collection, requesting cosine distance for its HNSW index.
chroma_client = chromadb.PersistentClient(path="../db")
collection = chroma_client.get_or_create_collection(
    name="goemotions_train",
    metadata={"hnsw:space": "cosine"},
)

In [3]:
# Load the "simplified" GoEmotions configuration. The cache directory is
# resolved from the current user's home directory rather than being pinned to
# one machine's absolute path.
dataset = load_dataset(
    "go_emotions",
    "simplified",
    cache_dir=os.path.expanduser("~/.cache/huggingface/datasets"),
)
dataset.set_format(type="pandas")
df_train = dataset["train"][:]

# Add the human-readable label names. `get_similars` reads the `labels_text`
# column, so it must exist for the notebook to run on a fresh kernel.
# The label feature is looked up once instead of once per row.
int2label = dataset["train"].features["labels"].feature.int2str
df_train["labels_text"] = df_train["labels"].progress_apply(int2label)

In [4]:
def get_embeddings(df=df_train, idx=0):
    """Fetch the embedding stored in the Chroma collection for a single id.

    Args:
        df: Unused by this function.
        idx: Id of the record to look up; converted to ``str`` before the query.

    Returns:
        The ``"embeddings"`` entry of the ``collection.get`` response for that
        one id.
    """
    record_id = str(idx)
    response = collection.get(ids=[record_id], include=["embeddings"])
    return response["embeddings"]

In [104]:
def get_similars(df=df_train, idx=0, n_results=5, verbose=False):
    """Find the stored records closest to the record with id ``idx``.

    The embedding stored for ``idx`` is used as the query vector, so the query
    record itself is normally expected among the hits.

    Args:
        df: DataFrame used to resolve hit ids to rows by position. It must
            provide ``"text"`` and ``"labels_text"`` columns.
        idx: Id of the query record.
        n_results: Number of neighbours to request from the collection.
        verbose: When true, print each match as it is collected.

    Returns:
        A list of dicts with keys ``"idx"``, ``"text"``, ``"labels"`` and
        ``"distance"``, in the order the collection returned the hits.
    """
    query_embeddings = get_embeddings(df, idx)
    # Honour the caller's n_results instead of a hard-coded value.
    results = collection.query(query_embeddings=query_embeddings, n_results=n_results)

    matches = []
    # A single query vector is sent, so the hits are in the first result row.
    # Ids and distances are parallel lists, so they are walked together.
    for raw_id, raw_distance in zip(results["ids"][0], results["distances"][0]):
        match_idx = int(raw_id)
        distance = float(raw_distance)
        # Resolve the hit against the frame that was passed in, not a global.
        row = df.iloc[match_idx]
        text, labels = row["text"], row["labels_text"]
        matches.append(
            {
                "idx": match_idx,
                "text": text,
                "labels": labels,
                "distance": distance,
            }
        )
        if verbose:
            print(f"text: {text}\nlabels: {labels}\n distance: {distance:0.2f}\n")
    return matches

In [105]:
# Scan the first 1000 rows and keep every neighbour list in which some hit
# other than the first one lies closer than 0.1.
similars = []
for row_idx in tqdm(range(1000)):
    neighbours = get_similars(idx=row_idx, n_results=5)
    # Skip the first hit when checking distances.
    other_distances = [hit["distance"] for hit in neighbours[1:]]
    if any(dist < 0.1 for dist in other_distances):
        similars.append(neighbours)

100%|██████████| 1000/1000 [00:09<00:00, 100.20it/s]


In [106]:
# Number of scanned rows whose neighbour list was flagged by the loop above.
flagged_count = len(similars)
print(flagged_count)

54


In [107]:
def _format_match(match):
    """Render one match dict as the multi-line text block printed below."""
    return f"idx: {match['idx']}\ntext: {match['text']}\nlabels: {match['labels']}\ndistance: {match['distance']:0.2f}\n"


# Show one flagged group: its first entry, then the remaining hits.
group = similars[1]
query = group[0]
print("Query:")
print(_format_match(query))
print("*" * 80 + "\n" + "matches:\n")
for neighbour in group[1:]:
    print(_format_match(neighbour))

Query:
idx: 35
text: Happy to be able to help.
labels: ['joy']
distance: 0.00

********************************************************************************
matches:

idx: 25963
text: Happy to help
labels: ['joy']
distance: 0.07

idx: 8331
text: happy to help
labels: ['joy']
distance: 0.07

idx: 43315
text: I’m happy I can help.
labels: ['approval', 'joy']
distance: 0.07

idx: 24441
text: Happy to help!
labels: ['joy']
distance: 0.10



# Visualization

In [6]:
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objs as go

In [7]:
# Collect the stored embeddings for ids 0..4999, one collection lookup per id.
sample_ids = range(5000)
embeddings = np.array(
    [get_embeddings(df_train, sample_id) for sample_id in tqdm(sample_ids)]
)

100%|██████████| 5000/5000 [00:28<00:00, 173.28it/s]


In [9]:
# Each lookup was made for a single id, so the stacked array presumably has a
# length-1 axis per sample; drop all singleton axes.
embeddings = np.squeeze(embeddings)

In [11]:
# Project the embeddings to three dimensions with t-SNE. t-SNE is stochastic,
# so the run is seeded with RANDOM_SEED (imported from config in the first
# cell) to keep the layout reproducible across kernel restarts.
tsne = TSNE(n_components=3, n_jobs=2, verbose=True, random_state=RANDOM_SEED)
embeddings_3d = tsne.fit_transform(embeddings)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.003s...
[t-SNE] Computed neighbors for 5000 samples in 0.503s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 3.871073
[t-SNE] KL divergence after 50 iterations with early exaggeration: 86.746162
[t-SNE] KL divergence after 1000 iterations: 2.724712
