In [None]:
from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("all-MiniLM-L6-v2")  # 8 seconds for 20k rationales == 2500 rationales per second
model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)  # 46 seconds for 20k rationales == 435 rationales per second

In [None]:
import pandas as pd

annotation_paths = {
    "test": [
        "/pfss/mlde/workspaces/mlde_wsp_KIServiceCenter/finngu/LlavaGuard/src/experiments/datasets/celeba/results/7B_25_03_06_01/annotations_compressed/test.parquet"
    ],
    "train": [
        "/pfss/mlde/workspaces/mlde_wsp_KIServiceCenter/finngu/LlavaGuard/src/experiments/datasets/celeba/results/7B_25_03_06_01/annotations_compressed/train.part_1_of_2.parquet",
        "/pfss/mlde/workspaces/mlde_wsp_KIServiceCenter/finngu/LlavaGuard/src/experiments/datasets/celeba/results/7B_25_03_06_01/annotations_compressed/train.part_2_of_2.parquet"
    ],
    "valid": [
        "/pfss/mlde/workspaces/mlde_wsp_KIServiceCenter/finngu/LlavaGuard/src/experiments/datasets/celeba/results/7B_25_03_06_01/annotations_compressed/valid.parquet"
    ]
}

df = pd.read_parquet(annotation_paths["train"][0])
display(df)

In [None]:
def generate_embeddings(df):
    task = "separation"

    rationale_embeddings = model.encode(
        df["rationale"].values,
        task=task,
        prompt_name=task,
    )

    df["rationale_embedding"] = rationale_embeddings.tolist()
    return df

In [None]:
import os

def save_embeddings(df, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_parquet(path)
    print(f"Saved embeddings for {path}")

In [None]:
import pandas as pd

for split in annotation_paths:
    for annotation_path in annotation_paths[split]:
        df = pd.read_parquet(annotation_path)
        df = generate_embeddings(df)
        save_embeddings(df, annotation_path.replace("annotations_compressed", "annotations_with_embeddings"))


In [None]:
import numpy as np
import pandas as pd
from cuml.cluster import HDBSCAN

df = pd.read_parquet("/pfss/mlde/workspaces/mlde_wsp_KIServiceCenter/finngu/LlavaGuard/src/experiments/datasets/celeba/results/7B_25_03_06_01/annotations_with_embeddings/test.parquet")

# clusterer = HDBSCAN(min_cluster_size=10)

clusterer = HDBSCAN(
    min_cluster_size=400,  # Increase to merge smaller clusters
    min_samples=50,        # Increase to avoid over-fragmentation
    cluster_selection_epsilon=0.1,  # Allows merging of nearby clusters
)

# Retrieve embeddings and convert them to a numpy array of shape (N, d) where N is the number of embeddings and d is the dimensionality of the embeddings
embeddings = np.array([embedding for embedding in df["rationale_embedding"].values])
print(embeddings.shape)
# (19962, 1024)

labels = clusterer.fit_predict(embeddings)
display(labels)

# Save the cluster labels next to the embeddings
df["cluster_label"] = labels
df.to_parquet("/pfss/mlde/workspaces/mlde_wsp_KIServiceCenter/finngu/LlavaGuard/src/experiments/datasets/celeba/results/7B_25_03_06_01/annotations_with_embeddings/test_with_clusters.parquet")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_parquet("/pfss/mlde/workspaces/mlde_wsp_KIServiceCenter/finngu/LlavaGuard/src/experiments/datasets/celeba/results/7B_25_03_06_01/annotations_with_embeddings/test_with_clusters.parquet")

# Order the cluster labels by the number of rationales in each cluster and show the ten biggest clusters
cluster_sizes = df["cluster_label"].value_counts()
print(cluster_sizes[:10])

# Print ten random rationales from cluster "-1"
print("\n".join(df[df["cluster_label"] == -1].sample(10)["rationale"].values))

sns.histplot(df["cluster_label"], discrete=True)
plt.show()

In [1]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


In [6]:
test = summarizer(" ".join([
    "I like cats.",
    "I like dogs.",
    "I like animals.",
    "I dislike Peter.",
    "I dislike John.",
    "I dislike people.",
    "I like to eat.",
    "I like to sleep.",
    "I like to relax.",
    "I dislike to work.",
    "I dislike to study.",
    "I dislike to exercise.",
    "I like to play.",
    "I like to watch.",
    "I like to listen.",
    "I dislike to talk.",
    "My favorite color is blue.",
    "Seeing blue things makes me happy.",
    "I like the sky.",
    "I like the ocean a lot.",
]), max_length=100, min_length=50)

display(test)

[{'summary_text': "My favorite color is blue. Seeing blue things makes me happy. I like the ocean a lot. I dislike Peter. I disliked John. I don't like to work. I hate to exercise. I love to eat. I'm not a fan of people."}]

In [2]:
import pandas as pd

def summarize_texts(texts, max_length=100):
    combined_text = " ".join(texts[:1000])  # Limiting input size
    print(f"Summarizing {len(combined_text)} characters")
    return summarizer(combined_text, max_length=max_length, min_length=50, do_sample=False)[0]['summary_text']
    

df = pd.read_parquet("/pfss/mlde/workspaces/mlde_wsp_KIServiceCenter/finngu/LlavaGuard/src/experiments/datasets/celeba/results/7B_25_03_06_01/annotations_with_embeddings/test_with_clusters.parquet")
clustered_rationales = df.groupby("cluster_label")["rationale"].apply(list).to_dict()

# for cluster, texts in clustered_rationales.items():
#     print(f"Cluster {cluster}: {len(texts)} rationales")

# Generate summaries per cluster
cluster_summaries = {cluster: summarize_texts(texts) for cluster, texts in clustered_rationales.items()}

for cluster, summary in cluster_summaries.items():
    print(f"Cluster {cluster}: {summary}")

Summarizing 443492 characters


../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [0,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [0,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [0,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [0,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [0,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [0,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [0,0,0], thread: 

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
