In [15]:
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import uuid
import pprint as pp


In [16]:
import torch
# Restrict PyTorch to a single CPU thread for intra-op parallelism.
# NOTE(review): the motivation is not stated in this notebook — presumably to
# avoid CPU oversubscription alongside other libraries; confirm before changing.
torch.set_num_threads(1)

In [68]:
# Install necessary packages (uncomment the line below if running in a new environment).
# Use the %pip magic so the packages are installed into the running kernel's environment.
# %pip install sentence-transformers hdbscan

import json
from sentence_transformers import SentenceTransformer
import hdbscan
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
import numpy as np

# Load the JSON data from map.json (path is relative to the working directory).
# get_cluster() reads a "fact" key from every element it is given, so `data`
# is expected to be a sequence of dicts carrying that key — TODO confirm
# against the actual contents of map.json.
with open("map.json", "r") as f:
    data = json.load(f)

# Custom cosine distance function
def cosine_distance(u, v):
    """Return the cosine distance ``1 - cos(u, v)`` between two vectors.

    Args:
        u: Array-like vector.
        v: Array-like vector of the same length as ``u``.

    Returns:
        A float in ``[0.0, 2.0]``. Cosine similarity is undefined when either
        vector has zero norm; in that case ``1.0`` is returned (the vectors
        are treated as orthogonal) instead of propagating NaN into the
        clustering step.
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0.0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 1.0

    similarity = np.dot(u, v) / norm_product
    # Rounding error can push the similarity marginally outside [-1, 1],
    # which would otherwise produce a tiny negative "distance".
    return float(1.0 - np.clip(similarity, -1.0, 1.0))


# Loaded embedding models keyed by model name, so that calling get_cluster()
# repeatedly (e.g. in a loop) does not reload the weights on every call.
_EMBEDDING_MODELS = {}


def _load_embedding_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODELS[model_name]


def _build_cluster_records(data, labels):
    """Convert per-item cluster labels into the list-of-dicts output format."""
    grouped = {}
    for item, label in zip(data, labels):
        grouped.setdefault(int(label), []).append(item)

    # Label -1 marks noise; every noise item becomes its own single-item cluster.
    noise_items = grouped.pop(-1, [])

    # Real clusters first (in ascending label order), then the noise singletons.
    member_lists = [grouped[label] for label in sorted(grouped)]
    member_lists.extend([item] for item in noise_items)

    return [
        {"cluster_id": cluster_id, "atomic facts": members}
        for cluster_id, members in enumerate(member_lists, start=1)
    ]


def get_cluster(
    data,
    model_name='all-MiniLM-L6-v2',
    min_cluster_size=2,
    cluster_selection_epsilon=0.2,
    output_path="clustered.json",
):
    """Cluster items by the embedding similarity of their "fact" text.

    Each item's ``"fact"`` value is embedded with a SentenceTransformer model,
    the embeddings are L2-normalized, and HDBSCAN groups them using
    ``cosine_distance`` as the metric.

    Args:
        data: Iterable of dicts, each containing a ``"fact"`` key.
        model_name: SentenceTransformer model to embed with.
        min_cluster_size: Passed to ``hdbscan.HDBSCAN``.
        cluster_selection_epsilon: Passed to ``hdbscan.HDBSCAN``.
        output_path: File the result is written to as JSON. Pass ``None`` to
            skip writing. Defaults to ``"clustered.json"``.

    Returns:
        A list of ``{"cluster_id": int, "atomic facts": [item, ...]}`` dicts.
        ``cluster_id`` values are sequential starting at 1: HDBSCAN clusters
        come first in ascending label order, followed by one single-item
        cluster per noise point.

    Raises:
        KeyError: If an item has no ``"fact"`` key.
    """
    data = list(data)
    facts = [item["fact"] for item in data]

    if len(facts) < max(2, min_cluster_size):
        # Too few items for any cluster of min_cluster_size to exist, so every
        # item is noise. Short-circuiting also avoids handing empty or
        # single-row input to normalize()/HDBSCAN and skips the model load.
        cluster_labels = [-1] * len(facts)
    else:
        model = _load_embedding_model(model_name)

        # Compute embeddings for each fact (as a numpy array), then scale
        # every row to unit length.
        embeddings = model.encode(facts, convert_to_numpy=True)
        embeddings = normalize(embeddings)

        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            metric=cosine_distance,
            cluster_selection_epsilon=cluster_selection_epsilon,
        )
        cluster_labels = clusterer.fit_predict(embeddings)

    final_clusters = _build_cluster_records(data, cluster_labels)

    # Persist the clustered result as JSON (skipped when output_path is None).
    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(final_clusters, f, indent=2)

    return final_clusters

In [69]:
import pipeline


In [70]:
# Cluster the current contents of `data` and save a copy of the result.
# Note: get_cluster() also writes its result to "clustered.json" on its own,
# so this cell produces two files with the same clusters.
res = get_cluster(data)
with open("clustered_test.json", "w") as f:
    json.dump(res, f, indent=2)

# The count includes the single-item clusters get_cluster() creates for noise points.
print(f"there are {len(res)} clusters")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.76it/s]


there are 92 clusters


In [62]:
# Inputs for the variation-generation experiment: an image URL, the prompt
# that accompanies it, and the list of models to query.

# Alternative input (disabled): blurry graph
# image = "https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_val_00000943.jpg"
# prompt = "Describe the information shown in this screen"

# Active input: art
image = "https://m.media-amazon.com/images/I/71jD1RoJ7jL._AC_UL320_.jpg"
prompt = "Describe the art for me. I am thinking of purchasing it and put it in my living room. What do you think?"



# Models handed to pipeline.variation_generation; the commented-out lines are
# other combinations kept for reference.
# models = ["gpt","gemini","claude"]
# models = ["gpt","gemini"]
models = ["gpt"]

In [71]:
# For trial sizes 1 through 10: obtain atomic facts from
# pipeline.variation_generation, cluster them, and report how many clusters
# there are relative to the number of facts.
# NOTE(review): the meaning of the positional arguments (i, "original", "./")
# is defined in pipeline.py, which is not visible here — confirm before changing.
# Side effects: every get_cluster() call overwrites "clustered.json", and `res`
# is rebound here (another cell uses `res` for a list of clusters). After the
# loop, `res` and `final_clusters` hold the values from the last trial size.
for i in range(1,11):
    _, res = pipeline.variation_generation(image, i, models, "original", prompt, "./", source="url")
    final_clusters = get_cluster(res)
    print (f"for trial size {i}, there are {len(final_clusters)} clusters. In total, there are {len(res)} atomic facts, ratio is {len(final_clusters)/len(res)}")
    
    


1


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.86it/s]


for trial size 1, there are 9 clusters. In total, there are 28 atomic facts, ratio is 0.32142857142857145
2


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 2/2 [00:00<00:00,  4.74it/s]


for trial size 2, there are 20 clusters. In total, there are 52 atomic facts, ratio is 0.38461538461538464
3


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 3/3 [00:01<00:00,  2.85it/s]


for trial size 3, there are 30 clusters. In total, there are 67 atomic facts, ratio is 0.44776119402985076
4


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 3/3 [00:00<00:00,  5.04it/s]


for trial size 4, there are 31 clusters. In total, there are 91 atomic facts, ratio is 0.34065934065934067
5


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 4/4 [00:00<00:00,  6.76it/s]


for trial size 5, there are 47 clusters. In total, there are 113 atomic facts, ratio is 0.415929203539823
6


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 5/5 [00:00<00:00,  6.02it/s]


for trial size 6, there are 59 clusters. In total, there are 143 atomic facts, ratio is 0.4125874125874126
7


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 5/5 [00:00<00:00,  5.87it/s]


for trial size 7, there are 42 clusters. In total, there are 142 atomic facts, ratio is 0.29577464788732394
8


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 5/5 [00:00<00:00,  5.41it/s]


for trial size 8, there are 81 clusters. In total, there are 158 atomic facts, ratio is 0.5126582278481012
9


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.85it/s]


for trial size 9, there are 105 clusters. In total, there are 217 atomic facts, ratio is 0.4838709677419355
10


INFO:pipeline:Descriptions generated
INFO:pipeline:Descriptions broken down
INFO:pipeline:Atomic facts broken down
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


for trial size 10, there are 97 clusters. In total, there are 205 atomic facts, ratio is 0.47317073170731705


In [65]:
# Re-cluster whatever `res` currently holds and save the result.
# NOTE(review): `res` is not assigned in this cell; it relies on kernel state
# left behind by another cell, so the output depends on which cell ran last
# (and this fails on a fresh kernel if none of them has run yet).
final_clusters = get_cluster(res)

# save the final clusters
with open("final_clustered.json", "w") as f:
    json.dump(final_clusters, f, indent=2)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 4/4 [00:00<00:00,  5.30it/s]


In [72]:
# Write a slimmed-down copy of ./data/map/descriptions.json that keeps only the
# response id, the description text, and the model of each entry.

with open("./data/map/descriptions.json", "r") as f:
    data = json.load(f)

# The loaded JSON is a mapping; only its values are used, its keys are dropped.
new_data = []
for item in data.values():
    new_data.append(dict(
        response_id=item["id"],
        description=item["description"],
        model=item["model"],
    ))

with open("description_map.json", "w") as f:
    json.dump(new_data, f, indent=2)