## Clustering Embeddings and Analysis

In [None]:
# In this notebook we retrieve previously uploaded embeddings from Qdrant and plot elbow (WCSS) and silhouette scores.
# We also reduce dimensionality with UMAP and compare the plots obtained by clustering the raw embeddings directly
# against those obtained by clustering the UMAP embeddings.

In [None]:
import os
import cv2
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
import seaborn as sns
from tqdm import tqdm
from PIL import Image
import multiprocessing
import tensorflow as tf
import umap.umap_ as umap
import matplotlib.pyplot as plt
from torchvision import transforms
from sklearn.cluster import KMeans
import matplotlib.patches as mpatches
from tensorboard.plugins import projector
from sklearn.metrics import silhouette_score
from torch.utils.data import Dataset, DataLoader
%matplotlib inline

## Retrieve Embeddings from qdrant

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
# Qdrant server address. Set the QDRANT_URL environment variable to point the
# notebook at a different instance; when it is unset the original host is used.
qdrant = QdrantClient(url=os.environ.get('QDRANT_URL', 'http://132.186.158.40:6333/'))

In [None]:
def retrieve_points(collection_name, client, num_vectors):
    """Issue one ``scroll`` request against ``collection_name`` and return its result.

    Args:
        collection_name: Name of the collection to read from.
        client: Object exposing a ``scroll`` method (a ``QdrantClient`` in this notebook).
        num_vectors: Forwarded as the ``limit`` of the scroll request.

    Returns:
        Whatever ``client.scroll`` returns, unmodified. Callers in this notebook
        read the records from element ``[0]`` of that result.
    """
    scroll_request = {
        "collection_name": collection_name,
        "scroll_filter": None,   # no filter: every point is eligible
        "limit": num_vectors,
        "with_payload": True,    # ask for payloads as well as vectors
        "with_vectors": True,
    }
    return client.scroll(**scroll_request)

# One scroll request per collection. The limits (1501 / 1500) are presumably the
# number of points stored in each collection -- TODO confirm against the upload step.
ret_points0 = retrieve_points(collection_name="check1", client=qdrant, num_vectors=1501)
ret_points1 = retrieve_points(collection_name="check2", client=qdrant, num_vectors=1500)

In [None]:
def extract_embeddings(points):
    """Collect the vector of every record in a scroll result.

    Args:
        points: Indexable whose first element is an iterable of records, each
            exposing a ``vector`` attribute (the value returned by
            ``retrieve_points`` in this notebook).

    Returns:
        list: The ``vector`` of each record, in iteration order. Empty when
        there are no records.
    """
    # Record ids are deliberately not collected: per the original author's note,
    # the id is just the index of the retrieved vector
    # (int(record.payload['text'].split()[1])).
    return [record.vector for record in points[0]]

# Pull the vectors out of each scroll result, then build one combined list
# (first collection's vectors followed by the second's).
ret_embeddings0 = extract_embeddings(ret_points0)
ret_embeddings1 = extract_embeddings(ret_points1)
tot_embeddings = [*ret_embeddings0, *ret_embeddings1]

## Get clustering-related scores for both the raw embeddings and the reduced-dimension UMAP embeddings

In [None]:
def calculate_metrics(embeddings, k_range, random_state=0):
    """Fit K-Means for each candidate cluster count and collect clustering scores.

    Args:
        embeddings: Samples to cluster (a list of vectors or an array-like
            accepted by ``KMeans.fit``).
        k_range: Iterable of cluster counts to try. Note that scikit-learn's
            ``silhouette_score`` raises for fewer than 2 distinct labels.
        random_state: Seed forwarded to ``KMeans`` for reproducibility.
            Defaults to 0, the value that was previously hard-coded.

    Returns:
        tuple: ``(wcss_scores, silhouette_scores, cluster_labels)`` -- three
        lists with one entry per value in ``k_range``, in order: the fitted
        model's ``inertia_``, the silhouette score of its labelling, and its
        ``labels_`` array.
    """
    # Convert plain Python sequences once up front so they are not re-converted
    # by every fit / silhouette call. Objects that already carry a shape
    # (ndarray, sparse matrix, DataFrame) are passed through untouched.
    data = embeddings if hasattr(embeddings, "shape") else np.asarray(embeddings)

    wcss_scores = []
    silhouette_scores = []
    cluster_labels = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        kmeans.fit(data)
        cluster_labels.append(kmeans.labels_)
        wcss_scores.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(data, kmeans.labels_))
    return wcss_scores, silhouette_scores, cluster_labels

In [None]:
# Score K-Means on the original (non-reduced) embeddings for k = 2..19.
wcss_scores, silhouette_scores, cluster_labels = calculate_metrics(
    embeddings=tot_embeddings,
    k_range=range(2, 20),
)

In [None]:
def reduce_dim(embeddings, num_dim, seed):
    """Fit UMAP on ``embeddings`` and return their ``num_dim``-component projection.

    Args:
        embeddings: Samples to fit the reducer on and transform.
        num_dim: Passed to ``umap.UMAP`` as ``n_components``.
        seed: Passed to ``umap.UMAP`` as ``random_state``.

    Returns:
        The result of ``fit_transform`` on a freshly constructed reducer.
    """
    return umap.UMAP(n_components=num_dim, random_state=seed).fit_transform(embeddings)

# Project the combined embeddings down to 3 UMAP components with a fixed seed.
umap_embeddings = reduce_dim(tot_embeddings, num_dim=3, seed=42)

In [None]:
# Same k = 2..19 sweep, this time on the UMAP-reduced embeddings.
umap_wcss_scores, umap_silhouette_scores, umap_cluster_labels = calculate_metrics(
    embeddings=umap_embeddings,
    k_range=range(2, 20),
)

## Plot scores

In [None]:
# Silhouette score of the raw-embedding clustering for each k in 2..19.
fig, ax = plt.subplots(figsize=(10, 6))
# WCSS curve is currently left out of this figure:
# ax.plot(range(2, 20), wcss_scores, marker='o', label='WCSS')
ax.plot(range(2, 20), silhouette_scores, marker='o', label='Silhouette Score')
ax.set(
    xlabel='Number of Clusters (k)',
    ylabel='Score',
    title='Silhouette Score',
)
ax.legend()
ax.grid(True)
plt.show()