In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import chromadb
import plotly.express as px

# Visualize the movie embeddings stored in a Chroma collection by projecting
# them to 2-D with PCA and with t-SNE, then showing one interactive scatter
# plot per projection (movie title on hover).

MODEL_NAME = "msmarco-MiniLM-L-6-v3"
# MODEL_NAME = "all-mpnet-base-v2"  # alternative embedding model
RANDOM_STATE = 42     # fixed seed so both projections are reproducible across runs
TSNE_PERPLEXITY = 40  # upper bound; clamped below for small collections

# The Chroma store lives in a directory named after the model with dashes removed.
flatModelName = MODEL_NAME.replace('-', '')
path = f'./{flatModelName}'
chroma_client = chromadb.PersistentClient(path=path)

chroma_collection = chroma_client.get_or_create_collection(name="movies",
    metadata={"hnsw:space": "cosine"})

allMovies = chroma_collection.get(include=["embeddings", 'metadatas'])

n_movies = len(allMovies['embeddings'])
print(f"Number of movies: {n_movies}")

# get_or_create_collection silently creates an empty collection when the path
# or name is wrong; fail here with a clear message instead of deep inside PCA.
if n_movies < 2:
    raise ValueError(
        f"Collection 'movies' at {path!r} holds {n_movies} embedding(s); "
        "at least 2 are needed for a 2-D projection."
    )

# Create a dataframe (one row per movie, one column per embedding dimension)
df = pd.DataFrame(allMovies['embeddings'])

# Snapshot the raw embeddings BEFORE any derived columns are appended to df.
# Both reducers must see exactly this matrix; fitting t-SNE on df.values after
# the PCA columns were added would feed the PCA coordinates back in as features.
embedding_matrix = df.to_numpy(copy=True)

# PCA for dimensionality reduction
pca = PCA(n_components=2, random_state=RANDOM_STATE)  # reduce to 2 dimensions for visualization
pca_result = pca.fit_transform(embedding_matrix)

# t-SNE for dimensionality reduction
# perplexity must be smaller than the number of samples, hence the clamp.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# switch the keyword when upgrading past the deprecation window.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=min(TSNE_PERPLEXITY, n_movies - 1),
    n_iter=300,
    random_state=RANDOM_STATE,
)
tsne_results = tsne.fit_transform(embedding_matrix)

# Attach the 2-D coordinates and titles only after both projections are done.
df['pca-one'] = pca_result[:, 0]
df['pca-two'] = pca_result[:, 1]
df['tsne-one'] = tsne_results[:, 0]
df['tsne-two'] = tsne_results[:, 1]
df['title'] = [m['title'] for m in allMovies['metadatas']]

# Plotting using Plotly
fig_pca = px.scatter(df, x='pca-one', y='pca-two', hover_data=['title'])
fig_pca.update_layout(title='PCA of Movie Dataset', xaxis_title='PCA 1', yaxis_title='PCA 2')

fig_tsne = px.scatter(df, x='tsne-one', y='tsne-two', hover_data=['title'])
fig_tsne.update_layout(title='t-SNE of Movie Dataset', xaxis_title='t-SNE 1', yaxis_title='t-SNE 2')

# Display the plots
fig_pca.show()
fig_tsne.show()


Number of movies: 17209
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 17209 samples in 0.019s...
[t-SNE] Computed neighbors for 17209 samples in 1.082s...
[t-SNE] Computed conditional probabilities for sample 1000 / 17209
[t-SNE] Computed conditional probabilities for sample 2000 / 17209
[t-SNE] Computed conditional probabilities for sample 3000 / 17209
[t-SNE] Computed conditional probabilities for sample 4000 / 17209
[t-SNE] Computed conditional probabilities for sample 5000 / 17209
[t-SNE] Computed conditional probabilities for sample 6000 / 17209
[t-SNE] Computed conditional probabilities for sample 7000 / 17209
[t-SNE] Computed conditional probabilities for sample 8000 / 17209
[t-SNE] Computed conditional probabilities for sample 9000 / 17209
[t-SNE] Computed conditional probabilities for sample 10000 / 17209
[t-SNE] Computed conditional probabilities for sample 11000 / 17209
[t-SNE] Computed conditional probabilities for sample 12000 / 17209
[t-SNE] Computed conditio