In [None]:
import os
from dotenv import load_dotenv

# Change to the 10_Modal directory, but only when we are not already inside it.
# os.chdir changes process-wide state, so an unguarded call raises
# FileNotFoundError when this cell is run a second time in the same kernel.
if os.path.basename(os.getcwd()) != "10_Modal":
    os.chdir("10_Modal")

# NOTE(review): this variable is read at interpreter start-up, so setting it here
# presumably targets subprocesses launched later rather than this kernel - confirm.
os.environ["PYTHONIOENCODING"] = "utf-8"

# Load variables from a .env file; override=True lets them replace values
# that are already present in the environment.
load_dotenv(override=True)

In [None]:
import pickle

# Load the pickled training items from disk.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
with open('train_lite.pkl', mode='rb') as pickle_file:
    train = pickle.load(pickle_file)

In [None]:
# Display the prompt text of the first training item as a quick sanity check.
train[0].prompt

In [None]:
import chromadb

# Path handed to Chroma as the location of its persistent store.
DB = "products_vectorstore"
# Persistent client backed by the DB path above.
client = chromadb.PersistentClient(path=DB)

In [None]:
# Start from a clean slate: delete the collection if it already exists, then recreate it.
collection_name = "products"

# Depending on the installed chromadb release, list_collections() yields either
# Collection objects or plain name strings. Normalise to names so the membership
# test below works with both; otherwise an existing collection goes undetected
# and create_collection() fails.
existing_collection_names = [
    getattr(entry, "name", entry) for entry in client.list_collections()
]
if collection_name in existing_collection_names:
    client.delete_collection(collection_name)
    print(f"Deleted existing collection: {collection_name}")

collection = client.create_collection(collection_name)

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; its encode() method is used below to vectorise text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Pass in a list of texts, get back a numpy array of vectors;
# indexing with [0] picks out the vector for the single text encoded here.

vector = model.encode(["Well hi there"])[0]
# Display the embedding so its values can be inspected.
vector

In [None]:
def description(item):
    """Return the descriptive part of ``item.prompt``.

    Every occurrence of the pricing question is removed from the prompt, and
    the result is cut off at the first "Price is $" marker (the marker and
    everything after it are dropped). If the marker is absent, the whole
    remaining text is returned.
    """
    question = "How much does this cost to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"
    without_question = item.prompt.replace(question, "")
    body, _, _ = without_question.partition(price_marker)
    return body

In [None]:
# Try the helper on the first training item to see the cleaned-up text.
description(train[0])

In [None]:
from tqdm import tqdm

# Embed the training items and add them to the Chroma collection in batches.
BATCH_SIZE = 1000

for start in tqdm(range(0, len(train), BATCH_SIZE)):
    # Slice once and reuse, so documents and metadatas always describe the same items.
    batch = train[start: start + BATCH_SIZE]
    documents = [description(item) for item in batch]
    vectors = model.encode(documents).astype(float).tolist()
    metadatas = [{"category": item.category, "price": item.price} for item in batch]
    # Build ids from the real batch length: the last batch is shorter whenever
    # len(train) is not a multiple of BATCH_SIZE, and the ids must line up
    # one-to-one with the documents, embeddings and metadatas being added.
    ids = [f"doc_{j}" for j in range(start, start + len(batch))]
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=vectors,
        metadatas=metadatas
    )

In [None]:
# Category names and their plot colours; the two lists are paired by position.
CATEGORIES = [
    'Appliances',
    'Automotive',
    'Cell_Phones_and_Accessories',
    'Electronics',
    'Musical_Instruments',
    'Office_Products',
    'Tools_and_Home_Improvement',
    'Toys_and_Games',
]
COLORS = [
    'red',
    'blue',
    'brown',
    'orange',
    'yellow',
    'green',
    'purple',
    'cyan',
]

In [None]:
import numpy as np

# Read the stored embeddings, documents and metadata back out of the collection
# (no id or filter arguments are passed to get()).
result = collection.get(include=['embeddings', 'documents', 'metadatas'])

documents = result['documents']
vectors = np.array(result['embeddings'])

# One category per record, then the colour that shares that category's position
# in CATEGORIES.
categories = [meta['category'] for meta in result['metadatas']]
colors = [COLORS[CATEGORIES.index(category)] for category in categories]

In [None]:
from sklearn.manifold import TSNE

# 2D
# Reduce the embedding vectors to two components with t-SNE.
# random_state is fixed at 42 and n_jobs=-1 is passed through to TSNE.
tsne2d = TSNE(n_components=2, random_state=42, n_jobs=-1)
reduced_vectors2d = tsne2d.fit_transform(vectors)

In [None]:
import plotly.graph_objects as go

# Create the 2D scatter plot: one marker per document, coloured via `colors`,
# with the first 100 characters of the document shown on hover.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors2d[:, 0],
    y=reduced_vectors2d[:, 1],
    mode='markers',
    marker=dict(size=3, color=colors, opacity=0.7),
    text=[f"<br>Text: {d[:100]}..." for d in documents],
    hoverinfo='text'
)])

# Axis titles for a 2D figure live on layout.xaxis / layout.yaxis.
# `scene` only configures 3D scene subplots, so titles placed there
# never appear on a go.Scatter plot.
fig.update_layout(
    title='2D Chroma Vectorstore Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=1200,
    height=800,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# 3D
# Same reduction as the 2D case, but with three components for the 3D plot.

tsne3d = TSNE(n_components=3, random_state=42, n_jobs=-1)
reduced_vectors3d = tsne3d.fit_transform(vectors)

In [None]:
# Create the 3D scatter plot
# Hover label for each point: the first 100 characters of its document.
hover_text = [f"<br>Text: {d[:100]}..." for d in documents]

trace3d = go.Scatter3d(
    x=reduced_vectors3d[:, 0],
    y=reduced_vectors3d[:, 1],
    z=reduced_vectors3d[:, 2],
    mode='markers',
    marker={'size': 3, 'color': colors, 'opacity': 0.7},
    text=hover_text,
    hoverinfo='text',
)
fig = go.Figure(data=[trace3d])

# For a 3D trace the axis titles are configured on the scene.
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=1200,
    height=800,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()