# 1 Imports

In [1]:
import os
import re
import math
import json
from tqdm import tqdm
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# 2 Environment Configuration

In [2]:
load_dotenv()
os.environment = os.getenv("OPENAI_API_KEY")
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

In [3]:
hf_token = os.getenv("HF_TOKEN")
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# 3 Load Data from HuggingFace

In [25]:
# With train.pkl in this folder, you can run this:

with open('train.pkl', 'rb') as file:
    train = pickle.load(file)

# 4 Create Chroma DB

In [6]:
path_db = "products_vectorstore"
client = chromadb.PersistentClient(path=path_db)

In [7]:
collection_name = "products"

# Agora list_collections retorna apenas strings com os nomes das coleções
existing_collection_names = client.list_collections()

if collection_name in existing_collection_names:
    client.delete_collection(collection_name)
    print(f"Deleted existing collection: {collection_name}")

collection = client.create_collection(collection_name)

Deleted existing collection: products


# 5 Load Embedder

In [8]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [9]:
vector = model.encode(["Well hi there"])[0]

In [10]:
len(vector)

384

In [11]:
def description(item):
    text = item.prompt.replace("How much does this cost to the nearest dollar?\n\n", "")
    return text.split("\n\nPrice is $")[0]

In [12]:
description(train[0])

Empty

In [19]:
for i in tqdm(range(0, len(train), 1000)):
    documents = [description(item) for item in train[i: i+1000]]
    vectors = model.encode(documents).astype(float).tolist()
    metadatas = [{"category": item.category, "price": item.price} for item in train[i: i+1000]]
    ids = [f"doc_{j}" for j in range(i, i+1000)]
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=vectors,
        metadatas=metadatas
    )

  0%|          | 0/400 [00:00<?, ?it/s]


ValueError: Expected document to be a str, got Empty in add.