This is a simple illustration of how to build a vector database from the embeddings of a collection of documents, so that it can be queried later.

In [12]:
from tqdm import tqdm

import polars as pl
import chromadb
from chromadb.utils import embedding_functions
#from more_itertools import batched

# Glob pattern for the car review CSV files; passed to pl.scan_csv below.
DATA_PATH = "archive/*.csv"

In [13]:
import os
from dotenv import load_dotenv
import json

# Load environment variables via python-dotenv, then look up the Google API
# key; api_key is None when GOOGLE_API_KEY is not set.
load_dotenv()
api_key = os.environ.get("GOOGLE_API_KEY")

# Optional Google embedding function (left disabled in this notebook):
#google_ef  = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=api_key)

### Building the Dataset for Embedding

We need three parallel lists:
- ids : a unique ID for each document
- documents : the actual content, from which the embeddings are built
- metadata : a list of dictionaries whose keys can be used to narrow down our queries

In [14]:
# Only reviews whose vehicle year is in this list are kept.
vehicle_years = [2017]

# Explicit column types for the review CSVs. The "" key is the column whose
# header is the empty string.
dtypes = {
    "": pl.Int64,
    "Review_Date": pl.Utf8,
    "Author_Name": pl.Utf8,
    "Vehicle_Title": pl.Utf8,
    "Review_Title": pl.Utf8,
    "Review": pl.Utf8,
    "Rating": pl.Float64,
}

# Lazily scan the car reviews dataset(s); nothing is read until .collect().
# `schema_overrides` replaces the deprecated `dtypes` argument of scan_csv.
car_reviews = pl.scan_csv(DATA_PATH, schema_overrides=dtypes)

# Vehicle_Title is split on spaces once and the tokens reused for both new
# columns: token 0 is parsed as the year, token 1 is stored as "Vehicle_Model"
# (e.g. 'Dodge' for the first review printed further down).
title_tokens = pl.col("Vehicle_Title").str.split(by=" ")

# Add the year/model columns, keep only the selected years and the columns
# needed for the database, then sort so that row order (and therefore the
# generated ids) follows model and rating.
car_review_db_data = (
    car_reviews.with_columns(
        [
            title_tokens.list.get(0).cast(pl.Int64).alias("Vehicle_Year"),
            title_tokens.list.get(1).alias("Vehicle_Model"),
        ]
    )
    .filter(pl.col("Vehicle_Year").is_in(vehicle_years))
    .select(["Review_Title", "Review", "Rating", "Vehicle_Year", "Vehicle_Model"])
    .sort(["Vehicle_Model", "Rating"])
    .collect()
)

# Create ids, documents, and metadatas data in the format chromadb expects:
# three parallel lists where position i describes the same review.
documents = car_review_db_data["Review"].to_list()
ids = [f"review{i}" for i in range(len(documents))]
metadatas = car_review_db_data.drop("Review").to_dicts()


  car_reviews = pl.scan_csv(DATA_PATH, dtypes=dtypes)


We can print the first element of the dataset

In [15]:
# Show the id, review text and metadata of the first review, one per line.
print(ids[0], documents[0], metadatas[0], sep="\n")

review0
 On my trip to Maui I rented this van to drive to see if I would buy one in the future.    The handling and acceleration was decent for the  van with 4 kids.  It was my second day and was still driving it around and enjoying my vacation.    Was not aware of all the functions on the key controls.   Just lock and unlock.  I must have accidentally hit the open truck key and was not aware of it beenopen.  There was no beeping sound or flashing alert on the dash like my Toyota van.  I drove the car out of the car port and the trunk window got shattered because it was to high.   I believe this to be a design default.    My lost
{'Review_Title': 'No beeping alerts', 'Rating': 1.0, 'Vehicle_Year': 2017, 'Vehicle_Model': 'Dodge'}


In [16]:
# Save the data in the format chromadb expects so another project can reuse it.
# Each review id maps to {"metadata": <metadata dict>, "review": <review text>}.
review_records = (
    {"metadata": meta, "review": text} for text, meta in zip(documents, metadatas)
)
json_data = dict(zip(ids, review_records))

json_object = json.dumps(json_data, indent=4)

# Write the serialized reviews to archive/car_reviews.json
with open("archive/car_reviews.json", "w") as outfile:
    outfile.write(json_object)

#### Building the DB

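We can now instantiate the Chroma client and create a collection.

- The client takes a path where the database is persisted
- The collection takes a name
- The collection also takes an embedding function
    - For this exercise we use a SentenceTransformer model, loaded through ChromaDB's built-in embedding functions
    - However we can use Google or OpenAI as well

After creating the collection we just feed it the dataset and the embeddings are stored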

In [19]:
# Path handed to chromadb.PersistentClient in the next cell.
CHROMA_PATH = "car_review_embeddings"
# Passed as model_name to SentenceTransformerEmbeddingFunction in the next cell.
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
# Name of the Chroma collection that is deleted (if present) and re-created.
COLLECTION_NAME = "car_reviews"

In [20]:
# Client for the Chroma database located at CHROMA_PATH.
chroma_client = chromadb.PersistentClient(CHROMA_PATH)

# Embeddings are computed with the SentenceTransformer model named by
# EMBEDDING_FUNC_NAME.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME
)

# Drop any previous collection of the same name so that re-running this cell
# rebuilds it from scratch instead of failing in create_collection.
# NOTE(review): the exception raised for a missing collection may differ
# between chromadb releases -- confirm ValueError for the installed version.
try:
    chroma_client.delete_collection(name=COLLECTION_NAME)
except ValueError:
    print("Collection does not exist. Creating Now")

collection = chroma_client.create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)

# Number of reviews sent to Chroma per add() call. Chroma rejects add() calls
# above the client's maximum batch size; this value is presumably below that
# limit -- TODO confirm against the limit reported by the installed client.
ADD_BATCH_SIZE = 166

document_indices = list(range(len(documents)))

# Insert in batches: every ADD_BATCH_SIZE-th index is the start of a batch.
# An empty dataset simply results in no add() calls.
for start_idx in tqdm(document_indices[::ADD_BATCH_SIZE], desc="Adding reviews"):
    end_idx = start_idx + ADD_BATCH_SIZE
    collection.add(
        ids=ids[start_idx:end_idx],
        documents=documents[start_idx:end_idx],
        metadatas=metadatas[start_idx:end_idx],
    )