In [2]:
from tqdm import tqdm

import polars as pl
import chromadb
from chromadb.utils import embedding_functions
#from more_itertools import batched

# Glob pattern for the input CSV file(s); handed to pl.scan_csv further down.
DATA_PATH = "archive/*.csv"

In [3]:
import os
from dotenv import load_dotenv
import json

# Load environment variables via python-dotenv before reading the key.
load_dotenv()

# GOOGLE_API_KEY as found in the process environment (None when it is unset).
api_key = os.environ.get("GOOGLE_API_KEY")

# Chroma embedding function backed by Google Generative AI.
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=api_key,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Model years to keep; rows whose parsed year is not listed here are dropped.
vehicle_years = [2017]

# Explicit column types for the CSV file(s).
# NOTE(review): the "" key targets the unnamed first column — presumably a
# row index written by the exporter; confirm against the raw files.
dtypes = {
    "": pl.Int64,
    "Review_Date": pl.Utf8,
    "Author_Name": pl.Utf8,
    "Vehicle_Title": pl.Utf8,
    "Review_Title": pl.Utf8,
    "Review": pl.Utf8,
    "Rating": pl.Float64,
}

# Lazily scan the car reviews dataset(s); nothing is materialised until
# .collect() below. `schema_overrides` is the current name of the deprecated
# `dtypes` keyword, which emitted a warning on this call.
car_reviews = pl.scan_csv(DATA_PATH, schema_overrides=dtypes)

# Space-separated tokens of the vehicle title, shared by both derived columns.
title_tokens = pl.col("Vehicle_Title").str.split(by=" ")

# Derive Vehicle_Year (first title token, cast to Int64) and Vehicle_Model
# (second title token), keep only the selected years, project the columns
# that are stored, and order the rows by model then rating.
car_review_db_data = (
    car_reviews.with_columns(
        [
            title_tokens.list.get(0).cast(pl.Int64).alias("Vehicle_Year"),
            title_tokens.list.get(1).alias("Vehicle_Model"),
        ]
    )
    .filter(pl.col("Vehicle_Year").is_in(vehicle_years))
    .select(["Review_Title", "Review", "Rating", "Vehicle_Year", "Vehicle_Model"])
    .sort(["Vehicle_Model", "Rating"])
    .collect()
)

# Create ids, documents, and metadatas data in the format chromadb expects.
# Ids are positional ("review0", "review1", ...) in the sorted frame's order.
ids = [f"review{i}" for i in range(car_review_db_data.shape[0])]
# The review text itself is the document to embed.
documents = car_review_db_data["Review"].to_list()
# Every remaining column becomes per-document metadata (one dict per row).
metadatas = car_review_db_data.drop("Review").to_dicts()


  car_reviews = pl.scan_csv(DATA_PATH, dtypes=dtypes)


In [5]:
# Display the first metadata record as a quick sanity check.
metadatas[0]

{'Review_Title': 'No beeping alerts',
 'Rating': 1.0,
 'Vehicle_Year': 2017,
 'Vehicle_Model': 'Dodge'}

In [6]:
# Map each review id to its metadata dict and review text by walking the
# three parallel lists in lockstep.
json_data = {
    review_id: {"metadata": review_meta, "review": review_text}
    for review_id, review_text, review_meta in zip(ids, documents, metadatas)
}

In [7]:
# Render the review mapping as JSON text with a 4-space indent.
json_object = json.dumps(json_data, indent=4)

# Save the rendered JSON text to archive/car_reviews.json.
with open("archive/car_reviews.json", mode="w") as outfile:
    outfile.write(json_object)

In [8]:
# Read archive/car_reviews.json back and parse it; json_object now holds the
# decoded mapping rather than the JSON text.
with open("archive/car_reviews.json", mode="r") as openfile:
    json_object = json.loads(openfile.read())

In [9]:
# Show the top-level keys of the reloaded JSON mapping.
json_object.keys()

dict_keys(['review0', 'review1', 'review2', 'review3', 'review4', 'review5', 'review6', 'review7', 'review8', 'review9', 'review10', 'review11', 'review12', 'review13', 'review14', 'review15', 'review16', 'review17', 'review18', 'review19', 'review20', 'review21', 'review22', 'review23', 'review24', 'review25', 'review26', 'review27', 'review28', 'review29', 'review30', 'review31', 'review32', 'review33', 'review34', 'review35', 'review36', 'review37', 'review38', 'review39', 'review40', 'review41', 'review42', 'review43', 'review44', 'review45', 'review46', 'review47', 'review48', 'review49', 'review50'])

In [10]:
# Path handed to chromadb.PersistentClient.
CHROMA_PATH = "car_review_embeddings"
# Model name handed to the SentenceTransformer embedding function.
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
# Name of the Chroma collection that stores the reviews.
COLLECTION_NAME = "car_reviews"

In [11]:
# Persistent Chroma client rooted at CHROMA_PATH.
chroma_client = chromadb.PersistentClient(CHROMA_PATH)

# Sentence-transformers model used to embed the review documents.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME
)

# get_or_create_collection rather than create_collection: the client is
# persistent, so on any run after the first the collection already exists
# and create_collection would raise instead of returning it.
# NOTE(review): the metadata is expected to take effect only when the
# collection is first created — confirm if the distance metric ever changes.
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)

# Positional indices of the documents; not used within this cell.
document_indices = list(range(len(documents)))

# upsert rather than add so that re-running the cell against an already
# populated collection rewrites the same ids instead of colliding with them.
collection.upsert(
    ids=ids,
    documents=documents,
    metadatas=metadatas,
)

