### Necessary Imports

In [9]:
import chromadb
from chromadb.utils import embedding_functions
import json
import os
from openai import OpenAI
from dotenv import load_dotenv

### Load Text Datasets

This cell loads the preprocessed mobile plan datasets from JSONL files into memory for further indexing.

In [10]:
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (typically a trailing newline at the end of the file) are
        # not JSON documents; skip them instead of letting json.loads fail.
        return [json.loads(line) for line in f if line.strip()]
    

# Parse both preprocessed datasets into in-memory lists of records.
plan_prices = load_jsonl(file_path="data/plan_prices_texts.jsonl")
brand_migration_deals = load_jsonl(file_path="data/brand_migration_deals_texts.jsonl")

### Initialize ChromaDB

Set up a persistent Chroma database and create (or retrieve) the collections for plan prices and brand migration deals.

In [11]:
# Client backed by the on-disk store under chromadb/mobile_advisor.
chroma_client = chromadb.PersistentClient(path="chromadb/mobile_advisor")

# NOTE(review): no embedding function is supplied here, and both names are
# re-assigned in the embedding cell further down — confirm that creating the
# collections at this point is intended.
plan_prices_collection = chroma_client.get_or_create_collection(name="plan_prices")
brand_migrations_collection = chroma_client.get_or_create_collection(name="brand_migration_deals")

### Embed and Index Datasets

This cell connects to the OpenAI API to generate embeddings using text-embedding-3-large, creates or retrieves collections with embedding support, and indexes all plan texts along with their metadata.

In [12]:
# Load variables from a .env file (python-dotenv), then read the OpenAI key
# from the environment; api_key is None when the variable is not set.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")


def get_or_create_collection(client, name, embedding_function):
    """Return the collection called ``name``, creating it when it does not exist.

    Args:
        client: Chroma client exposing ``list_collections``, ``get_collection``
            and ``create_collection``.
        name: Name of the collection to look up or create.
        embedding_function: Embedding function attached to the returned
            collection handle, whether the collection already existed or not.

    Returns:
        The existing or newly created collection.
    """
    # Depending on the Chroma release, list_collections() yields Collection
    # objects or plain name strings; accept either form.
    existing_collections = {getattr(c, "name", c) for c in client.list_collections()}

    if name in existing_collections:
        print(f"Collection: {name} already exists.")
        # Attach the embedding function on retrieval as well, so the returned
        # handle does not depend on whatever get_collection uses when none is given.
        # NOTE(review): a collection first created with a different embedding
        # function still holds vectors from that model (and some Chroma versions
        # may reject the mismatch) — recreate/re-index it if so; confirm.
        return client.get_collection(name=name, embedding_function=embedding_function)

    print(f"Collection: {name} created")
    return client.create_collection(name=name, embedding_function=embedding_function)
    

# Embedding function that calls OpenAI's text-embedding-3-large model.
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-large",
    api_key=api_key,
)

# Re-bind both collection handles through the helper defined in this cell.
plan_prices_collection = get_or_create_collection(
    client=chroma_client,
    name="plan_prices",
    embedding_function=embedding_function,
)
brand_migrations_collection = get_or_create_collection(
    client=chroma_client,
    name="brand_migration_deals",
    embedding_function=embedding_function,
)


def index_dataset(collection, data):
    """Add the records in ``data`` to ``collection``, skipping ids already stored.

    Each record must be a mapping with a ``"text"`` entry and a ``"metadata"``
    entry; the metadata must contain ``"provider"`` and ``"plan_name"``, which
    are joined as ``"{provider}_{plan_name}"`` to form the record id.

    Re-running the cell against a collection that already holds these ids is a
    no-op: only records whose id is not yet present are passed to
    ``collection.add``, so existing entries are not submitted again.

    Args:
        collection: Collection exposing ``get(ids=...)`` and
            ``add(ids=..., documents=..., metadatas=...)``.
        data: Sequence of records as described above; may be empty.

    Returns:
        None.
    """
    if not data:
        # Nothing to index; avoid calling add() with empty lists.
        return

    texts = [d["text"] for d in data]
    metadatas = [d["metadata"] for d in data]
    ids = [f"{m['provider']}_{m['plan_name']}" for m in metadatas]

    # Ask the collection which of these ids it already holds.
    already_indexed = set(collection.get(ids=ids)["ids"])
    pending = [
        (item_id, text, metadata)
        for item_id, text, metadata in zip(ids, texts, metadatas)
        if item_id not in already_indexed
    ]
    if not pending:
        return

    new_ids, new_texts, new_metadatas = (list(column) for column in zip(*pending))
    collection.add(ids=new_ids, documents=new_texts, metadatas=new_metadatas)


# Index each dataset into its matching collection.
index_dataset(collection=plan_prices_collection, data=plan_prices)
index_dataset(collection=brand_migrations_collection, data=brand_migration_deals)

Collection: plan_prices already exists.
Collection: brand_migration_deals already exists.


### Update Metadata

This cell defines a helper function that updates the metadata of a specific entry in a Chroma collection without re-embedding the text.

In [13]:
def update_metadata(collection, item_id, new_metadata):
    """Replace the metadata of one collection entry, if that entry exists.

    Looks the id up first; when the collection reports no match, a message is
    printed and nothing is changed. Otherwise ``collection.update`` is called
    with only the id and the new metadata, and a confirmation is printed.

    Args:
        collection: Collection exposing ``get(ids=...)`` and
            ``update(ids=..., metadatas=...)``.
        item_id: Id of the entry to modify.
        new_metadata: Metadata passed to ``collection.update`` for that entry.

    Returns:
        None.
    """
    lookup = collection.get(ids=[item_id])

    if lookup["ids"]:
        collection.update(ids=[item_id], metadatas=[new_metadata])
        print(f"Metadata for {item_id} updated.")
    else:
        print(f"No entry found for ID {item_id}")
