# [STARTER] Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is inside folder `project/starter/games`. Each file will become a document in the collection you'll create.
Example.:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```


### Setup

In [1]:
# Only needed for Udacity workspace

import importlib.util
import sys

# Check if 'pysqlite3' is available before importing
if importlib.util.find_spec("pysqlite3") is not None:
    import pysqlite3
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [2]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions  # per Vivek helped clear embedding function error --> I don't need to specify it so explicity
# from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv
from tavily import TavilyClient
from pathlib import Path



In [3]:
# TODO: Create a .env file with the following variables
# OPENAI_API_KEY="YOUR_KEY" - done
# CHROMA_OPENAI_API_KEY="YOUR_KEY" ->>? not found(local copy of db to be used?)
# TAVILY_API_KEY="YOUR_KEY" - done

In [4]:
# TODO: Load environment variables

# Use the .env file in the project root
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

openai_key = OPENAI_API_KEY
tavily_key = os.getenv('TAVILY_API_KEY')

assert os.getenv('OPENAI_API_KEY')
assert os.getenv('TAVILY_API_KEY')
assert openai_key is not None, "OPENAI_API_KEY missing in .env"
assert tavily_key is not None, "TAVILY_API_KEY missing in .env"
# # tavily_client = TavilyClient(api_key="tvly-YOUR_API_KEY")
# response = tavily_client.search("Who is Leo Messi?")
print(openai_key)
print(tavily_key)
# # print(response)
# # print("Environment loaded ✅")



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
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


In [5]:

import os
from openai import OpenAI

# # --- Insert your key directly or load from .env ---
# API_KEY = "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"

# Initialize OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

try:
    # Simple embedding test
    text = "Hello, this is a test for my OpenAI key."
    response = client.embeddings.create(
        model="text-embedding-3-small",  # or "text-embedding-3-large"
        input=text
    )

    # Print results
    vector = response.data[0].embedding
    print("✅ API key works! Embedding length:", len(vector))
    print("First 5 values:", vector[:5])

except Exception as e:
    print("❌ Test failed. Details:")
    print(e)


✅ API key works! Embedding length: 1536
First 5 values: [0.04916108027100563, -0.008673236705362797, 0.03864111751317978, -0.02621075138449669, 0.008768756873905659]


### VectorDB Instance

In [6]:
# TODO: Instantiate your ChromaDB Client
# Choose any path you want
# chroma_client = chromadb.Client(path="chromadb")
# chroma_client = chromadb.Client()
chroma_client = chromadb.PersistentClient(path="chromadb")  # Successful.  Allows step 20 to run.


### Collection

In [7]:
# TODO: Pick one embedding function
# If picking something different than openai, 
# make sure you use the same when loading it
# embedding_fn = embedding_functions.OpenAIEmbeddingFunction()
EMBED_MODEL = "text-embedding-3-small"


embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key = openai_key,
    model_name=EMBED_MODEL # added per assistant to solve this authentication error message.
)  # per Vivek, don't need to be so explicit.

# embedding_fn = OpenAIEmbeddingFunction(
#     api_key=os.getenv('OPENAI_API_KEY'),
#     model_name="text-embedding-3-large"  # or "text-embedding-3-large" if you want higher quality
# )




In [8]:

##targetted diagnostics


# Quick “can the model embed?” probe via OpenAI SDK directly
# This bypasses Chroma, so if it fails, the root is your OpenAI setup.

from openai import OpenAI
client = OpenAI(api_key=openai_key)

try:
    probe_text = "UdaPlay probe text"
    resp = client.embeddings.create(
        model=EMBED_MODEL,
        input=probe_text
    )
    print("OpenAI embedding probe OK. Vector length:", len(resp.data[0].embedding))
except Exception as e:
    print("OpenAI embedding probe failed:")
    traceback.print_exc()
    print("Likely causes: bad API key, wrong model name, network egress blocked.")
    # Fix: ensure OPENAI_API_KEY is valid; try EMBED_MODEL = "text-embedding-3-small"



OpenAI embedding probe OK. Vector length: 1536


In [9]:
# TODO: Create a collection
# Choose any name you want - ## used collection on the first pass.  Each time after it didn't want to work saying that it had already been created.
#if collection == "":

collection = chroma_client.get_or_create_collection(
    name = "udaplay",
    embedding_function = embedding_fn,
    metadata={"hnsw:space":"cosine"}
)


print(collection._embedding_function.name())



openai


### Add documents

In [1]:
# # Make sure you have a directory "project/starter/games"
data_dir = "games"

ids, docs, metas = [],[], []

for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(data_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        game = json.load(f)


    # Build the content string for embedding
    # content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"
    content = f"[{game.get('Platform','')}] {game.get('Name','')} ({game.get('YearOfRelease','')}) - {game.get('Description','')}"

    # Use file name (without extension) as unique ID
    doc_id = os.path.splitext(file_name)[0]
    

    # # Sanitize metadata to ensure JSON-serializable
    # safe_meta = {}
    # for k, v in game.items():
    #     try:
    #         json.dumps(v)  # test serialization
    #         safe_meta[k] = v
    #     except Exception:
    #         safe_meta[k] = str(v)

    # ids.append(doc_id)
    # docs.append(content)
    # metas.append(safe_meta)

    # Add to Chroma collection
#  - API status error message
  
# import traceback
# try:
#     collection.add(ids=[doc_id], documents=[content], metadatas=[game])
# except Exception:
#     traceback.print_exc()  # reveals the code path constructing APIStatusError


# collection.add(
#     ids=[doc_id],
#     documents=[content],
#     metadatas=[game]  # Ensure game dict is JSON-serializable
# )

# except Exception as e:
#      print(f"Failed to add {doc_id}: {e}")

# original caused unhandled authentication error.
# collection.add(
#     ids=[doc_id],
#     documents=[content],
#     metadatas=[game]
# )


try:
    collection.add(ids=[doc_id], documents=[content], metadatas=[game])
except Exception as e:
    print(f"Failed to add {doc_id}: {e}")


NameError: name 'json' is not defined