# OpenAI Embeddings — Test Notebook

Test the OpenAI embedding & ingestion pipeline: config loading, document parsing,
embedding generation, and Pinecone upsert.

Sections 1–2 run without API keys. Section 3 requires an OpenAI API key, and
Section 4 requires both OpenAI and Pinecone API keys.

In [None]:
import sys
from pathlib import Path

# Resolve the project root. When the notebook is started from inside a folder
# named "openai", the root is two levels up; otherwise the current working
# directory is used as the root.
_launch_dir = Path.cwd()
if _launch_dir.name == "openai":
    PROJECT_ROOT = _launch_dir.parent.parent
else:
    PROJECT_ROOT = _launch_dir

# Put the root at the front of the import path, but only once, so re-running
# this cell does not add duplicate entries.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

print(f"Project root: {PROJECT_ROOT}")

## 1. Configuration & Model Info

Test loading config and checking available models.

In [None]:
from tools.openai.OpenAI_embeddings import MODELS, load_config

# Print one summary line per entry in MODELS: key, model name, vector size,
# and description.
print("Available models:")
for model_key, model_info in MODELS.items():
    model_name = model_info['name']
    model_dims = model_info['dimensions']
    model_summary = model_info['description']
    print(f"  {model_key}: {model_name} ({model_dims} dims) — {model_summary}")

In [None]:
# Load config (if available) and print a short summary of what it contains.
# The API key itself is never printed, only whether one is set.
config_path = str(PROJECT_ROOT / "_config files" / "config.json")

try:
    # Guard only the load itself. NOTE(review): load_config presumably calls
    # sys.exit() when the config is missing or invalid, which is why
    # SystemExit is caught here — confirm against its implementation.
    cfg, raw_config = load_config(config_path)
except SystemExit as e:
    print(f"Config not available: {e}")
else:
    # `or {}` also covers an "openai" key that is present but null, which
    # `.get("openai", {})` would return as None and then fail on `.get()`.
    openai_cfg = raw_config.get("openai") or {}
    print(f"Pinecone index: {cfg.index_name}")
    print(f"Namespace:      {cfg.namespace}")
    print(f"Embedding model: {openai_cfg.get('embedding_model', 'not set')}")
    print(f"OpenAI key:     {'set' if openai_cfg.get('api_key') else 'missing'}")

## 2. Parse Knowledge Base Documents

Test parsing .docx files from the test data folder (`__test_data`), and parsing
inline knowledge-base text that needs no file.

In [None]:
from tools.pinecone.parser import parse_docx, parse_kb_text

# Look for .docx fixtures anywhere under the project's test data folder and
# list them relative to the project root.
test_data_dir = PROJECT_ROOT / "__test_data"
if not test_data_dir.exists():
    print(f"Test data directory not found: {test_data_dir}")
else:
    # Sorted so the listing is stable between runs.
    docx_files = sorted(test_data_dir.rglob("*.docx"))
    print(f"Found {len(docx_files)} .docx file(s):")
    for docx_path in docx_files:
        print(f"  {docx_path.relative_to(PROJECT_ROOT)}")

In [None]:
# Parse one test .docx file and preview the first few chunks
# (requires python-docx). Nothing is printed when the test data directory
# does not exist.
if test_data_dir.exists():
    docx_files = sorted(test_data_dir.rglob("*.docx"))
    if not docx_files:
        print("No .docx files found")
    else:
        # Prefer a file whose name contains "knowledgeBase" (the knowledge
        # base, not the system message); fall back to the first file found.
        kb_file = [path for path in docx_files if "knowledgeBase" in path.name]
        target = docx_files[0] if not kb_file else kb_file[0]

        try:
            chunks = parse_docx(str(target))
            print(f"Parsed {len(chunks)} chunk(s) from {target.name}:\n")

            preview_limit = 3  # Show first 3
            for entry in chunks[:preview_limit]:
                print(f"  ID:    {entry['id']}")
                print(f"  Type:  {entry.get('type', '')}")
                print(f"  Title: {entry.get('title', '')}")
                print(f"  Text:  {entry['text'][:100]}...")
                print()

            # Report how many chunks were left out of the preview, if any.
            hidden_count = len(chunks) - preview_limit
            if hidden_count > 0:
                print(f"  ... and {hidden_count} more chunk(s)")
        except ImportError:
            print("Install python-docx: pip install python-docx")

In [None]:
# Parse inline KB text (no file needed).
# Each record is a run of KB_ID / TYPE / TITLE / TEXT fields closed by a
# "--- KB_CHUNK_END ---" line.
sample_kb = """
KB_ID: demo-001
TYPE: product
TITLE: Midnight Velvet Dress
TEXT:
A stunning floor-length velvet dress in deep midnight blue.
Available in sizes XS-XL. Price: $289.99.
--- KB_CHUNK_END ---

KB_ID: demo-002
TYPE: policy
TITLE: Gift Wrapping
TEXT:
We offer complimentary gift wrapping on all orders.
Select the gift wrap option at checkout.
--- KB_CHUNK_END ---
"""

demo_chunks = parse_kb_text(sample_kb)
print(f"Parsed {len(demo_chunks)} demo chunk(s):")
for demo_chunk in demo_chunks:
    # One line per chunk: id, title, and the first 60 characters of its text.
    chunk_id = demo_chunk['id']
    chunk_title = demo_chunk['title']
    text_preview = demo_chunk['text'][:60]
    print(f"  [{chunk_id}] {chunk_title}: {text_preview}...")

## 3. Generate Embeddings (requires OpenAI API key)

Create embedding vectors from the parsed chunks.

In [None]:
from tools.openai.OpenAI_embeddings import make_embed_fn

# Template for a live embedding call. It is left commented out so the notebook
# can run without credentials; uncomment the lines below to try it.
#
# Set your OpenAI API key (do not save or commit the notebook with a real key)
# OPENAI_API_KEY = "sk-..."

# Build an embedding function for the chosen model, embed one sample question,
# and inspect the resulting vector's length and first values.
# embed_fn = make_embed_fn(OPENAI_API_KEY, "text-embedding-3-small")
# vector = embed_fn("What is the return policy?")
# print(f"Embedding dimensions: {len(vector)}")
# print(f"First 5 values: {vector[:5]}")

# Until the template above is enabled, this cell only prints a reminder.
print("Uncomment and set OPENAI_API_KEY to test embeddings.")

## 4. Full Pipeline: Parse, Embed, Upsert (requires both API keys)

Run the complete ingestion pipeline.

In [None]:
from tools.pinecone.vector_store import VectorStore
from tools.pinecone.config import PineconeConfig

# Template for the end-to-end run; uncomment after setting up config.json.
# It relies on names from earlier cells: load_config and config_path
# (Section 1), demo_chunks (Section 2) and make_embed_fn (Section 3).

# Load the config and build the embedding function from the stored OpenAI key
# cfg, raw_config = load_config(config_path)
# openai_key = raw_config["openai"]["api_key"]
# embed_fn = make_embed_fn(openai_key, "text-embedding-3-small")

# Create the vector store with the loaded config and the embedding function
# store = VectorStore(cfg, embed_fn=embed_fn)

# # Upsert demo chunks
# store.upsert_texts(demo_chunks)
# print(f"Upserted {len(demo_chunks)} chunks")

# # Query: print the score, title and first 100 characters of each match
# results = store.query_text("Do you gift wrap?", top_k=2)
# for r in results:
#     print(f"  Score: {r['score']:.4f} — {r['metadata'].get('title', '')}")
#     print(f"  {r['metadata'].get('text', '')[:100]}")

# # Stats
# print(f"\nIndex stats: {store.stats()}")

# Until the template above is enabled, this cell only prints a reminder.
print("Uncomment after setting up config.json with valid API keys.")