In [None]:
import openai
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import json
import tiktoken
import os
from dotenv import load_dotenv
load_dotenv()

# --- Embedding configuration ---
# Model name sent with every embeddings request and used to pick the tokenizer.
model = "text-embedding-3-small"
# Upper bound on tokens per input; longer texts are cut down to this many tokens.
MAX_TOKENS = 8191
# tiktoken encoding that corresponds to `model`, used to count and slice tokens.
tokenizer = tiktoken.encoding_for_model(model)

# --- OpenAI client ---
# The key is read from the environment (load_dotenv() above runs first).
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
def truncate_to_max_tokens(text, max_tokens=MAX_TOKENS):
    """Return *text* limited to at most *max_tokens* tokens.

    The text is tokenized with the module-level ``tokenizer``. If it already
    fits, it is returned unchanged; otherwise the token list is cut to its
    first ``max_tokens`` entries and decoded back to a string.

    Args:
        text: The string to measure and, if needed, shorten.
        max_tokens: Maximum number of tokens to keep (defaults to MAX_TOKENS).

    Returns:
        The original string when it fits, else the decoded token prefix.
    """
    # disallowed_special=() makes tiktoken treat strings such as
    # "<|endoftext|>" as ordinary text instead of raising ValueError, which
    # matters for scraped documents that may contain them verbatim.
    tokens = tokenizer.encode(text, disallowed_special=())
    if len(tokens) <= max_tokens:
        # Nothing to cut: skip the pointless decode round-trip.
        return text

    # A token boundary can fall inside a multi-byte UTF-8 character. The
    # default errors="replace" would append U+FFFD for the dangling bytes;
    # "ignore" drops the incomplete trailing character instead.
    return tokenizer.decode(tokens[:max_tokens], errors="ignore")

# Load the parsed documents. Each entry is read with .get() for the
# "title", "text" and "url" keys below.
with open("parsed_docs_cleaned.json", "r", encoding="utf-8") as f:
    docs = json.load(f)

# Size of the vectors requested from the embeddings endpoint.
EMBEDDING_DIMENSIONS = 128

embeddings = []
failed_count = 0  # documents skipped because preparing/embedding them raised

# 🔁 Loop through each doc
for doc in tqdm(docs, desc="Embedding documents"):
    title = doc.get("title", "")
    raw_text = doc.get("text", "")

    try:
        # Literal backslash-n sequences in the parsed text become real
        # newlines before the text is truncated to the token limit.
        full_text = f"{title}\n\n{raw_text}".replace("\\n", "\n").strip()
        # Kept inside the try so a tokenizer error on one document is
        # reported and skipped like an API error, not fatal to the run.
        full_text = truncate_to_max_tokens(full_text)

        response = client.embeddings.create(
            input=full_text,
            model=model,
            dimensions=EMBEDDING_DIMENSIONS,
        )
        embedding_vector = response.data[0].embedding
    except Exception as e:
        # Deliberate best-effort: report the failure and move on so one bad
        # document does not discard the embeddings already collected.
        failed_count += 1
        print(f"⚠️ Error embedding doc: {doc.get('url')}\n{e}")
        continue

    embeddings.append({
        "doc_id": doc.get("url"),
        "title": title,
        "text": full_text,
        "embedding": embedding_vector
    })

# 💾 Save the result
with open("document_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(embeddings, f, indent=2, ensure_ascii=False)

if failed_count:
    # Make partial results visible instead of hiding them behind the
    # success message below.
    print(f"⚠️ {failed_count} of {len(docs)} documents failed and were skipped")

print("✅ Embedding complete and saved to document_embeddings.json")


Embedding documents: 100%|██████████| 165/165 [01:13<00:00,  2.25it/s]

✅ Embedding complete and saved to document_embeddings.json



