In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from pypdf import PdfReader
import numpy as np
from pathlib import Path


In [None]:
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Validate the key BEFORE constructing the client, so a missing key fails here
# with this notebook's own message instead of surfacing from inside the SDK.
assert os.getenv("OPEN_AI_API_KEY"), "❌ OPEN_AI_API_KEY not found"

client = OpenAI(api_key=os.getenv("OPEN_AI_API_KEY"))
print("✅ API key loaded")


In [None]:
# Root of the input corpus. The path is relative, so it is resolved against the
# kernel's current working directory at the time the files are listed.
DATA_DIR = Path("../data")

def load_documents():
    """Load every PDF and plain-text file under ``DATA_DIR``.

    PDFs are read from ``DATA_DIR / "pdf_files"`` (``*.pdf``) and text files
    from ``DATA_DIR / "text_files"`` (``*.txt``, decoded as UTF-8). Files are
    visited in sorted path order within each folder so that the resulting
    document order is the same on every run and every machine; a plain
    ``glob`` makes no ordering promise.

    Returns:
        list[dict]: one ``{"source": <file name>, "text": <full text>}`` entry
        per file, PDFs first, then text files.
    """
    docs = []

    # PDFs
    pdf_dir = DATA_DIR / "pdf_files"
    for pdf in sorted(pdf_dir.glob("*.pdf")):
        reader = PdfReader(pdf)
        # A page whose extraction yields nothing contributes an empty string,
        # so the join never sees a non-string value.
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
        docs.append({"source": pdf.name, "text": text})

    # TXTs
    txt_dir = DATA_DIR / "text_files"
    for txt in sorted(txt_dir.glob("*.txt")):
        text = txt.read_text(encoding="utf-8")
        docs.append({"source": txt.name, "text": text})

    return docs

documents = load_documents()

# load_documents() returns an empty list when nothing matches (e.g. DATA_DIR
# is wrong for the kernel's working directory); stop here rather than report
# success on an empty corpus.
assert documents, f"❌ No documents found under {DATA_DIR}"
print(f"✅ Loaded {len(documents)} documents")


In [None]:
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into fixed-size, overlapping character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    Splitting stops as soon as a window reaches the end of the text, so the
    result never ends with a chunk that is only a suffix of the chunk before it.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks in document order (empty list for empty text).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would make the loop run forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # This window already covers the tail; any further window would be
        # entirely contained in it.
        if end >= len(text):
            break
    return chunks


In [None]:
# Flatten all documents into chunks. `chunks` and `metadata` are parallel
# lists: metadata[i] is the source file name of chunks[i].
chunks = []
metadata = []

for doc in documents:
    for chunk in chunk_text(doc["text"]):
        # Whitespace-only chunks (e.g. from PDF pages with no extractable
        # text) carry no content worth embedding; skip them and keep the two
        # lists aligned.
        if not chunk.strip():
            continue
        chunks.append(chunk)
        metadata.append(doc["source"])

print(f"✅ Created {len(chunks)} chunks")


In [None]:
def embed_texts(texts, batch_size=256):
    """Embed strings with the ``text-embedding-3-small`` model.

    The inputs are sent in batches of at most ``batch_size`` items instead of
    one request for the whole corpus, because the embeddings endpoint limits
    how much a single request may carry (see the OpenAI API reference).

    Args:
        texts: An iterable of strings, or a single string (treated as a
            one-element batch, as when passed straight to the API).
        batch_size: Maximum number of inputs per request; must be > 0.

    Returns:
        list: One embedding (as returned in ``response.data[i].embedding``)
        per input, in input order. Empty input returns ``[]`` without
        contacting the API.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Keep a bare string as ONE input; list("abc") would split it per character.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    vectors = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=texts[start:start + batch_size],
        )
        vectors.extend(e.embedding for e in response.data)
    return vectors


In [None]:
# Embed every chunk via embed_texts, then stack the vectors into a NumPy
# array so that row i corresponds to chunks[i].
embeddings = embed_texts(chunks)
embedding_matrix = np.asarray(embeddings)

print("✅ Embeddings created")
print("Embedding shape:", embedding_matrix.shape)
