
# 🧭 Personal Codex Agent — End‑to‑End (Notebook)

This notebook lets you run the entire project in one place:

1. Install dependencies
2. Configure either **OpenAI** _or_ **Azure OpenAI**
3. Load your personal docs (CV + 2–3 files) from `data/`
4. Build a **FAISS** index with embeddings
5. Ask questions about yourself with **RAG**

> **Tip:** Put your files (`.txt`, `.md`, or `.pdf`) inside a local `data/` folder next to this notebook before running.


## 1) Install dependencies

In [5]:

# If running in a fresh environment, install requirements.
# You can safely re-run this cell.
import sys, subprocess

def pip_install(pkgs):
    """Install packages with pip, using the interpreter that runs this notebook.

    Args:
        pkgs: Iterable of pip requirement strings (e.g. ``"numpy>=1.26"``).

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a
            non-zero status.
    """
    # Going through `sys.executable -m pip` targets the kernel's own environment.
    command = [sys.executable, "-m", "pip", "install"]
    command.extend(pkgs)
    subprocess.check_call(command)

# Everything the notebook imports later must be listed here so a fresh
# environment survives "Restart Kernel -> Run All".
pip_install([
    "openai>=1.40",
    "python-dotenv>=1.0",
    "faiss-cpu",
    "pypdf",
    "numpy>=1.26",
    "tiktoken>=0.7",
    "tqdm",  # progress bar used by the index-building cell
])
print("✅ Installed / verified packages.")


✅ Installed / verified packages.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## 2) Configure provider (OpenAI **or** Azure OpenAI)

In [6]:

import os
from getpass import getpass

# Backend selection: "openai" (platform.openai.com) or "azure" (Azure OpenAI).
PROVIDER = "openai"  # change to "azure" if you want to use Azure OpenAI

# Fail fast on a typo so an unexpected value does not silently fall through
# to the Azure branch and start prompting for Azure settings.
if PROVIDER not in ("openai", "azure"):
    raise ValueError(f"Unknown PROVIDER {PROVIDER!r}; expected 'openai' or 'azure'.")

if PROVIDER == "openai":
    print("🔑 Using OpenAI (platform.openai.com)")
    if not os.getenv("OPENAI_API_KEY"):
        # getpass hides the key while typing; strip whitespace/newlines that
        # often come along with a pasted key and break authentication.
        os.environ["OPENAI_API_KEY"] = getpass("Paste your OPENAI_API_KEY: ").strip()
    # (Optional) choose models via the MODEL / EMBED_MODEL environment variables
    CHAT_MODEL = os.getenv("MODEL", "gpt-4o-mini")
    EMBED_MODEL = os.getenv("EMBED_MODEL", "text-embedding-3-small")
else:
    print("🔑 Using Azure OpenAI")
    # You need an Azure OpenAI resource with deployments already created.
    def prompt_if_missing(var, prompt):
        """Return env var `var`, asking the user for it (and storing it) when unset or empty."""
        if not os.getenv(var):
            os.environ[var] = input(prompt).strip()
        return os.getenv(var)

    AZURE_OPENAI_ENDPOINT = prompt_if_missing("AZURE_OPENAI_ENDPOINT", "Azure endpoint (e.g. https://YOUR_RESOURCE.openai.azure.com): ")
    AZURE_OPENAI_API_VERSION = prompt_if_missing("AZURE_OPENAI_API_VERSION", "API version (e.g. 2024-06-01): ")
    # The key is read with getpass (not input) so it is never echoed into the notebook output.
    AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY") or getpass("Paste your AZURE_OPENAI_API_KEY: ").strip()
    os.environ["AZURE_OPENAI_API_KEY"] = AZURE_OPENAI_API_KEY
    # Deployment names you created in Azure OpenAI Studio
    AZURE_OPENAI_DEPLOYMENT = prompt_if_missing("AZURE_OPENAI_DEPLOYMENT", "Chat deployment name (e.g. gpt-4o-mini): ")
    AZURE_OPENAI_EMBED_DEPLOYMENT = prompt_if_missing("AZURE_OPENAI_EMBED_DEPLOYMENT", "Embedding deployment name (e.g. text-embedding-3-small): ")
    # With Azure, the `model` argument of API calls is the deployment name.
    CHAT_MODEL = AZURE_OPENAI_DEPLOYMENT
    EMBED_MODEL = AZURE_OPENAI_EMBED_DEPLOYMENT

print("✅ Config complete.")


🔑 Using OpenAI (platform.openai.com)
✅ Config complete.


## 3) Helpers to load docs and chunk text

In [7]:

import re
from pathlib import Path
from pypdf import PdfReader

def load_texts(dirpath: str):
    """Yield ``(doc_id, text)`` for the .txt, .md and .pdf files directly inside ``dirpath``.

    Files are visited in sorted path order and ``doc_id`` is the bare file
    name. Sub-directories are not searched; other file types are skipped.

    Args:
        dirpath: Directory holding the documents.

    Yields:
        Tuples of ``(file name, extracted text)``. PDF pages are joined with
        single newlines; a page without extractable text contributes "".

    Notes:
        A missing directory or an unreadable PDF only prints a ``[warn]``
        line, so one bad file does not abort the whole load.
    """
    p = Path(dirpath)
    if not p.is_dir():
        print(f"[warn] Data directory not found: {p}")
        return
    for fp in sorted(p.glob("*")):
        # A directory named e.g. "notes.txt" must not be read as a file.
        if not fp.is_file():
            continue
        suffix = fp.suffix.lower()
        if suffix in (".txt", ".md"):
            # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on
            # Windows) would mangle non-ASCII text. Undecodable bytes are dropped.
            yield fp.name, fp.read_text(encoding="utf-8", errors="ignore")
        elif suffix == ".pdf":
            try:
                reader = PdfReader(str(fp))
                pages = [page.extract_text() or "" for page in reader.pages]
            except Exception as e:
                print(f"[warn] Could not read {fp.name}: {e}")
                continue
            yield fp.name, "\n".join(pages)

def chunk(text: str, max_tokens: int = 400):
    """Split ``text`` into chunks of at most ``max_tokens`` words.

    "Tokens" here are whitespace-separated words, not model tokens, so the
    limit is only a rough proxy for the embedding model's real token count.

    Paragraphs (separated by blank lines) are packed greedily into a chunk
    until adding the next one would exceed the limit. A single paragraph that
    is longer than the limit is cut into consecutive windows of ``max_tokens``
    words, so text without blank lines (typical for extracted PDFs) cannot
    produce one oversized chunk. Blank paragraphs are skipped.

    Args:
        text: Raw document text.
        max_tokens: Maximum number of words per chunk. A non-positive value
            disables the splitting of long paragraphs.

    Yields:
        Chunk strings; packed paragraphs are joined with a single space.
    """
    # Collapse runs of 3+ newlines so they count as one paragraph break.
    text = re.sub(r"\n{3,}", "\n\n", text)
    buf, count = [], 0
    for part in text.split("\n\n"):
        tokens = part.split()
        if not tokens:
            continue  # nothing but whitespace; would only produce empty chunks
        if 0 < max_tokens < len(tokens):
            # Oversized paragraph: flush what is pending, emit the full
            # windows, and keep the last window open so short paragraphs that
            # follow can still be packed with it.
            if buf:
                yield " ".join(buf)
            windows = [tokens[i:i + max_tokens] for i in range(0, len(tokens), max_tokens)]
            for window in windows[:-1]:
                yield " ".join(window)
            buf, count = [" ".join(windows[-1])], len(windows[-1])
            continue
        if count + len(tokens) > max_tokens and buf:
            yield " ".join(buf)
            buf, count = [], 0
        buf.append(part)
        count += len(tokens)
    if buf:
        yield " ".join(buf)

# Status line printed once the helper definitions in this cell have run.
print("✅ Utils ready.")


✅ Utils ready.


## 4) Build the FAISS index from `data/`

In [8]:

import numpy as np
import faiss, pickle
from tqdm import tqdm

# Choose client based on provider
if PROVIDER == "openai":
    from openai import OpenAI
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
else:
    from openai import AzureOpenAI
    client = AzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    )

# Split every document into chunks; each record remembers its source file so
# the position of a vector in the index maps back to records[position].
records = []
for doc_id, text in load_texts("data"):
    for c in chunk(text, max_tokens=400):
        c = c.strip()
        if c:
            records.append({"doc_id": doc_id, "text": c})

# Stop with a readable message instead of an opaque IndexError further down
# (an empty vector list has no second dimension to size the index with).
if not records:
    raise RuntimeError(
        "No text chunks found in ./data - add .txt, .md or .pdf files and re-run this cell."
    )

print(f"Loaded {len(records)} chunks. Embedding…")

vectors = []
BATCH = 64  # number of chunks sent per embeddings request
for i in tqdm(range(0, len(records), BATCH)):
    batch = [r["text"] for r in records[i:i+BATCH]]
    resp = client.embeddings.create(model=EMBED_MODEL, input=batch)
    # Order by each item's `index` field so vectors[j] always belongs to records[j].
    for d in sorted(resp.data, key=lambda item: item.index):
        vectors.append(d.embedding)

if len(vectors) != len(records):
    raise RuntimeError(
        f"Embedding count mismatch: got {len(vectors)} vectors for {len(records)} chunks."
    )

# FAISS expects a 2-D float32 matrix: one row per chunk.
xb = np.array(vectors, dtype="float32")
index = faiss.IndexFlatL2(xb.shape[1])
index.add(xb)

# Optionally save to disk
Path("index").mkdir(exist_ok=True)
faiss.write_index(index, "index/faiss.index")
# NOTE: only ever unpickle docs.pkl files you created yourself; loading an
# untrusted pickle can execute arbitrary code.
with open("index/docs.pkl", "wb") as f:
    pickle.dump(records, f)

print("✅ Index built and saved to ./index")


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)


Loaded 5 chunks. Embedding…


  0%|          | 0/1 [00:03<?, ?it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

## 5) Ask questions with RAG

In [None]:

def embed_query(q: str):
    """Embed the query string ``q`` with ``EMBED_MODEL``.

    Returns:
        A float32 NumPy array with a single row (shape ``(1, dim)``), the
        layout passed to ``index.search``.
    """
    response = client.embeddings.create(model=EMBED_MODEL, input=[q])
    embedding = response.data[0].embedding
    row = np.array(embedding, dtype="float32")
    return row.reshape(1, -1)

def retrieve(query: str, k=4):
    """Return the text of up to ``k`` indexed chunks closest to ``query``.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of chunks to return.

    Returns:
        List of chunk texts in the order reported by ``index.search``. It
        holds fewer than ``k`` items when the index has fewer vectors, and is
        empty when the index is empty or ``k`` is not positive.
    """
    # Never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    qv = embed_query(query)
    _distances, labels = index.search(qv, k)
    # FAISS pads missing results with label -1; without the filter that would
    # wrap around to records[-1] and return a bogus hit.
    return [records[i]["text"] for i in labels[0] if i >= 0]

def ask_llm(query: str, contexts):
    """Ask the chat model ``CHAT_MODEL`` to answer ``query`` given retrieved snippets.

    Args:
        query: The user's question.
        contexts: Iterable of context strings; each is labelled
            ``[Context N]`` (numbered from 1) in the prompt.

    Returns:
        The message content of the first completion choice.
    """
    numbered = []
    for position, snippet in enumerate(contexts, start=1):
        numbered.append(f"[Context {position}]\n{snippet}")
    context_block = "\n\n".join(numbered)

    system_prompt = (
        "You are the user's personal codex. "
        "Answer using the retrieved context when relevant; be specific and truthful."
    )
    user_prompt = f"Context:\n{context_block}\n\nQuestion: {query}"

    completion = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return completion.choices[0].message.content

# Status line printed once the RAG helper definitions in this cell have run.
print("✅ RAG helpers ready.")


### Try it

In [None]:

# Demo: retrieve the 4 closest chunks for one question, then let the chat
# model answer using them as context.
question = "What kind of engineer am I, and what projects am I proud of?"
ctx = retrieve(question, k=4)
answer = ask_llm(question, ctx)

print("---- ANSWER ----\n", answer)

# Show what the answer was grounded on; each snippet is cut to 700 characters.
print("\n---- CONTEXT SNIPPETS ----")
for number, snippet in enumerate(ctx, start=1):
    preview = snippet[:700]
    print(f"\n[Chunk {number}]\n{preview}...")
