In [1]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
def parse_metadata_from_filename(filename):
    # Example: "Microsoft-2024-Annual-Report.pdf"
    name = filename.replace(".pdf", "")
    parts = name.split("-")

    company = parts[0]                # Microsoft
    year = int(parts[1])              # 2024
    doctype = "-".join(parts[2:])     # Annual-Report

    return company, year, doctype


def load_all_pdfs(folder="../knowledge_base"):
    all_docs = []

    for file in os.listdir(folder):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(folder, file)
            print("Loading:", pdf_path)

            # Extract structured metadata from filename
            company, year, doctype = parse_metadata_from_filename(file)

            # Load PDF pages as documents
            loader = PyMuPDFLoader(pdf_path)
            docs = loader.load()

            for d in docs:
                # ‚ùó ERASE ANY EXISTING METADATA
                d.metadata = {}

                # Inject your clean, structured metadata
                d.metadata["source"] = file
                d.metadata["company"] = company
                d.metadata["year"] = year
                d.metadata["doctype"] = doctype
                d.metadata["page"] = d.metadata.get("page", None)   # preserve original page if needed

            all_docs.extend(docs)

    return all_docs


In [3]:
docs = load_all_pdfs("../knowledge_base")

Loading: ../knowledge_base\Amazon-2024-Annual-Report.pdf
Loading: ../knowledge_base\Apple-2024-Annual-Report.pdf
Loading: ../knowledge_base\Meta-2024-Annual-Report.pdf
Loading: ../knowledge_base\Microsoft-2024-Annual-Report.pdf
Loading: ../knowledge_base\NVIDIA-2024-Annual-Report.pdf


In [4]:
for i, doc in enumerate(docs[5:]):
    print(f"--- Document {i} ---")
    print(doc.metadata)
    print()

--- Document 0 ---
{'source': 'Amazon-2024-Annual-Report.pdf', 'company': 'Amazon', 'year': 2024, 'doctype': 'Annual-Report', 'page': None}

--- Document 1 ---
{'source': 'Amazon-2024-Annual-Report.pdf', 'company': 'Amazon', 'year': 2024, 'doctype': 'Annual-Report', 'page': None}

--- Document 2 ---
{'source': 'Amazon-2024-Annual-Report.pdf', 'company': 'Amazon', 'year': 2024, 'doctype': 'Annual-Report', 'page': None}

--- Document 3 ---
{'source': 'Amazon-2024-Annual-Report.pdf', 'company': 'Amazon', 'year': 2024, 'doctype': 'Annual-Report', 'page': None}

--- Document 4 ---
{'source': 'Amazon-2024-Annual-Report.pdf', 'company': 'Amazon', 'year': 2024, 'doctype': 'Annual-Report', 'page': None}

--- Document 5 ---
{'source': 'Amazon-2024-Annual-Report.pdf', 'company': 'Amazon', 'year': 2024, 'doctype': 'Annual-Report', 'page': None}

--- Document 6 ---
{'source': 'Amazon-2024-Annual-Report.pdf', 'company': 'Amazon', 'year': 2024, 'doctype': 'Annual-Report', 'page': None}

--- Document 

In [5]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)

def chunk_docs(docs):
    return splitter.split_documents(docs)

In [6]:
embedding = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key=os.getenv("MY_OPENAI_API_KEY")
)

In [7]:
def build_faiss(chunks, save_dir="faiss_index"):
    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embedding
    )
    vectorstore.save_local(save_dir)
    print("FAISS index saved at:", save_dir)
    return vectorstore


In [8]:
chunks = chunk_docs(docs)

vectorstore = build_faiss(chunks, "../faiss_index")

print("Ingestion complete. Chunks:", len(chunks))

FAISS index saved at: ../faiss_index
Ingestion complete. Chunks: 2832
