In [2]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv

# Load environment variables before any cell reads them via os.getenv (see the
# MY_OPENAI_API_KEY lookup further down).
# NOTE(review): presumably reads a local .env file — confirm where it lives.
load_dotenv()

True

In [4]:
def load_all_pdfs(folder="knowledge_base"):
    """Load every PDF file directly inside ``folder`` into one flat list.

    Args:
        folder: Directory to scan (not recursive). A relative path is
            resolved against the current working directory.

    Returns:
        A list with the documents returned by ``PyMuPDFLoader(path).load()``
        for each PDF, concatenated in sorted filename order.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder`` does not exist
            or cannot be read.
    """
    all_docs = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting document (and therefore chunk/index) order reproducible.
    for name in sorted(os.listdir(folder)):
        pdf_path = os.path.join(folder, name)
        # Case-insensitive match so "Report.PDF" is not silently skipped.
        if not name.lower().endswith(".pdf"):
            continue
        # A directory that happens to be called "something.pdf" is not a PDF.
        if not os.path.isfile(pdf_path):
            continue
        print("Loading:", pdf_path)
        all_docs.extend(PyMuPDFLoader(pdf_path).load())
    return all_docs


In [None]:
# Load all PDFs from the knowledge_base folder one level above the current
# working directory; `docs` is consumed by the chunking cell further down.
docs = load_all_pdfs("../knowledge_base")

Loading: ../knowledge_base\Amazon-2024-Annual-Report.pdf
Loading: ../knowledge_base\Apple-2024-Annual-Report.pdf
Loading: ../knowledge_base\Meta-2024-Annual-Report.pdf
Loading: ../knowledge_base\Microsoft-2024-Annual-Report.pdf
Loading: ../knowledge_base\NVIDIA-2024-Annual-Report.pdf


In [11]:
# Chunking parameters, named so they can be tuned in one place.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) — confirm against the LangChain documentation.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

# Module-level splitter shared by chunk_docs().
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

def chunk_docs(docs):
    """Split ``docs`` with the module-level ``splitter``.

    Args:
        docs: Documents to split; passed unchanged to
            ``splitter.split_documents``.

    Returns:
        Whatever ``splitter.split_documents`` returns for ``docs``.
    """
    chunked = splitter.split_documents(docs)
    return chunked

In [13]:
# Shared embedding client; build_faiss() reads this module-level name.
# The API key is taken from the MY_OPENAI_API_KEY environment variable.
# os.getenv returns None when the variable is unset, so a missing key is not
# detected on this line.
# NOTE(review): presumably the variable is populated by load_dotenv() — confirm.
embedding = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key=os.getenv("MY_OPENAI_API_KEY")
)

In [14]:
def build_faiss(chunks, save_dir="faiss_index"):
    """Embed ``chunks`` into a FAISS vector store and persist it to disk.

    Uses the module-level ``embedding`` object as the embedding model.

    Args:
        chunks: Iterable of documents to index. It is materialised into a
            list so that generators are supported and emptiness can be checked.
        save_dir: Directory passed to ``vectorstore.save_local``. A relative
            path is resolved against the current working directory.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``chunks`` is empty. Raised before any embedding call
            or disk write, rather than letting the library fail on empty
            input with a less helpful error.
    """
    chunks = list(chunks)
    if not chunks:
        raise ValueError(
            "build_faiss() received no chunks; nothing to embed or index."
        )

    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embedding
    )
    vectorstore.save_local(save_dir)
    print("FAISS index saved at:", save_dir)
    return vectorstore


In [None]:
# Split the loaded documents into chunks; requires `docs` from the load cell.
chunks = chunk_docs(docs)

# Embed the chunks and write the index to ../faiss_index.
# NOTE(review): the recorded output for this cell reports "faiss_index", not
# "../faiss_index", so it appears to come from an earlier run with a different
# argument — re-run the notebook top to bottom to refresh it.
vectorstore = build_faiss(chunks, "../faiss_index")

print("Ingestion complete. Chunks:", len(chunks))

FAISS index saved at: faiss_index
Ingestion complete. Chunks: 2832
