In [9]:
import os
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from dotenv import load_dotenv
from langchain_community.retrievers import BM25Retriever

# Load variables from a local .env file into the process environment.
# Presumably this is what supplies MY_OPENAI_API_KEY, which is read via os.getenv
# when the embedding client is built — confirm against the project's .env.
load_dotenv()

True

In [2]:
# Embedding client used when loading and querying the FAISS index.
# The API key is taken from the MY_OPENAI_API_KEY environment variable, never hardcoded.
# NOTE(review): the model presumably has to match the one used to build the index — confirm.
embedding = OpenAIEmbeddings(
    openai_api_key=os.environ.get("MY_OPENAI_API_KEY"),
    model="text-embedding-3-large",
)


In [3]:
def load_faiss_index(index_path="faiss_index"):
    """Load a FAISS vector store saved on disk and return it.

    Args:
        index_path: Directory holding the saved index. Defaults to "faiss_index".

    Returns:
        The object returned by ``FAISS.load_local``, wired to the module-level
        ``embedding`` client.
    """
    # NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
    # the saved store (pickle-based per the LangChain docs). Only point this at
    # index directories you created yourself or otherwise trust.
    loaded_store = FAISS.load_local(
        index_path,
        allow_dangerous_deserialization=True,
        embeddings=embedding,
    )
    print("FAISS index loaded.")
    return loaded_store

In [5]:
# The index directory sits one level above the notebook's working directory.
vectorstore = load_faiss_index(index_path="../faiss_index")

FAISS index loaded.


In [10]:
# Collect every stored document so the keyword retriever can be built over the
# same corpus as the vector index.
# NOTE(review): this reads the docstore's private `_dict` attribute, which may
# change between library versions.
all_docs = [*vectorstore.docstore._dict.values()]

In [12]:
# Keyword (BM25) retriever over the same documents, returning up to 10 results per query.
bm25 = BM25Retriever.from_documents(all_docs, k=10)

In [19]:
def semantic_retrieve(query, k=10):
    """Run a similarity search on the module-level ``vectorstore``.

    Args:
        query: Query text to search for.
        k: Number of results requested from the vector store. Defaults to 10.

    Returns:
        Whatever ``vectorstore.similarity_search`` returns for this query.
    """
    hits = vectorstore.similarity_search(query, k=k)
    return hits

In [17]:
def retrieve_bm25_and_semantic(query, k_bm25=10, k_sem=10):
    """Run the keyword (BM25) and semantic retrievers for one query.

    Args:
        query: Query text passed to both retrievers.
        k_bm25: Number of BM25 results to request. Defaults to 10.
        k_sem: Number of semantic results to request. Defaults to 10.

    Returns:
        A dict with two keys: "bm25" holds the keyword results and "semantic"
        holds the vector-search results.
    """
    # 1) keyword results
    # The retriever takes its result count from its `k` attribute rather than
    # from invoke(), so apply k_bm25 for this call only and then restore the
    # previous value to leave the shared retriever unchanged.
    previous_k = bm25.k
    bm25.k = k_bm25
    try:
        bm25_docs = bm25.invoke(query)
    finally:
        bm25.k = previous_k

    # 2) semantic vector results
    semantic_docs = semantic_retrieve(query, k=k_sem)

    print(f"BM25 retrieved: {len(bm25_docs)} chunks")
    print(f"Semantic retrieved: {len(semantic_docs)} chunks")

    return {
        "bm25": bm25_docs,
        "semantic": semantic_docs
    }


In [13]:
def show(results):
    """Print a short preview of each retrieved chunk.

    For every document this prints a numbered header, its "source" and "page"
    metadata, and the first 300 characters of its content followed by "...".
    Documents missing either metadata key print "N/A" for it instead of
    raising KeyError.

    Args:
        results: Iterable of objects exposing ``metadata`` (a mapping) and
            ``page_content`` (a string).
    """
    for position, doc in enumerate(results, start=1):
        # Tolerate documents with no metadata at all.
        metadata = doc.metadata or {}
        print(f"\n----- Chunk {position} -----")
        print("Source:", metadata.get("source", "N/A"))
        print("Page:", metadata.get("page", "N/A"))
        print(doc.page_content[:300], "...")


In [20]:
# Example query: run both retrievers and keep each result list under its own name.
query = "Explain Microsoft's cloud revenue growth in 2024."
results = retrieve_bm25_and_semantic(query)

bm25_results, semantic_results = results["bm25"], results["semantic"]


BM25 retrieved: 10 chunks
Semantic retrieved: 10 chunks


In [21]:
# NOTE(review): this cell repeats the previous cell verbatim (same query, same
# variables); consider deleting one of the two copies.
query = "Explain Microsoft's cloud revenue growth in 2024."
results = retrieve_bm25_and_semantic(query)

bm25_results, semantic_results = results["bm25"], results["semantic"]

BM25 retrieved: 10 chunks
Semantic retrieved: 10 chunks


In [22]:
# Preview the keyword hits first, then the semantic hits.
for hits in (bm25_results, semantic_results):
    show(hits)


----- Chunk 1 -----
Source: ../knowledge_base\Microsoft-2024-Annual-Report.pdf
Page: 26
supporting and investing in our cloud-based services, including datacenter operations; designing, manufacturing, marketing, 
and selling our other products and services; and income taxes.  
Highlights from fiscal year 2024 compared with fiscal year 2023 included:  
• 
Microsoft Cloud revenue increas ...

----- Chunk 2 -----
Source: ../knowledge_base\Microsoft-2024-Annual-Report.pdf
Page: 32
32 
Intelligent Cloud  
Revenue increased $17.5 billion or 20%.  
• 
Server products and cloud services revenue increased $17.8 billion or 22% driven by Azure and other cloud 
services. Azure and other cloud services revenue grew 30% driven by growth in our consumption-based 
services. Server produc ...

----- Chunk 3 -----
Source: ../knowledge_base\Microsoft-2024-Annual-Report.pdf
Page: 31
37,884   
31%  
More Personal Computing 
 
19,309   
16,450   
17%  
  
  
  
Total 
$ 
109,433  $ 
88,523   
24%  
 
  
  