# RAG Pipeline with Sentence Window Retrieval

## Environment Setup

In [None]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
!pip install -qU "langchain[openai]" # select chat model OpenAI
!pip install -qU langchain-openai # select embeddings model OpenAI
!pip install -qU langchain-community # select vector store FAISS
!pip install jq
!pip install faiss-cpu
!pip install llama-index
!pip install sentence-transformers
!pip install transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.4/142.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.3/423.3 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import getpass
import json
import os

import faiss

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import HTMLSectionSplitter

from llama_index.core import PromptTemplate
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.schema import Document as LlamaDocument
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

In [None]:
# Prompt for the OpenAI key only when the environment variable is missing or empty.
if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"]:
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

Enter API key for OpenAI: ··········


In [None]:
# Mount Google Drive at /content/drive; the test file, vector store, node index
# and output paths configured in the main cell all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Helpers

### Loading Documents

In [None]:
def metadata_fuc(record:dict,metadata:dict)->dict:
  """Copy the question, gold answer and title of one JSON record into a metadata dict.

  Args:
    record: One raw JSON record.
    metadata: Metadata dict to extend; it is modified in place.

  Returns:
    The same ``metadata`` object, now also holding ``question_text`` (``None``
    when the record has none), ``gold_answer`` (``""`` when missing) and
    ``Title`` (``"Untitled"`` when the record has no ``title``).
  """
  extracted_fields={
    "question_text": record.get("question_text"),
    "gold_answer": record.get("gold_answer",""),
    "Title": record.get("title", "Untitled"),
  }
  metadata.update(extracted_fields)
  return metadata

In [None]:
def load_documents(file_path):
  """Load the records of a JSON file as documents.

  A JSONLoader is configured with the jq schema ``.[]`` and ``document_text``
  as the content key; ``metadata_fuc`` attaches the question, gold answer and
  title of each record to the document's metadata.

  Args:
    file_path: Path of the JSON file to read.

  Returns:
    Whatever ``JSONLoader.load()`` returns for that file.
  """
  return JSONLoader(
    file_path=file_path,
    jq_schema=".[]",
    content_key="document_text",
    metadata_func=metadata_fuc,
  ).load()

### Level 1 Retriever Helpers

In [None]:
def retrieve_section(retriever,query,top_k):
  """Return the first ``top_k`` documents the retriever finds for ``query``.

  Args:
    retriever: Retriever object. Its ``invoke`` method is used when it has
      one; otherwise the legacy ``get_relevant_documents`` method is called.
    query: Query string handed to the retriever.
    top_k: Maximum number of documents to keep.

  Returns:
    The leading ``top_k`` documents in the order the retriever returned them,
    or ``None`` when the retriever returned nothing.
  """
  # get_relevant_documents is deprecated in LangChain in favour of invoke;
  # keep it only as a fallback for retriever-like objects without invoke.
  fetch=getattr(retriever,"invoke",None)
  if fetch is None:
    fetch=retriever.get_relevant_documents

  results=fetch(query)
  if not results:
    return None
  return results[:top_k]

In [None]:
def get_retrieve_section(vector_store,query,top_k):
  """Run a similarity search on ``vector_store`` and keep the best ``top_k`` hits.

  Args:
    vector_store: Vector store exposing ``as_retriever``.
    query: Query string.
    top_k: Number of sections to keep.

  Returns:
    The result of ``retrieve_section``: up to ``top_k`` documents, or ``None``
    when nothing was retrieved.
  """
  # Ask the store for 10 candidates as before, but never fewer than top_k;
  # a fixed k=10 silently capped the result whenever top_k exceeded 10.
  fetch_k=10 if top_k is None else max(top_k,10)
  retriever=vector_store.as_retriever(search_type="similarity",search_kwargs={"k":fetch_k})
  return retrieve_section(retriever,query,top_k)

### Level 2 Sentence Window Chunking Helpers

Reference: https://www.linkedin.com/pulse/sentence-window-retrieval-optimizing-llm-performance-rutam-bhagat-v24of/

**Indexing Nodes**

Split chunks into sentences with surrounding context (metadata) and store them in the vector database

In [None]:
def create_node_index(doc_to_node, window_size,folder_name):
  """Build a sentence-window vector index over the given documents and persist it.

  Args:
    doc_to_node: Iterable of documents exposing ``page_content`` and ``metadata``.
    window_size: Forwarded to ``SentenceWindowNodeParser.from_defaults``.
    folder_name: Directory the index's storage context is persisted to.

  Returns:
    The ``VectorStoreIndex`` built from the documents.

  Note:
    Every call overwrites the global llama-index ``Settings`` (``llm``,
    ``embed_model`` and ``node_parser``).
  """
  # Re-wrap each incoming document as a llama-index Document.
  llama_docs=[]
  for source_doc in doc_to_node:
    llama_docs.append(
      LlamaDocument(text=source_doc.page_content, metadata=source_doc.metadata)
    )

  # The parser stores the surrounding context under "window" and the sentence
  # itself under "original_text" in each node's metadata.
  sentence_parser=SentenceWindowNodeParser.from_defaults(
    window_size=window_size,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
  )

  # from_documents is called without explicit models or parser, so the global
  # Settings must be configured before the index is built.
  Settings.llm = OpenAI(model="gpt-4o", temperature=0.1)
  Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")
  Settings.node_parser = sentence_parser

  sentence_index = VectorStoreIndex.from_documents(llama_docs)
  sentence_index.storage_context.persist(persist_dir=folder_name)
  return sentence_index

**Query engine**  
The query engine uses two node postprocessors:  
1. Metadata Replacement Postprocessor: replaces the node text with the surrounding context stored in the metadata.
2. Sentence Transformer Re-ranker: re-orders the retrieved nodes by their relevance to the query.

In [None]:
def get_query_engine(top_n,top_k,rerank_model,node_index):
  """Create a query engine over ``node_index`` with window replacement and re-ranking.

  Args:
    top_n: ``top_n`` passed to the SentenceTransformerRerank postprocessor.
    top_k: ``similarity_top_k`` passed to the query engine.
    rerank_model: Model name passed to SentenceTransformerRerank.
    node_index: Index whose ``as_query_engine`` method builds the engine.

  Returns:
    The query engine returned by ``node_index.as_query_engine``.
  """
  # Order matters: the "window" metadata replacement is listed before the re-ranker.
  node_postprocessors=[
    MetadataReplacementPostProcessor(target_metadata_key="window"),
    SentenceTransformerRerank(top_n=top_n, model=rerank_model),
  ]
  return node_index.as_query_engine(
    similarity_top_k=top_k,
    node_postprocessors=node_postprocessors,
  )

**Customizing prompt**  
Instead of using the default prompt template, customize the generation prompt.

In [None]:
def create_customized_query_engine(top_n,top_k,rerank_model,node_index,prompt_str):
  """Build the sentence-window query engine and install a custom QA prompt on it.

  Args:
    top_n: Forwarded to ``get_query_engine``.
    top_k: Forwarded to ``get_query_engine``.
    rerank_model: Forwarded to ``get_query_engine``.
    node_index: Forwarded to ``get_query_engine``.
    prompt_str: Template text wrapped in a ``PromptTemplate``.

  Returns:
    The query engine, with its ``response_synthesizer:text_qa_template``
    prompt replaced by the custom template.
  """
  qa_template=PromptTemplate(prompt_str)
  engine=get_query_engine(top_n,top_k,rerank_model,node_index)
  prompt_overrides={"response_synthesizer:text_qa_template": qa_template}
  engine.update_prompts(prompt_overrides)
  return engine

### Other Helpers  
Build a map from questions (key) to their corresponding ground-truth answers (value).

In [None]:
def build_question_to_gold_answer_map(in_documents):
    """Map each document's stripped question text to its gold answer.

    Args:
        in_documents: Iterable of documents whose ``metadata`` dict may hold
            ``question_text`` and ``gold_answer``.

    Returns:
        Dict from stripped question text to gold answer. A missing or ``None``
        question is stored under the empty string, a missing gold answer is
        stored as ``{}``, and later documents overwrite earlier ones that
        share the same question.
    """
    question_to_gold = {}

    for doc in in_documents:
        # metadata_fuc stores None when a record has no question_text, so the
        # .get default alone does not protect .strip() from a None value.
        question = (doc.metadata.get("question_text") or "").strip()
        question_to_gold[question] = doc.metadata.get("gold_answer", {})

    return question_to_gold

## Main RAG end to end: Generation

In [None]:
# ---- Configuration ----
test_file_path="/content/drive/MyDrive/ECE1508/ECE1508_Project/Codes/gold_test_file_30.json"
L1_vector_path="/content/drive/MyDrive/ECE1508/L1_vector"
node_index_path="/content/drive/MyDrive/ECE1508/L2_nodes_test"
output_path="/content/drive/MyDrive/ECE1508/test_result_3.json"
rerank_model="BAAI/bge-reranker-base"
embedding_model=OpenAIEmbeddings()

top_k=3        # level-1 sections kept per query
window_size=3  # window_size given to SentenceWindowNodeParser
top_n=10       # nodes kept by the re-ranker
top_k_2=15     # similarity_top_k of the level-2 query engine

all_results=[]

# load test set
document = load_documents(test_file_path)
# load L1 vector store (Chroma needs the chromadb package to be installed).
# The directory variable defined above is L1_vector_path; the previously used
# name L1_vectorstore_path is not defined anywhere in this notebook.
L1_vectorstore = Chroma(
    persist_directory=L1_vector_path,
    embedding_function=embedding_model
)
# collect queries; metadata_fuc always creates the "question_text" key, so test
# its value (not its presence) to leave out records without a question.
test_questions=[
  eachDoc.metadata["question_text"]
  for eachDoc in document
  if eachDoc.metadata.get("question_text")
]
# collect gold answers
question_to_gold_map = build_question_to_gold_answer_map(document)

# customize generation prompt (using the same generation prompt as the proposition chunking)
prompt_str=(
    "Answer the question **directly and concisely** using only the provided context.\n"
    "- Do not repeat the question.\n"
    "- Do not include information not in the context.\n"
    "- If the answer is unclear or not found, say'I don't have the answer'.\n"
    "Question:{query_str}\n"
    "Relevant contents:{context_str}\n"
    "Answer: "
)

for query in test_questions:
  gold_answer = question_to_gold_map.get(query.strip(), "")
  # level 1 retrieval
  relevant_sections=get_retrieve_section(L1_vectorstore,query,top_k)
  if not relevant_sections:
    # retrieve_section returns None when nothing matches; record an empty
    # result so the output stays aligned with the test set instead of crashing.
    print(f"Question: {query}")
    print("No level-1 sections retrieved; skipping generation.")
    all_results.append({
      "input_question": query,
      "retrieved_contexts": [],
      "response": "",
      "gold_answer": gold_answer
    })
    continue
  # level 2 chunking
  node_index=create_node_index(relevant_sections,window_size,node_index_path)
  # level 2 query engine
  sentence_window_engine=create_customized_query_engine(top_n,top_k_2,rerank_model,node_index,prompt_str)
  # level 2 retrieval and generation
  response=sentence_window_engine.query(query)
  retrieved_contexts=[node.get_content() for node in response.source_nodes]
  print(f"Question: {query}")
  print(f"Response: {response.response}")
  for m,context in enumerate(retrieved_contexts):
    print(f"No. {m+1} Source Node: {context}")
  result={
    "input_question": query,
    "retrieved_contexts": retrieved_contexts,
    "response": response.response,
    "gold_answer": gold_answer
  }
  all_results.append(result)

with open(output_path, 'w', encoding='utf-8') as f:
  json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"\nResults saved to {output_path}")

Question: who sang the theme tune to absolutely fabulous
Response: I don't have the answer.
No. 1 Source Node: BBC America broadcast it in full .  Both channels aired the episode in a 40 - minute block to allow for commercial interruptions .      Absolutely Fabulous is ranked as the 17th greatest British TV show of all time by the British Film Institute .  A scene from the show was included in the 100 Greatest TV Moments programme broadcast by Channel 4 .  In 1997 , the pilot episode , `` Fashion '' , was ranked number 47 on TV Guide 's `` 100 Greatest Episodes of All - Time '' list .  In 2004 and 2007 , the series was ranked number 24 and number 29 on TV Guide 's Top Cult Shows Ever list .
No. 2 Source Node: Both channels aired the episode in a 40 - minute block to allow for commercial interruptions .      Absolutely Fabulous is ranked as the 17th greatest British TV show of all time by the British Film Institute .  A scene from the show was included in the 100 Greatest TV Moments pro