# Environment Setup

In [None]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
!pip install -qU "langchain[openai]" # select chat model OpenAI
!pip install -U langchain langchain-core langchain-community

In [None]:
!pip install -qU langchain-openai # select embeddings model OpenAI
!pip install -qU langchain-community # select vector store FAISS
!pip install jq
!pip install faiss-cpu

In [None]:
!pip install -qU "langchain-chroma>=0.1.2"

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [None]:
from openai import OpenAI
import json
import faiss
import re
import getpass
import os

In [None]:

from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import HTMLSectionSplitter
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from langchain.chains import create_extraction_chain_pydantic
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
from typing import Optional, List


In [None]:
# Ask for the OpenAI API key interactively only when the environment does not
# already hold a non-empty OPENAI_API_KEY; getpass keeps the key off the screen.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

Enter API key for OpenAI: ··········


In [None]:
# Colab only: mount Google Drive and change into the project folder so that the
# relative file and folder names used later in the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

%cd  /content/drive/MyDrive/ECE1508_Project/Codes

Mounted at /content/drive
/content/drive/MyDrive/ECE1508_Project/Codes


# Helpers

## Load Test Doc

In [None]:
def metadata_fuc(example: dict, _: dict) -> dict:
    """Build the metadata dict stored with each loaded record.

    Args:
        example: One JSON record handed over by the loader.
        _: Second positional argument supplied by the loader; not used here.

    Returns:
        A dict with ``question_text`` (``None`` when absent), ``Title``
        (``"Untitled"`` when absent) and ``gold_answer`` (``""`` when absent).
    """
    question = example.get("question_text")
    title = example.get("title", "Untitled")
    gold = example.get("gold_answer", "")
    return {"question_text": question, "Title": title, "gold_answer": gold}

In [None]:
def load_documents(file_path):
  """Load the JSON test file at ``file_path`` into a list of documents.

  Every element selected by the ``.[]`` jq schema is loaded with its
  ``document_text`` field as content and ``metadata_fuc`` supplying metadata.
  """
  return JSONLoader(
    file_path=file_path,
    jq_schema=".[]",
    content_key="document_text",
    metadata_func=metadata_fuc,
  ).load()

## Vector Store Helpers

In [None]:
def create_faiss_vec_store(elemnts_to_emb, folder_name):
  """Embed the given documents into a FAISS store, save it, and return it.

  Args:
    elemnts_to_emb: Documents to embed with ``OpenAIEmbeddings``.
    folder_name: Local folder the store is saved to via ``save_local``.
  """
  embedder = OpenAIEmbeddings()
  store = FAISS.from_documents(elemnts_to_emb, embedding=embedder)
  store.save_local(folder_name)
  return store

## Retriever Helpers

In [None]:
# Retrieve the top-k documents for a query
def retrieve_section(in_retriever,query,top_k):
  """Return at most ``top_k`` documents that ``in_retriever`` yields for ``query``.

  Args:
    in_retriever: Retriever exposing ``invoke(query)``; objects that only
      offer the older ``get_relevant_documents(query)`` are still accepted.
    query: Query string passed to the retriever.
    top_k: Maximum number of results to keep, taken from the front.

  Returns:
    The first ``top_k`` results, or ``None`` when the retriever returns
    nothing.
  """
  # `invoke` is the current retriever entry point; `get_relevant_documents`
  # is deprecated, so it is used only as a fallback for older retrievers.
  if hasattr(in_retriever, "invoke"):
    results = in_retriever.invoke(query)
  else:
    results = in_retriever.get_relevant_documents(query)

  if not results:
    return None
  return results[:top_k]


In [None]:
#Run the retriever for the input query
def get_retrieve_section(in_retriever,in_query,top_k):
  """Forward the arguments to ``retrieve_section`` and return its result unchanged."""
  return retrieve_section(in_retriever, in_query, top_k)


## Level 1 Helpers

**Document Chunking**

In [None]:
def get_element_chunk(split_header_list,doc_to_chunk):
  """Split one document's content into sections at the given headers.

  Args:
    split_header_list: Passed to ``HTMLSectionSplitter`` as
      ``headers_to_split_on``.
    doc_to_chunk: Source document; its ``page_content`` and ``metadata``
      are copied into a temporary ``Document`` before splitting.

  Returns:
    Whatever ``HTMLSectionSplitter.split_documents`` produces for that
    single wrapped document.
  """
  # Re-wrap the content and metadata in a fresh Document for the splitter.
  wrapped = Document(
    page_content=doc_to_chunk.page_content,
    metadata=doc_to_chunk.metadata,
  )
  splitter = HTMLSectionSplitter(headers_to_split_on=split_header_list)
  return splitter.split_documents([wrapped])


## L2 Proposition Helpers

Reference: https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb


The proposition prompt comes from: https://smith.langchain.com/hub/wfh/proposal-indexing?organizationId=50995362-9ea0-4378-ad97-b4edae2f9f22


### Setup proposition models

In [None]:
class Sentences(BaseModel):
    """Structured output of the proposition step: a list of proposition strings."""
    sentences: List[str]

# Parses the model reply into a Sentences instance, so the reply must be a JSON
# object with a "sentences" key.
parser = PydanticOutputParser(pydantic_object=Sentences)

# Proposition prompt. The empty-result instruction at the end asks for the same
# {"sentences": [...]} object as the normal case: a bare [] cannot be validated
# as a Sentences object. Braces are doubled so the template keeps them literal.
prompt = PromptTemplate.from_template("""
Decompose the Passage below into clear and simple propositions, ensuring they are interpretable out of
context.
Note:  **If you think the input cannot be break down into proposition, provide an empty return list, don't provide an error output**
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input
whenever possible.
2. For any named entity that is accompanied by additional descriptive information, separate this
information into its own distinct proposition.
3. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences
and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the
entities they refer to.
4. Present the results as a list of strings, formatted in JSON.


Passage:
{input}

Example:

Input: Title: ¯Eostre. Section: Theories and interpretations, Connection to Easter Hares. Content:
The earliest evidence for the Easter Hare (Osterhase) was recorded in south-west Germany in
1678 by the professor of medicine Georg Franck von Franckenau, but it remained unknown in
other parts of Germany until the 18th century. Scholar Richard Sermon writes that "hares were
frequently seen in gardens in spring, and thus may have served as a convenient explanation for the
origin of the colored eggs hidden there for children. Alternatively, there is a European tradition
that hares laid eggs, since a hare’s scratch or form and a lapwing’s nest look very similar, and
both occur on grassland and are first seen in the spring. In the nineteenth century the influence
of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular throughout Europe.
German immigrants then exported the custom to Britain and America where it evolved into the
Easter Bunny."
Output: [ "The earliest evidence for the Easter Hare was recorded in south-west Germany in
1678 by Georg Franck von Franckenau.", "Georg Franck von Franckenau was a professor of
medicine.", "The evidence for the Easter Hare remained unknown in other parts of Germany until
the 18th century.", "Richard Sermon was a scholar.", "Richard Sermon writes a hypothesis about
the possible explanation for the connection between hares and the tradition during Easter", "Hares
were frequently seen in gardens in spring.", "Hares may have served as a convenient explanation
for the origin of the colored eggs hidden in gardens for children.", "There is a European tradition
that hares laid eggs.", "A hare’s scratch or form and a lapwing’s nest look very similar.", "Both
hares and lapwing’s nests occur on grassland and are first seen in the spring.", "In the nineteenth
century the influence of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular
throughout Europe.", "German immigrants exported the custom of the Easter Hare/Rabbit to
Britain and America.", "The custom of the Easter Hare/Rabbit evolved into the Easter Bunny in
Britain and America."]

Return the result in the following JSON format :
{{"sentences": ["sentence1", "sentence2", ...]}}

Important: If you cannot extract any propositions from the input, return exactly this: {{"sentences": []}}. Do not return any explanation, message, or error. You must always return a valid JSON object in the format above — even if its "sentences" list is empty

""")



In [None]:
#Wrap every proposition sentence in its own document
def get_new_prop_doc(relevant_sections, prop_results):
  """Turn proposition results into one ``Document`` per sentence.

  Sections and results are paired positionally with ``zip``; every sentence
  of a result becomes a document that reuses the metadata object of the
  section it was paired with.

  Args:
    relevant_sections: Source documents, each exposing ``metadata``.
    prop_results: Parsed results, each exposing a ``sentences`` list.

  Returns:
    A flat list of documents in section order, then sentence order.
  """
  return [
    Document(page_content=sentence, metadata=section.metadata)
    for section, parsed in zip(relevant_sections, prop_results)
    for sentence in parsed.sentences
  ]

# Main RAG end to end: Generation


In [None]:
#Vector store folder paths
#Recheck the working directory contents before relying on relative paths
!ls
L1_vector_folder = 'L1_vector_test'
L2_vector_folder = 'L2_vector_prop'

 Evaluation.ipynb	        L1_vector_test_2	     rag_sw_ver2.ipynb
 gold_test_file_30.json         L2_vector_prop		     run_results_proposition.json
'L1_Process_Chunk&Save.ipynb'   Proposition_Complete.ipynb   test_single_doc.json
 L1_vector		        Proposition_Light.ipynb
 L1_vector_test		        Proposition_Sample.ipynb


In [None]:
# Load the test set from the working directory; each loaded document carries
# its question text, title and gold answer in metadata (see metadata_fuc).
file_path="gold_test_file_30.json"
test_documents = load_documents(file_path)

## Step 1: Load L1 local vectorstore

In [None]:
# Embedding model handed to the L1 vector store.
# NOTE(review): OpenAIEmbeddings is imported twice above (langchain_openai, then
# langchain.embeddings); when the cells run in order the later import is the one
# bound here — confirm that is intended.
embeddings = OpenAIEmbeddings()

In [None]:
# Open the Chroma store whose persist directory is L1_vector_folder, with
# `embeddings` as its embedding function.
# NOTE(review): presumably this folder was populated by a separate notebook — confirm.
L1_vectorstore = Chroma(
    persist_directory=L1_vector_folder,
    embedding_function=embeddings
)

In [None]:
#Sanity-check that the locally loaded store actually contains documents
total_docs = L1_vectorstore._collection.count()
print(f"Vectorstore contains {total_docs} documents" if total_docs > 0 else "Vectorstore is empty")

Vectorstore contains 1496 documents


## Step 2: When the user inputs a query, start L1 Retrieval + L2 Chunking

In [None]:
#Collect the questions from all loaded documents
# metadata_fuc always writes a "question_text" key (None when the record has
# none), so a key-presence check never filters anything. Filter on the value
# so that missing or empty questions are skipped.
test_questions = [
    eachDoc.metadata["question_text"]
    for eachDoc in test_documents
    if eachDoc.metadata.get("question_text")
]

print(test_questions)

['what episode does olivia die in the walking dead', 'actor playing krishna in mahabharat on star plus', 'is the game show the chase still on tv', 'when was figure skating introduced to the olympics', 'setting of the ones who walk away from omelas', 'who does the voice of angela on family guy', 'how many episodes of gossip girl is there', 'who plays amy on the secret life of an american teenager', 'where is the roman forum located in rome', 'where is archangel raphael mentioned in the bible', 'who plays chuck on the tv show chuck', "who won the women's world cup championship in 2017", 'what is the southern most point in canada', 'how many episodes will prison break season 5 have', 'what does c class stand for in mercedes benz', 'how many years did it take to build the colosseum in rome', 'who is the present governor of puerto rico', 'when does episode 131 come out for dragon ball super', "what's the hottest natural pepper in the world", 'list of submissions to the 87th academy awards f

In [None]:
#Set L1 retriever
# MMR search configured with k=5. retrieve_section slices the results to its own
# top_k afterwards, so a top_k above 5 cannot return more documents.
L1_retriever=L1_vectorstore.as_retriever(search_type="mmr",search_kwargs={"k":5})

In [None]:
#L2 Model setup
# NOTE(review): `obj` is not referenced anywhere else in the visible notebook;
# the chain below uses the locally defined `prompt` instead — confirm whether
# this hub pull is still needed.
obj = hub.pull("wfh/proposal-indexing")
llm = ChatOpenAI(model='gpt-3.5-turbo', openai_api_key =  os.environ["OPENAI_API_KEY"])
#gpt-4-turbo
#gpt-3.5-turbo
# Proposition chain: fill the prompt, call the chat model, then parse the reply
# with `parser` (a PydanticOutputParser for Sentences).
L2_runnable = prompt | llm | parser



In [None]:
# Run the L1 + L2 pipeline for one query and print what it retrieved
def get_L2_retrival_result(in_query,top_k,result_top_k):
  """Run ``L1_Retrival_L2_Complete`` for a query, log the outcome, and return it.

  Args:
    in_query: The user question.
    top_k: Number of L1 sections to keep.
    result_top_k: Number of L2 propositions to keep.

  Returns:
    The pipeline result unchanged: a list of proposition strings, or the
    string ``'No Found'`` when the pipeline found nothing.
  """
  L2_result = L1_Retrival_L2_Complete(in_query,top_k,result_top_k)
  # The pipeline signals "nothing found" with a plain string; taking len() of
  # that string would misreport its character count as a section count.
  retrieved_count = 0 if isinstance(L2_result, str) else len(L2_result)
  print(f"--L2 Retrieved {retrieved_count} relavant sections")
  #Debug
  print(L2_result)
  return L2_result

In [None]:
#The main function that completes L1 retrieval + L2 chunking & retrieval
def L1_Retrival_L2_Complete(in_query,top_k,result_top_k):
  """Retrieve L1 sections, split them into propositions, and retrieve again.

  Steps:
    1. Fetch up to ``top_k`` sections for the query from ``L1_retriever``.
    2. Run ``L2_runnable`` on each section's text to get propositions.
    3. Embed the propositions into a FAISS store saved under
       ``L2_vector_folder`` and query it for the ``result_top_k`` closest.

  Args:
    in_query: The user question.
    top_k: Number of L1 sections to process.
    result_top_k: Number of propositions to return.

  Returns:
    A list of proposition strings, or the string ``'No Found'`` when L1
    retrieval, proposition extraction, or L2 retrieval yields nothing.
  """
  #Step 1: Get L1 relevant sections based on the input question
  L1_relevant_sections = retrieve_section(L1_retriever,in_query,top_k)
  # retrieve_section returns None when nothing matches; stop before len().
  if not L1_relevant_sections:
    print("--- L1 Retrieved 0 relavant sections")
    return 'No Found'
  print(f"--- L1 Retrieved {len(L1_relevant_sections)} relavant sections")

  # Step 2: Start L2 based on the L1 result
  #Run Proposition Model
  print("Running Proposition...")
  L2_prop_results = [
    L2_runnable.invoke({"input": section.page_content})
    for section in L1_relevant_sections
  ]
  proposition_docs = get_new_prop_doc(L1_relevant_sections, L2_prop_results)
  print(f"{len(proposition_docs)} proposition docs to be embeded")

  # A single proposition is still a usable result; only bail out when there
  # are none at all.
  if not proposition_docs:
    return 'No Found'

  #Save L2 result to vector database
  L2_vectorstore = create_faiss_vec_store(proposition_docs,L2_vector_folder)
  L2_retriever = L2_vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":result_top_k})

  #L2 Retrieval
  L2_relevant_sections = retrieve_section(L2_retriever,in_query,result_top_k)
  if not L2_relevant_sections:
    return 'No Found'

  return [eachL2Doc.page_content for eachL2Doc in L2_relevant_sections]


In [None]:
def get_API_response(client,sys_prompt,user_prompt,temp,topp,model="gpt-4o"):
  """Send one system + user message pair to a chat model and return the reply text.

  Args:
    client: Object exposing ``chat.completions.create(...)``.
    sys_prompt: Content of the system message.
    user_prompt: Content of the user message.
    temp: Value passed as ``temperature``.
    topp: Value passed as ``top_p``.
    model: Model name to request; defaults to ``"gpt-4o"``, the value that
      was previously hard-coded.

  Returns:
    ``message.content`` of the first choice in the completion.
  """
  completion = client.chat.completions.create(
      model=model,
      temperature=temp,
      top_p=topp,
      messages=[
          {"role":"system","content":sys_prompt},
          {"role":"user","content":user_prompt}
      ],
  )
  return completion.choices[0].message.content

In [None]:
def build_question_to_gold_answer_map(in_documents):
    """Map each document's stripped question text to its gold answer.

    Args:
        in_documents: Iterable of objects exposing a ``metadata`` dict.

    Returns:
        A dict keyed by the stripped ``question_text`` (``""`` when the
        question is missing or ``None``) with the ``gold_answer`` metadata
        value (``{}`` when absent). Later documents overwrite earlier ones
        that share the same question.
    """
    question_to_gold = {}

    for doc in in_documents:
        # The key may be present with a None value, in which case the default
        # of .get() is not used; `or ""` keeps .strip() from failing on None.
        question = (doc.metadata.get("question_text") or "").strip()
        question_to_gold[question] = doc.metadata.get("gold_answer", {})

    return question_to_gold


In [None]:
def RAG_generation(questions,top_k,result_top_k,question_to_gold_map,mode='test'):
  """Answer each question from retrieved context and collect the results.

  For every question this retrieves context via ``get_L2_retrival_result``,
  asks the chat model through ``get_API_response``, prints the answer, and
  records the question, context, response and gold answer.

  Args:
    questions: Sequence of question strings.
    top_k: Number of L1 sections to retrieve per question.
    result_top_k: Number of L2 propositions to keep per question.
    question_to_gold_map: Dict from stripped question text to gold answer.
    mode: ``'test'`` stops after the first question; any other value
      processes all of them.

  Returns:
    A list with one result dict per processed question.
  """
  client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
  sampling_temperature, nucleus_p = 1.0, 1.0
  single_question_only = mode == 'test'

  collected = []

  for position, question in enumerate(questions, start=1):
    print(f"============={position} Question:{question}=============")

    context = get_L2_retrival_result(question, top_k, result_top_k)
    user_prompt=f"""
    Answer the question based on the relevent contents. If you don't know the answer, say 'I don't have the answer'
    Question: {question}
    Relevant contents:{context}
    """
    answer = get_API_response(client, "", user_prompt, sampling_temperature, nucleus_p)
    print('-------Final Answer:-------------')
    print(f"Question: {question}\n Response: {answer}")

    # Record everything the evaluation step needs for this question.
    collected.append({
      "input_question": question,
      "retrieved_contexts": context,
      "response": answer,
      "gold_answer": question_to_gold_map.get(question.strip(), ""),
    })

    if single_question_only:
      break

  return collected


In [None]:
def save_test_output(run_results, output_path="./evaluation/run_results_proposition_.json"):
  """Write the run results to a JSON file and print a confirmation.

  Args:
    run_results: JSON-serialisable list of result records.
    output_path: Destination file; defaults to the path that was previously
      hard-coded. Missing parent directories are created.
  """
  # open(..., "w") does not create missing directories, so make sure the
  # target folder exists first.
  parent_dir = os.path.dirname(output_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  with open(output_path, "w", encoding="utf-8") as f:
      json.dump(run_results, f, indent=4, ensure_ascii=False)

  print(f"Saved {len(run_results)} results to {output_path}")

In [None]:
# L1_top_k: how many L1 sections are sent through proposition extraction.
# result_top_k: how many propositions are kept as the final context.
L1_top_k =3
result_top_k =5
question_to_gold_map = build_question_to_gold_answer_map(test_documents)
# mode='test' makes RAG_generation stop after the first question.
run_results = RAG_generation(test_questions,L1_top_k,result_top_k,question_to_gold_map,mode='test')

--- L1 Retrieved 3 relavant sections
Running Proposition...
26 proposition docs to be embeded
--L2 Retrieved 5 relavant sections
['Olivia is portrayed by Ann Mahoney on The Walking Dead television series.', 'Olivia is threatened with death by Negan in the episode "Service" when the Saviors find out that two of the guns are missing from the armory.', 'Olivia is the main caretaker of Judith Grimes in this episode.', 'Olivia is a member of the supporting cast for the seventh season.', 'Arat pulls out her gun, turns around, and shoots Olivia in the face, killing her.']
-------Final Answer:-------------
Question: what episode does olivia die in the walking dead
 Response: I don't have the answer.


In [None]:
# Write the collected results to the JSON path chosen inside save_test_output.
save_test_output(run_results)

Saved 1 results to run_results_proposition.json


**Note**: When run with GPT-4, the response is pretty accurate.
With L1_top_k = 3 and result_top_k = 5, the first example yields 12 propositions, the cost is around $0.04, and the result is accurate.