# Environment Setup

In [None]:
!pip install -U "langchain[openai]" langchain-core langgraph langchain-text-splitters langchain_community

In [None]:
!pip install faiss-cpu jq

In [None]:
!pip install -qU "langchain-chroma>=0.1.2"

In [84]:
# External libraries
import json
import faiss
import re
import getpass
import os
from datetime import datetime
import time
import ast

In [117]:
# OpenAI SDK
from openai import OpenAI

# Langchain imports
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import HTMLSectionSplitter
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain, create_extraction_chain_pydantic
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)

# Pydantic + Typing
from pydantic import BaseModel
from typing import Optional, List


In [15]:
# Ask for the OpenAI API key interactively only when the environment does not
# already provide one, so the key never has to be written into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

Enter API key for OpenAI: ··········


In [16]:
# Colab-only setup: mount Google Drive and change into the project folder so the
# relative paths used later in the notebook resolve against it.
from google.colab import drive
drive.mount('/content/drive')

%cd  /content/drive/MyDrive/ECE1508_Project/Codes

Mounted at /content/drive
/content/drive/MyDrive/ECE1508_Project/Codes


# Helpers

## Load Test Doc

In [17]:
def metadata_fuc(example: dict, _: dict) -> dict:
    """Build the metadata stored alongside one loaded record.

    Args:
        example: The raw record; only its "question_text", "title" and
            "gold_answer" entries are read.
        _: Second positional argument supplied by the caller; ignored.

    Returns:
        A dict with the keys "question_text" (None when absent), "Title"
        ("Untitled" when absent) and "gold_answer" ("" when absent).
    """
    question = example.get("question_text")
    title = example.get("title", "Untitled")
    gold = example.get("gold_answer", "")
    return dict(question_text=question, Title=title, gold_answer=gold)

In [18]:
def load_documents(file_path):
  """Load the records of a JSON file as documents.

  A JSONLoader is configured with the jq schema ".[]", takes each document's
  content from the "document_text" key and builds its metadata with
  `metadata_fuc`.

  Args:
    file_path: Path of the JSON file to load.

  Returns:
    Whatever `JSONLoader.load()` returns for that file.
  """
  return JSONLoader(
    file_path=file_path,
    jq_schema=".[]",
    content_key="document_text",
    metadata_func=metadata_fuc,
  ).load()

## Vector Store Helpers

In [19]:
def create_faiss_vec_store(elemnts_to_emb, folder_name):
  """Embed documents into a FAISS store and save it locally.

  Args:
    elemnts_to_emb: Documents handed to `FAISS.from_documents`.
    folder_name: Folder passed to `save_local`.

  Returns:
    The FAISS vector store that was built and saved.
  """
  store = FAISS.from_documents(elemnts_to_emb, embedding=OpenAIEmbeddings())
  store.save_local(folder_name)
  return store

## Retriever Helpers

In [160]:
# Retrieve the documents the given retriever considers relevant to the query
def retrieve_section(in_retriever,query):
  """Run a retriever for a query and return its results.

  This function does not truncate the results; it returns whatever the
  retriever itself produces.

  Args:
    in_retriever: Retriever object exposing `invoke(query)` (preferred) or the
      legacy `get_relevant_documents(query)`.
    query: The query passed to the retriever.

  Returns:
    The retriever's results, or None when the retriever returns nothing.
  """
  # `get_relevant_documents` is deprecated in LangChain in favour of the
  # Runnable `invoke` API, so prefer `invoke` and only fall back to the legacy
  # method for retrievers that do not provide it.
  run_query = getattr(in_retriever, "invoke", None)
  if run_query is None:
    run_query = in_retriever.get_relevant_documents

  results = run_query(query)
  if not results:
    return None
  return results


In [142]:
# Run the retriever for the input query
def get_retrieve_section(in_retriever,in_query):
  """Return the result of `retrieve_section` for the given retriever and query."""
  return retrieve_section(in_retriever, in_query)


## Level 1 Helpers

**Document Chunking**

In [22]:
def get_element_chunk(split_header_list,doc_to_chunk):
  """Split one HTML document into sections along the given headers.

  Args:
    split_header_list: Value passed as `headers_to_split_on` to HTMLSectionSplitter.
    doc_to_chunk: Object whose `page_content` and `metadata` are copied into a
      temporary Document before splitting.

  Returns:
    The result of `HTMLSectionSplitter.split_documents` for that document.
  """
  # Re-wrap the content and metadata in a fresh Document that is handed to the splitter
  source_doc = Document(page_content=doc_to_chunk.page_content, metadata=doc_to_chunk.metadata)

  # The splitter receives a one-element list containing only this document
  return HTMLSectionSplitter(headers_to_split_on=split_header_list).split_documents([source_doc])


## L2 Proposition Helpers

Reference: https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb


 The proposition prompt comes from: https://smith.langchain.com/hub/wfh/proposal-indexing?organizationId=50995362-9ea0-4378-ad97-b4edae2f9f22


### Setup proposition models

In [116]:
# Schema of the proposition model's structured output: a single `sentences`
# field holding a list of strings.
# (Documented with comments rather than a docstring so the model schema itself is unchanged.)
class Sentences(BaseModel):
    sentences: List[str]

# Output parser built on the Sentences schema.
parser = PydanticOutputParser(pydantic_object=Sentences)


In [120]:
# One-shot demonstration shown to the proposition model before the real passage.
#
# The example answer is serialised as a JSON object with a "sentences" key.
# It used to be a Python-style list literal, which contradicted the system
# prompt's "JSON object with a key `sentences`" instruction and the schema the
# Pydantic output parser validates against. A demonstration in the wrong format
# pushes the model towards output the parser cannot read.
proposition_examples = [
    {
        "document": "Title: ¯Eostre. Section: Theories and interpretations, Connection to Easter Hares. Content: The earliest evidence for the Easter Hare (Osterhase) was recorded in south-west Germany in 1678 by the professor of medicine Georg Franck von Franckenau, but it remained unknown in other parts of Germany until the 18th century. Scholar Richard Sermon writes that \"hares were frequently seen in gardens in spring, and thus may have served as a convenient explanation for the origin of the colored eggs hidden there for children. Alternatively, there is a European tradition that hares laid eggs, since a hare’s scratch or form and a lapwing’s nest look very similar, and both occur on grassland and are first seen in the spring. In the nineteenth century the influence of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular throughout Europe. German immigrants then exported the custom to Britain and America where it evolved into the Easter Bunny.",
        # Example values are substituted into the template as plain text, so the
        # braces produced by json.dumps are not treated as template variables.
        "propositions": json.dumps(
            {
                "sentences": [
                    "The earliest evidence for the Easter Hare was recorded in south-west Germany in 1678 by Georg Franck von Franckenau.",
                    "Georg Franck von Franckenau was a professor of medicine.",
                    "The evidence for the Easter Hare remained unknown in other parts of Germany until the 18th century.",
                    "Richard Sermon was a scholar.",
                    "Richard Sermon writes a hypothesis about the possible explanation for the connection between hares and the tradition during Easter.",
                    "Hares were frequently seen in gardens in spring.",
                    "Hares may have served as a convenient explanation for the origin of the colored eggs hidden in gardens for children.",
                    "There is a European tradition that hares laid eggs.",
                    "A hare’s scratch or form and a lapwing’s nest look very similar.",
                    "Both hares and lapwing’s nests occur on grassland and are first seen in the spring.",
                    "In the nineteenth century the influence of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular throughout Europe.",
                    "German immigrants exported the custom of the Easter Hare/Rabbit to Britain and America.",
                    "The custom of the Easter Hare/Rabbit evolved into the Easter Bunny in Britain and America.",
                ]
            },
            ensure_ascii=False,
        ),
    }
]

# Each example is rendered as a human message (the passage) followed by an
# AI message (the expected propositions).
example_proposition_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{document}"),
        ("ai", "{propositions}"),
    ]
)
prop_few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt = example_proposition_prompt,
    examples = proposition_examples,
)


In [121]:
# System instructions for the proposition model: how to decompose a passage and
# which output format to use (a JSON object with a "sentences" array).
prop_system  = '''Decompose the Passage below into clear and simple propositions, ensuring they are interpretable out of
context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input
whenever possible.
2. For any named entity that is accompanied by additional descriptive information, separate this
information into its own distinct proposition.
3. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences
and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the
entities they refer to.
4. Present the results as a list of strings, formatted in JSON.

Output format:
1. Present the results as a JSON object with a key `"sentences"`, and the value should be an array of strings.
'''

# Full chat prompt: system instructions, then the few-shot example messages,
# then the passage to decompose (filled in through the `document` variable).
proposition_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", prop_system),
        prop_few_shot_prompt,
        ("human", "{document}"),
    ]
)


In [122]:
# Wrap every proposition sentence in its own Document
def get_new_prop_doc(relevant_sections, prop_results):
  """Turn proposition results into a flat list of one-sentence Documents.

  Args:
    relevant_sections: Source sections; only used to pair up with
      `prop_results` (pairing stops at the shorter of the two).
    prop_results: Objects exposing a `sentences` iterable of strings.

  Returns:
    A list of Documents, one per sentence, each with empty metadata.
  """
  return [
      Document(page_content=sentence, metadata={})
      for _, result in zip(relevant_sections, prop_results)
      for sentence in result.sentences
  ]

# Main RAG end to end: Generation


In [45]:
# Vector store folder paths (relative to the current working directory)
# List the working directory first to confirm the folders are where we expect
!ls
L1_vector_folder = 'L1_vector_test'
L2_vector_folder = 'L2_vector_prop'

 Baseline.ipynb		 'L1_Process_Chunk&Save.ipynb'	 Proposition_Light.ipynb
 Baseline_vector	  L1_vector			 Proposition_Sample.ipynb
 dense_pack		  L1_vector_test		 rag_sw_ver2.ipynb
 evaluation		  L1_vector_test_2		 rag_sw_ver3.ipynb
 Evaluation.ipynb	  L2_vector_prop		 test_single_doc.json
 gold_test_file_30.json   Proposition_Complete.ipynb


In [46]:
# Load the gold test file through the load_documents helper
file_path="gold_test_file_30.json"
test_documents = load_documents(file_path)

## Step 1: Load L1 local vectorstore

In [47]:
# Embedding function handed to the L1 vector store when it is loaded
embeddings = OpenAIEmbeddings()

In [48]:
# Open the Chroma store persisted under L1_vector_folder, using the embedding
# function created above.
L1_vectorstore = Chroma(
    persist_directory=L1_vector_folder,
    embedding_function=embeddings
)

In [49]:
# Verify the local load result by counting the stored entries.
# NOTE(review): `_collection` is a private attribute of the store object and may
# change between library versions.
total_docs = L1_vectorstore._collection.count()
if total_docs > 0:
    print(f"Vectorstore contains {total_docs} documents")
else:
    print("Vectorstore is empty")

Vectorstore contains 808 documents


## Step 2: When the user inputs a query, start L1 Retrieval + L2 Chunking

### variable definition

In [161]:
# "k" requested from the L1 retriever (see search_kwargs below)
L1_top_k =3
# Number of results requested from the L2 retriever (passed into the pipeline at run time)
result_top_k =10
# Sampling settings (temperature / top_p) used for the final answer-generation call
temp=0.2
topp=0.9
# Chat model that decomposes the L1 sections into propositions
L2_model = 'gpt-4-turbo'
# Model names tried for L2_model:
#gpt-4-turbo
#gpt-3.5-turbo

# Set the L1 retriever (MMR search over the L1 vector store)
L1_retriever=L1_vectorstore.as_retriever(search_type="mmr",search_kwargs={"k":L1_top_k})
# L2 model setup. NOTE: `temp` / `topp` are not passed to this model instance.
llm = ChatOpenAI(model=L2_model, openai_api_key =  os.environ["OPENAI_API_KEY"])

# Proposition chain: prompt -> chat model -> output parser
L2_runnable = proposition_prompt | llm | parser

In [162]:
# Collect the questions from all documents.
# The metadata builder always writes a "question_text" key (with value None when
# the record has no question), so a key-membership test never filters anything.
# Test the value instead so that missing or empty questions are left out of the run.
test_questions = [
    eachDoc.metadata["question_text"]
    for eachDoc in test_documents
    if eachDoc.metadata.get("question_text")
]

print(test_questions)

['who sang the theme tune to absolutely fabulous', 'where did the annual wife carrying world championships take place', 'when was figure skating introduced to the olympics', 'who sings in the eye of the storm', 'who won the primary for governor of illinois', 'where did they film american horror story coven', 'what is the main job of the pharynx', "what film has the song don't you forget about me", 'where is archangel raphael mentioned in the bible', 'who sings the theme song to one tree hill', 'who fired the first shot of the civil war at fort sumter', 'who plays amy on the secret life of an american teenager', 'when is the second sound of the heartbeat produced', 'who plays chuck on the tv show chuck', 'which olsen twin was in full house more', 'how much money does argentina make from tourism', 'who did the french revolt against in 1789', 'who is the kicker for the new york giants', 'who played the mom in lost in space 2018', 'who recorded the song do you love me', 'how is the energy 

### L2 Main codes

In [190]:
# Run the L1 + L2 pipeline for one query and report how many sections came back
def get_L2_retrival_result(in_query,top_k,result_top_k):
  """Run `L1_Retrival_L2_Complete` for a query and log the number of results.

  Args:
    in_query: The user question.
    top_k: Accepted for interface compatibility; not used by this function.
    result_top_k: Forwarded to `L1_Retrival_L2_Complete`.

  Returns:
    Whatever `L1_Retrival_L2_Complete` returns, unchanged: a list of
    proposition texts, or its string sentinel when nothing was found.
  """
  L2_result = L1_Retrival_L2_Complete(in_query,result_top_k)
  # The pipeline reports "nothing found" with a string sentinel. Calling len()
  # on that string would log its character count as the number of retrieved
  # sections, so count it as zero.
  retrieved_count = 0 if isinstance(L2_result, str) else len(L2_result)
  print(f"--L2 Retrieved {retrieved_count} relavant sections")
  return L2_result

In [191]:
# The main function that completes L1 retrieval + L2 chunking & retrieval
def L1_Retrival_L2_Complete(in_query,result_top_k):
  """Retrieve L1 sections, decompose them into propositions and retrieve over those.

  Relies on the module-level `L1_retriever`, `L2_runnable` and
  `L2_vector_folder`. On each successful call a FAISS store of the propositions
  is built and saved to `L2_vector_folder`.

  Args:
    in_query: The user question.
    result_top_k: "k" requested from the L2 (proposition) retriever.

  Returns:
    A list with the text of the retrieved propositions, or the string
    'No Found' when any stage yields nothing.
  """
  # Step 1: Get the L1 relevant sections based on the input question
  L1_relevant_sections= retrieve_section(L1_retriever,in_query)
  # retrieve_section returns None when nothing matches; stop here instead of
  # failing on len(None) below.
  if not L1_relevant_sections:
    print("--- L1 Retrieved 0 relavant sections")
    return 'No Found'
  print(f"--- L1 Retrieved {len(L1_relevant_sections)} relavant sections")
  L1_relevant_sections_texts = [doc.page_content for doc in L1_relevant_sections]

  # Step 2: Start L2 based on the L1 result
  # Run the proposition model once per L1 section
  print("Running Proposition...")
  L2_prop_results = [L2_runnable.invoke({"document": text}) for text in L1_relevant_sections_texts]
  proposition_docs = get_new_prop_doc(L1_relevant_sections, L2_prop_results)
  print(f"{len(proposition_docs)} proposition docs to be embeded")
  print(proposition_docs)

  # A single proposition is still a usable result; only an empty list means
  # there is nothing to index (the previous `> 1` check discarded that case).
  if not proposition_docs:
    return 'No Found'

  # Save the L2 result to the vector database
  L2_vectorstore=create_faiss_vec_store(proposition_docs,L2_vector_folder)
  L2_retriever=L2_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": result_top_k} )

  # L2 retrieval; guard against the None returned for an empty result
  L2_relevant_sections=retrieve_section(L2_retriever,in_query)
  if not L2_relevant_sections:
    return 'No Found'

  return [eachL2Doc.page_content for eachL2Doc in L2_relevant_sections]


In [192]:
def get_API_response(client,sys_prompt,user_prompt,temp,topp,model="gpt-4o"):
  """Send one system + user message pair to a chat-completions client.

  Args:
    client: Object exposing `chat.completions.create(...)`.
    sys_prompt: Content of the system message.
    user_prompt: Content of the user message.
    temp: Value passed as `temperature`.
    topp: Value passed as `top_p`.
    model: Model name passed to the API. Defaults to "gpt-4o", the value that
      was previously hard-coded, so existing callers are unaffected.

  Returns:
    The `message.content` of the first choice in the completion.
  """
  completion=client.chat.completions.create(
      model=model,
      temperature=temp,
      top_p=topp,
      messages=[
          {"role":"system","content":sys_prompt},
          {"role":"user","content":user_prompt}
      ],
  )
  return completion.choices[0].message.content

In [193]:
def build_question_to_gold_answer_map(in_documents):
    """Map each document's stripped question text to its gold answer.

    Args:
        in_documents: Iterable of objects with a `metadata` dict that may hold
            "question_text" and "gold_answer".

    Returns:
        A dict from question text (whitespace-stripped; "" when the question is
        missing or None) to the gold answer ({} when absent). Later documents
        overwrite earlier ones that share the same question.
    """
    question_to_gold = {}

    for doc in in_documents:
        # The key may exist with a None value, in which case `.get(key, "")`
        # still returns None and `.strip()` would raise. Coerce to "" first.
        question = (doc.metadata.get("question_text") or "").strip()
        gold = doc.metadata.get("gold_answer", {})
        question_to_gold[question] = gold

    return question_to_gold


In [194]:
def RAG_generation(questions,top_k,result_top_k,question_to_gold_map,mode='test'):
  """Answer questions with the L1 + L2 retrieval pipeline and collect the results.

  Uses the module-level `temp` and `topp` sampling settings for the answer call.

  Args:
    questions: Sequence of question strings.
    top_k: Forwarded to `get_L2_retrival_result`.
    result_top_k: Forwarded to `get_L2_retrival_result`.
    question_to_gold_map: Dict from stripped question text to gold answer.
    mode: 'test' stops after the first question; any other value processes
      every question.

  Returns:
    A list of dicts with the keys "input_question", "retrieved_contexts",
    "response" and "gold_answer", one per processed question.
  """
  answer_NQ_client=OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

  run_results = []

  for i, question_curr in enumerate(questions):
    print(f"============={i+1} Question:{question_curr}=============")

    relevant_content = get_L2_retrival_result(question_curr,top_k,result_top_k)
    # The retrieval step returns a plain string sentinel when nothing is found.
    # Joining a string would insert "\n\n" between its individual characters,
    # so treat that case (and any empty result) as "no context".
    if isinstance(relevant_content, str) or not relevant_content:
      relevant_content = []

    # Convert the retrieved texts into one string before passing it into the prompt
    relevant_content_text = "\n\n".join(relevant_content)

    sys_prompt=""
    user_prompt=f"""
    Answer the question **directly and concisely** using only the provided context.
      - Do not repeat the question.
      - Do not include information not in the context.
      - If the answer is unclear or not found, say 'I don't have the answer.'

    Question: {question_curr}
    Relevant contents:{relevant_content_text}
    """
    response=get_API_response(answer_NQ_client,sys_prompt,user_prompt,temp,topp)
    print('-------Final Answer:-------------')
    print(f"Question: {question_curr}\n Response: {response}")

    # Record the result for the output file
    gold_answer = question_to_gold_map.get(question_curr.strip(), "")
    run_results.append({
      "input_question": question_curr,
      "retrieved_contexts": relevant_content,
      "response": response,
      "gold_answer": gold_answer
    })

    # In test mode only the first question is processed
    if mode == 'test':
      break

  return run_results


### Run Main Proposition

In [195]:
# Build the question -> gold answer lookup, then run the pipeline.
# mode='test' stops after the first question; pass any other value to process all questions.
question_to_gold_map = build_question_to_gold_answer_map(test_documents)
run_results = RAG_generation(test_questions,L1_top_k,result_top_k,question_to_gold_map,mode='test')

--- L1 Retrieved 3 relavant sections
Running Proposition...
53 proposition docs to be embeded
[Document(metadata={}, page_content="The theme song for Absolutely Fabulous is 'This Wheel's on Fire'."), Document(metadata={}, page_content="'This Wheel's on Fire' was written by Bob Dylan and Rick Danko."), Document(metadata={}, page_content="'This Wheel's on Fire' was performed by Julie Driscoll and Adrian Edmondson."), Document(metadata={}, page_content="Adrian Edmondson is Saunders' husband."), Document(metadata={}, page_content="Marianne Faithfull and P.P. Arnold also sang 'This Wheel's on Fire' for the 'Last Shout' special in 1996."), Document(metadata={}, page_content="Hermine Demoriane sang a French version of 'This Wheel's on Fire' over the closing credits of the episode 'Paris'."), Document(metadata={}, page_content="At the end of the episode 'Birthday', Edina and Patsy sang 'This Wheel's on Fire' together using a karaoke machine."), Document(metadata={}, page_content="Debbie Harry 

In [188]:
# Results of the run labelled "mmr"
# (presumably the retriever search_type in effect for that run - TODO confirm which retriever)
run_results

[{'input_question': 'who sang the theme tune to absolutely fabulous',
  'retrieved_contexts': ["The theme song for 'Absolutely Fabulous' is 'This Wheel's on Fire'.",
   "In 1994, Pet Shop Boys recorded a song for Comic Relief using excerpts of dialogue from 'Absolutely Fabulous' put to dance music.",
   'Dawn French appeared on Absolutely Fabulous only once.',
   'Jon Plowman expressed excitement that Absolutely Fabulous was returning for three new shows.',
   "Absolutely Fabulous was produced by Saunders and French's production company.",
   "The theme song is missing from many of the US Region 1 DVDs of 'Absolutely Fabulous' due to copyright issues.",
   "For series four, a line from 'Ziggy Stardust' by David Bowie, 'Ziggy played guitar', played at the end of each episode.",
   "Absolutely Fabulous has no connection, other than the character's name, to the earlier film 'Eddie Monsoon: A Life?'.",
   'Saunders remembered the falls that she saw Bananarama do when she started doing Abso

In [197]:
# Results of the run labelled "similarity"
# (presumably the retriever search_type in effect for that run - TODO confirm which retriever)
run_results

[{'input_question': 'who sang the theme tune to absolutely fabulous',
  'retrieved_contexts': ["The theme song for Absolutely Fabulous is 'This Wheel's on Fire'.",
   'In 1994, Pet Shop Boys recorded a song for Comic Relief using excerpts of dialogue from Absolutely Fabulous put to dance music.',
   "The single from 1994 was attributed to 'Absolutely Fabulous produced by Pet Shop Boys'.",
   'The music video for the 1994 single featured clips from Absolutely Fabulous and specially recorded footage of the Pet Shop Boys with Patsy and Edina.',
   "For series four of Absolutely Fabulous, a line from the song 'Ziggy Stardust' by David Bowie, 'Ziggy played guitar', played at the end of each episode.",
   "Absolutely Fabulous was produced by Saunders and French's production company.",
   "The first special of the new Absolutely Fabulous specials was titled 'Identity'.",
   "The theme song 'This Wheel's on Fire' is missing from many of the US Region 1 DVDs of Absolutely Fabulous.",
   'Jon Pl

In [135]:
def save_test_output(run_results, output_dir="./evaluation", model_name=None):
  """Write the run results to a dated JSON file.

  The file is named `run_results_proposition_<model>_<YYYY-MM-DD>.json`.

  Args:
    run_results: JSON-serialisable results (a list is expected, since its
      length is reported).
    output_dir: Folder to write into. Defaults to "./evaluation", the location
      that was previously hard-coded. It is created when missing.
    model_name: Model label used in the file name. Defaults to the
      module-level `L2_model`.
  """
  if model_name is None:
    model_name = L2_model

  today = datetime.today().strftime("%Y-%m-%d")
  # open(..., "w") does not create missing folders, so make sure the target exists
  os.makedirs(output_dir, exist_ok=True)
  output_path = f"{output_dir}/run_results_proposition_{model_name}_{today}.json"

  with open(output_path, "w", encoding="utf-8") as f:
      json.dump(run_results, f, indent=4, ensure_ascii=False)

  print(f"Saved {len(run_results)} results to {output_path}")

In [136]:
# Persist the collected run results as a JSON file
save_test_output(run_results)

Saved 30 results to ./evaluation/run_results_proposition_gpt3.5_2025-04-05.json


**Note**: When run with GPT-4, the responses are quite accurate.
With L1_top_k = 3 and result_top_k = 5, the first example yields 12 propositions, costs around $0.04, and the result is accurate.

For 1 data sample with 40 propositions, it costs $0.01.

**For GPT-3.5 Turbo, 30 data samples (each with 40 propositions on average) cost $0.1.**

for GPT4.0Turbo, 30 data sample take $3.6-