# Environment Setup

In [None]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
!pip install -qU "langchain[openai]" # select chat model OpenAI
!pip install -U langchain langchain-core langchain-community

In [None]:
!pip install -qU langchain-openai # select embeddings model OpenAI
!pip install -qU langchain-community # select vector store FAISS
!pip install jq
!pip install faiss-cpu

In [172]:
from openai import OpenAI
import json
import faiss
import re
import getpass
import os

In [173]:

from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import HTMLSectionSplitter
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from langchain.chains import create_extraction_chain_pydantic
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
from typing import Optional, List


In [174]:
# Ask for the OpenAI API key interactively only when it is not already set in
# the environment, so the key is never hardcoded in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

In [175]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

%cd  /content/drive/MyDrive/ECE1508_Project/Codes

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ECE1508_Project/Codes


# Helpers

## Load Test Doc

In [176]:
def metadata_fuc(record:dict,metadata:dict)->dict:
  """Copy question texts, URL and title from a raw record into ``metadata``.

  Args:
    record: Raw JSON record of one document.
    metadata: Metadata dict to extend; it is updated in place.

  Returns:
    The same ``metadata`` object, now carrying ``question_texts``,
    ``document_url`` and ``Title`` (``"Untitled Document"`` when the record
    has no ``title``).
  """
  question_texts=[]
  for question in record.get("questions",[]):
    question_texts.append(question.get("question_text"))

  metadata.update({
    "question_texts":question_texts,
    "document_url":record.get("document_url"),
    "Title":record.get("title", "Untitled Document"),
  })
  return metadata

In [177]:
def load_documents(file_path):
  """Load the JSON file at ``file_path`` into LangChain documents.

  The loader reads from the JSON root (``jq_schema="."``), takes the page
  content from the ``document_text`` field and builds the metadata with
  ``metadata_fuc``.

  Args:
    file_path: Path of the JSON file to load.

  Returns:
    The documents returned by ``JSONLoader.load()``.
  """
  return JSONLoader(
    file_path=file_path,
    jq_schema=".",
    content_key="document_text",
    metadata_func=metadata_fuc,
  ).load()

## Vector Store Helpers

In [178]:
def create_faiss_vec_store(elemnts_to_emb, folder_name):
  """Embed documents with OpenAI embeddings, index them in FAISS and save the index.

  Args:
    elemnts_to_emb: Documents to embed and index.
    folder_name: Folder the FAISS index is written to via ``save_local``.

  Returns:
    The in-memory FAISS vector store that was just saved.
  """
  embedder=OpenAIEmbeddings()
  store=FAISS.from_documents(elemnts_to_emb,embedding=embedder)
  store.save_local(folder_name)
  return store

## Retriever Helpers

In [179]:
# Retrieve the top-K matching contents
def retrieve_section(in_retriever,query,top_k):
  """Return at most ``top_k`` documents that ``in_retriever`` finds for ``query``.

  The retriever is queried through ``invoke`` when it offers that method,
  because ``get_relevant_documents`` is deprecated in LangChain. Retrievers
  that only expose the legacy method are still supported.

  Args:
    in_retriever: Retriever object exposing ``invoke(query)`` or
      ``get_relevant_documents(query)``.
    query: The query string.
    top_k: Maximum number of results to keep.

  Returns:
    The first ``top_k`` results, or ``None`` when the retriever returns nothing.
  """
  fetch=getattr(in_retriever,"invoke",None)
  if fetch is None:
    # Legacy retriever without the Runnable interface
    fetch=in_retriever.get_relevant_documents

  results=fetch(query)
  if not results:
    return None
  return results[:top_k]


In [180]:
# Run the retriever for the input query
def get_retrieve_section(in_retriever,in_query,top_k):
  """Log the query and return the top ``top_k`` sections for it.

  Thin wrapper around ``retrieve_section`` that prints the query first.

  Args:
    in_retriever: Retriever to query.
    in_query: The query string.
    top_k: Maximum number of sections to return.

  Returns:
    Whatever ``retrieve_section`` returns for the same arguments.
  """
  print(f"Retrieving answer for query: {in_query}")
  return retrieve_section(in_retriever,in_query,top_k)


## Level 1 Helpers

**Document Chunking**

In [181]:
def get_element_chunk(split_header_list,doc_to_chunk):
  """Split one HTML document into sections at the given header tags.

  Args:
    split_header_list: ``(tag, label)`` pairs passed to ``HTMLSectionSplitter``
      as ``headers_to_split_on``.
    doc_to_chunk: Document whose ``page_content`` is split; its metadata is
      handed to the splitter along with the content.

  Returns:
    The documents produced by ``HTMLSectionSplitter.split_documents``.
  """
  # Re-wrap the content and metadata in a fresh Document for the splitter
  wrapped_doc = Document(
    page_content=doc_to_chunk.page_content,
    metadata=doc_to_chunk.metadata,
  )
  section_splitter = HTMLSectionSplitter(headers_to_split_on=split_header_list)
  return section_splitter.split_documents([wrapped_doc])


## L2 Proposition Helpers

Reference: https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb


The proposition prompt comes from: https://smith.langchain.com/hub/wfh/proposal-indexing?organizationId=50995362-9ea0-4378-ad97-b4edae2f9f22


### Setup proposition models

In [182]:
# Output schema of the proposition model: one JSON object holding the list of
# standalone proposition sentences.
class Sentences(BaseModel):
    sentences: List[str]

# Parses the model reply into a Sentences instance; the reply must therefore
# be a JSON object with a "sentences" key, never a bare JSON list.
parser = PydanticOutputParser(pydantic_object=Sentences)

# Every instruction and the worked example below use the same
# {"sentences": [...]} shape that the parser expects, including the empty case.
# Literal braces are doubled because the template is format-string based;
# {input} is the only template variable.
prompt = PromptTemplate.from_template("""
Decompose the Passage below into clear and simple propositions, ensuring they are interpretable out of
context.
Note:  **If you think the input cannot be broken down into propositions, return an empty "sentences" list, don't provide an error output**
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input
whenever possible.
2. For any named entity that is accompanied by additional descriptive information, separate this
information into its own distinct proposition.
3. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences
and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the
entities they refer to.
4. Present the results as a JSON object whose "sentences" key holds the list of strings.


Passage:
{input}

Example:

Input: Title: ¯Eostre. Section: Theories and interpretations, Connection to Easter Hares. Content:
The earliest evidence for the Easter Hare (Osterhase) was recorded in south-west Germany in
1678 by the professor of medicine Georg Franck von Franckenau, but it remained unknown in
other parts of Germany until the 18th century. Scholar Richard Sermon writes that "hares were
frequently seen in gardens in spring, and thus may have served as a convenient explanation for the
origin of the colored eggs hidden there for children. Alternatively, there is a European tradition
that hares laid eggs, since a hare’s scratch or form and a lapwing’s nest look very similar, and
both occur on grassland and are first seen in the spring. In the nineteenth century the influence
of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular throughout Europe.
German immigrants then exported the custom to Britain and America where it evolved into the
Easter Bunny."
Output: {{"sentences": [ "The earliest evidence for the Easter Hare was recorded in south-west Germany in
1678 by Georg Franck von Franckenau.", "Georg Franck von Franckenau was a professor of
medicine.", "The evidence for the Easter Hare remained unknown in other parts of Germany until
the 18th century.", "Richard Sermon was a scholar.", "Richard Sermon writes a hypothesis about
the possible explanation for the connection between hares and the tradition during Easter", "Hares
were frequently seen in gardens in spring.", "Hares may have served as a convenient explanation
for the origin of the colored eggs hidden in gardens for children.", "There is a European tradition
that hares laid eggs.", "A hare’s scratch or form and a lapwing’s nest look very similar.", "Both
hares and lapwing’s nests occur on grassland and are first seen in the spring.", "In the nineteenth
century the influence of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular
throughout Europe.", "German immigrants exported the custom of the Easter Hare/Rabbit to
Britain and America.", "The custom of the Easter Hare/Rabbit evolved into the Easter Bunny in
Britain and America."]}}

Return the result in the following JSON format :
{{"sentences": ["sentence1", "sentence2", ...]}}

Important: If you cannot extract any propositions from the input, return exactly this: {{"sentences": []}}. Do not return any explanation, message, or error. You must always return a valid JSON object in the format above — even if its "sentences" list is empty

""")



In [183]:
# Wrap every extracted proposition in its own Document
def get_new_prop_doc(relevant_sections, prop_results):
  """Turn proposition results into one Document per proposition sentence.

  ``relevant_sections`` and ``prop_results`` are paired positionally with
  ``zip``; each sentence of a result becomes a Document carrying the metadata
  object of the section it came from.

  Args:
    relevant_sections: Source documents the propositions were extracted from.
    prop_results: Objects with a ``sentences`` attribute, one per section.

  Returns:
    A flat list of proposition Documents, in section order then sentence order.
  """
  return [
    Document(page_content=proposition, metadata=source_doc.metadata)
    for source_doc, extraction in zip(relevant_sections, prop_results)
    for proposition in extraction.sentences
  ]

# Main RAG end to end: Generation


In [184]:
# Vector store folder paths
#Recheck pwd
!ls
# Folder names (relative to the current working directory) that the FAISS
# indexes are saved into: L1 holds the header-based chunks, L2 the propositions.
L1_vector_folder = 'L1_vector'
L2_vector_folder = 'L2_vector_prop'

L1_vector	Proposition_Complete.ipynb  test_single_doc.json
L2_vector_prop	Proposition_Sample.ipynb


In [185]:
# Load the test JSON file; the path is relative to the current working directory.
file_path="test_single_doc.json"
test_documents = load_documents(file_path)

## Step 1: L1 basic chunking

Chunk all documents in the input json and save into vector database

In [186]:
def L1_process_document(doc):
  """Chunk every document by HTML headers, embed the chunks and save the L1 index.

  Each document is split on ``h1``/``h2``/``h3`` headers, all chunks are
  embedded and indexed in FAISS, and the index is saved to ``L1_vector_folder``.

  Args:
    doc: List of documents to process.

  Returns:
    The FAISS vector store built from all chunks.

  Raises:
    ValueError: If no chunk was produced, so there is nothing to embed.
  """
  headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
  ]

  all_chunks = []
  for idx,eachDoc in enumerate(doc):
    chunks = get_element_chunk(headers_to_split_on, eachDoc)
    all_chunks.extend(chunks)
    print(f"Split document {idx+1} into {len(chunks)} sub-documents.")
    if chunks:
      # Show the second chunk as before, falling back to the first one when the
      # document produced a single chunk (chunks[1] would raise IndexError).
      example_chunk = chunks[1] if len(chunks) > 1 else chunks[0]
      print(f"Example 1: {example_chunk}")

  if not all_chunks:
    # Fail with a clear message instead of an obscure error inside FAISS
    raise ValueError("No chunks were produced from the input documents; nothing to embed.")

  #Embed and Vector store
  L1_vectorstore=create_faiss_vec_store(all_chunks,L1_vector_folder)

  print(f"{len(doc)} documents sucessfully processed and saved to {L1_vector_folder}")

  return L1_vectorstore



In [None]:
# Build the L1 vector store from the loaded test documents (this embeds every
# chunk with OpenAI embeddings and saves the index to L1_vector_folder).
L1_vectorstore = L1_process_document(test_documents)

## Step 2: When the user inputs a query, start L1 Retrieval + L2 Chunking

In [188]:
# Flatten the question texts stored in each document's metadata into one list
test_questions = [
    question_text
    for loaded_doc in test_documents
    for question_text in loaded_doc.metadata.get("question_texts", [])
]
print(test_questions)

['when did the new maze runner movie come out', 'when is the death cure coming out in ireland', 'when is the death cure released in the uk']


In [189]:
# Get the L1 retriever: similarity search over the L1 vector store, configured
# with k=10; callers later trim the results further to their own top_k.
L1_retriever=L1_vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":10})

In [190]:
# L2 model setup
# NOTE(review): `obj` (the prompt pulled from the hub) is not referenced in the
# visible part of this notebook; the chain below uses the locally defined
# `prompt` instead. Confirm whether this pull is still needed.
obj = hub.pull("wfh/proposal-indexing")
llm = ChatOpenAI(model='gpt-3.5-turbo', openai_api_key =  os.environ["OPENAI_API_KEY"])
# Proposition chain: fill the prompt, call the chat model, parse the reply with `parser`
L2_runnable = prompt | llm | parser



In [191]:
# Run the L1 + L2 retrieval pipeline and print how many sections were found
def get_retrival_result(in_query,top_k,result_top_k):
  """Run ``L1_Retrival_L2_Complete`` for a query and report the result size.

  Args:
    in_query: The user question.
    top_k: Number of L1 sections to keep.
    result_top_k: Number of L2 propositions to keep.

  Returns:
    The value returned by ``L1_Retrival_L2_Complete``: a list of proposition
    texts, or the ``'No Found'`` string when nothing was retrieved.
  """
  L2_result = L1_Retrival_L2_Complete(in_query,top_k,result_top_k)
  # The pipeline signals "nothing found" with a string; calling len() on it
  # would report the number of characters as the number of sections.
  section_count = 0 if isinstance(L2_result, str) else len(L2_result)
  print(f"--L2 Retrieved {section_count} relavant sections")
  return L2_result

In [192]:
# Main function: L1 retrieval followed by L2 proposition chunking and retrieval
def L1_Retrival_L2_Complete(in_query,top_k,result_top_k):
  """Retrieve L1 sections for a query, split them into propositions, retrieve again.

  Steps:
    1. Retrieve the ``top_k`` L1 sections for ``in_query``.
    2. Run the proposition chain on every section and wrap each proposition in
       a Document.
    3. Embed the propositions into the L2 vector store (saved to
       ``L2_vector_folder``) and retrieve the ``result_top_k`` best matches.

  Args:
    in_query: The user question.
    top_k: Number of L1 sections to keep.
    result_top_k: Number of L2 propositions to return.

  Returns:
    A list with the text of the retrieved propositions, or the string
    ``'No Found'`` when any stage yields nothing.
  """
  #Step 1: Get L1 relevant sections based on the input question
  L1_relevant_sections=get_retrieve_section(L1_retriever,in_query,top_k)
  if not L1_relevant_sections:
    # The retrieval helper returns None when nothing matches; len(None) would raise
    print("--L1 Retrieved 0 relavant sections")
    return 'No Found'
  print(f"--L1 Retrieved {len(L1_relevant_sections)} relavant sections")
  L1_relevant_sections_texts = [doc.page_content for doc in L1_relevant_sections]
  print(L1_relevant_sections_texts)

  # Step 2: Start L2 based on L1 result
  #Run Proposition Model
  print(f"Running Proposition...")
  L2_prop_results = [L2_runnable.invoke({"input": text}) for text in L1_relevant_sections_texts]
  proposition_docs = get_new_prop_doc(L1_relevant_sections, L2_prop_results)
  print(f"{len(proposition_docs)} proposition docs to be embeded")

  # A single proposition is still a valid result; only bail out when there are none
  if not proposition_docs:
    return 'No Found'

  #Save L2 result to vector database
  L2_vectorstore=create_faiss_vec_store(proposition_docs,L2_vector_folder)
  L2_retriever=L2_vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":10})

  #L2 Retrieval
  L2_relevant_sections=get_retrieve_section(L2_retriever,in_query,result_top_k)
  if L2_relevant_sections is None:
    return 'No Found'

  return [eachL2Doc.page_content for eachL2Doc in L2_relevant_sections]


In [193]:
def get_API_response(client,sys_prompt,user_prompt,temp,topp):
  """Send one system + user message pair to ``gpt-4o`` and return the reply text.

  Args:
    client: Client object exposing ``chat.completions.create``.
    sys_prompt: Content of the system message.
    user_prompt: Content of the user message.
    temp: Sampling temperature passed as ``temperature``.
    topp: Nucleus sampling value passed as ``top_p``.

  Returns:
    The message content of the first choice in the completion.
  """
  chat_messages=[
      {"role":"system","content":sys_prompt},
      {"role":"user","content":user_prompt},
  ]
  reply=client.chat.completions.create(
      model="gpt-4o",
      temperature=temp,
      top_p=topp,
      messages=chat_messages,
  )
  return reply.choices[0].message.content

In [194]:
def RAG_generation(questions,top_k,result_top_k):
  """Answer each question with retrieved context and print the model response.

  For every question the retrieval pipeline is run, the retrieved contents are
  placed in the user prompt, and the answer from ``get_API_response`` is
  printed. Nothing is returned.

  Args:
    questions: List of question strings.
    top_k: Number of L1 sections to keep per question.
    result_top_k: Number of L2 propositions to keep per question.
  """
  answer_NQ_client=OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
  temp, topp = 1.0, 1.0
  # The system message is intentionally left empty; all context goes in the user prompt
  sys_prompt=""

  for question in questions:
    print(f"=============Question:{question}=============")
    relevant_content = get_retrival_result(question,top_k,result_top_k)
    user_prompt=f"""
    Answer the question based on the relevent contents.
    Question: {question}
    Relevant contents:{relevant_content}
    """
    response=get_API_response(answer_NQ_client,sys_prompt,user_prompt,temp,topp)
    print('-------Final Answer:-------------')
    print(f"Question: {question}\n Response: {response}")

In [195]:
# top_k: number of L1 sections kept per question.
# result_top_k: number of L2 propositions kept per question.
top_k =3
result_top_k =3
# Run the full retrieval + generation pipeline over all collected test questions.
RAG_generation(test_questions,top_k,result_top_k)

Retrieving answer for query: when did the new maze runner movie come out
--L1 Retrieved 3 relavant sections
['Production ( edit )     In March 2015 , it was confirmed that T.S. Nowlin , who co-wrote the first and wrote the second film , would adapt Maze Runner : The Death Cure . On September 16 , 2015 , it was confirmed that Ball would return to direct the final film .', "Maze Runner : the Death Cure   Jump to : navigation , search For the book the film is based on , see The Death Cure .       Maze Runner : The Death Cure   \n      Theatrical release poster   \n      Directed by     Wes Ball         Produced by         Ellen Goldsmith - Vein     Wyck Godfrey     Marty Bowen     Joe Hartwick , Jr .     Wes Ball     Lee Stollman             Screenplay by     T.S. Nowlin         Based on     The Death Cure by James Dashner         Starring         Dylan O'Brien     Kaya Scodelario     Thomas Brodie - Sangster     Nathalie Emmanuel     Giancarlo Esposito     Aidan Gillen     Walton Goggins