# Environment Setup

In [None]:
!pip install -U "langchain[openai]" langchain-core langgraph langchain-text-splitters langchain_community

In [None]:
!pip install faiss-cpu jq

In [None]:
!pip install -qU "langchain-chroma>=0.1.2"

In [None]:
from openai import OpenAI
import json
import faiss
import re
import getpass
import os
import time

In [None]:

from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import HTMLSectionSplitter
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from langchain.chains import create_extraction_chain_pydantic
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
from typing import Optional, List


In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [None]:
# Do not assign the key in the notebook: a literal here either leaks a secret
# or (when left empty) wipes a key the environment already provides.
# Prompt only when OPENAI_API_KEY is missing or empty.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

Enter API key for OpenAI: ··········


In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

%cd  /content/drive/MyDrive/ECE1508_Project/Codes

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ECE1508_Project/Codes


# Helpers

## Load Test Doc

In [None]:
def metadata_func(example: dict, _: dict) -> dict:
    """Build the metadata dict for one JSON record.

    Args:
        example: The raw record; only ``question_text`` and ``title`` are read.
        _: Second positional argument supplied by the caller; it is ignored.

    Returns:
        A dict with ``question_text`` (``None`` when absent) and ``Title``
        (``"Untitled"`` when the record has no ``title``).
    """
    question = example.get("question_text")
    title = example.get("title", "Untitled")
    return dict(question_text=question, Title=title)

In [None]:
def metadata_func_2(example: dict, _: dict) -> dict:
    """Build the metadata dict for one JSON record, including its gold answer.

    Args:
        example: The raw record; ``question_text``, ``gold_answer`` and
            ``title`` are read.
        _: Second positional argument supplied by the caller; it is ignored.

    Returns:
        A dict with ``question_text`` and ``gold_answer`` (each ``None`` when
        absent) and ``Title`` (``"Untitled"`` when the record has no ``title``).
    """
    question = example.get("question_text")
    gold = example.get("gold_answer")
    title = example.get("title", "Untitled")
    return dict(question_text=question, gold_answer=gold, Title=title)

In [None]:
def load_documents(file_path):
  """Load every record of a JSON file as a document.

  Each element selected by the jq schema ``.[]`` becomes one document whose
  content is the record's ``document_text`` field and whose metadata is
  produced by ``metadata_func``.

  Args:
    file_path: Path of the JSON file to read.

  Returns:
    The documents returned by ``JSONLoader.load()``.
  """
  return JSONLoader(
    file_path=file_path,
    jq_schema=".[]",
    content_key="document_text",
    metadata_func=metadata_func,
  ).load()

In [None]:
def load_documents_2(file_path):
  """Load every record of a JSON file as a document, keeping the gold answer.

  Same loading rules as ``load_documents`` (jq schema ``.[]``, content taken
  from ``document_text``), but metadata is produced by ``metadata_func_2``,
  which also carries ``gold_answer``.

  Args:
    file_path: Path of the JSON file to read.

  Returns:
    The documents returned by ``JSONLoader.load()``.
  """
  return JSONLoader(
    file_path=file_path,
    jq_schema=".[]",
    content_key="document_text",
    metadata_func=metadata_func_2,
  ).load()

## Vector Store Helpers

In [None]:
def create_faiss_vec_store(elemnts_to_emb, folder_name):
  """Embed documents into a FAISS store and save it to disk.

  Args:
    elemnts_to_emb: Documents to embed with ``OpenAIEmbeddings``.
    folder_name: Folder passed to ``save_local`` for the saved index.

  Returns:
    The FAISS vector store that was built and saved.
  """
  embedder = OpenAIEmbeddings()
  store = FAISS.from_documents(elemnts_to_emb, embedding=embedder)
  store.save_local(folder_name)
  return store

## Retriever Helpers

In [None]:
# Retrieve the top-K documents for a query
def retrieve_section(in_retriever,query,top_k):
  """Return at most ``top_k`` documents retrieved for ``query``.

  Args:
    in_retriever: Retriever object exposing ``invoke(query)``.
    query: The search string.
    top_k: Maximum number of results to keep from the front of the result list.

  Returns:
    The first ``top_k`` retrieved documents, or ``None`` when the retriever
    returns nothing.
  """
  # `invoke` replaces the deprecated `get_relevant_documents` retriever method.
  results = in_retriever.invoke(query)
  if not results:
    return None

  return results[:top_k]


In [None]:
# Run the retriever for the input query
def get_retrieve_section(in_retriever,in_query,top_k):
  """Log the query, then delegate to ``retrieve_section``.

  Args:
    in_retriever: Retriever forwarded to ``retrieve_section``.
    in_query: The search string.
    top_k: Maximum number of results to keep.

  Returns:
    Whatever ``retrieve_section`` returns for these arguments.
  """
  print(f"Retrieving answer for query: {in_query}")
  return retrieve_section(in_retriever, in_query, top_k)

# Baseline Chunking & Save to Vector


In [None]:
# Vector store folder path
#Recheck pwd
!ls
baseline_vector_folder = './Baseline_vector'

In [None]:
# Load the evaluation set; metadata carries question_text and Title only
# (see metadata_func).
file_path="gold_test_file_30.json"
test_documents = load_documents(file_path)
print(f"{len(test_documents)} Documents")

30 Documents


In [None]:
# Load the same file again, this time keeping gold_answer in the metadata
# (see metadata_func_2).
file_path="gold_test_file_30.json"
test_documents_gold = load_documents_2(file_path)
# Report the size of the list loaded in this cell, not of test_documents.
print(f"{len(test_documents_gold)} Documents")

30 Documents


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import shutil
import time

In [None]:
def baseline_chunk(documents, chunk_size=256, chunk_overlap=20):
  """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

  Args:
    documents: Documents to split.
    chunk_size: Maximum chunk size passed to the splitter (default 256, the
      baseline setting).
    chunk_overlap: Overlap between consecutive chunks passed to the splitter
      (default 20, the baseline setting).

  Returns:
    The list of chunk documents returned by ``split_documents``. Because
    ``add_start_index=True``, each chunk's metadata gets a ``start_index`` entry.
  """
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    add_start_index=True,
  )
  return text_splitter.split_documents(documents)

In [None]:
def base_process_document(doc):
  """Chunk the given documents and rebuild the baseline Chroma store.

  Any existing folder at the module-level ``baseline_vector_folder`` is deleted
  first, so the store is rebuilt from scratch on every call.

  Args:
    doc: Documents to chunk via ``baseline_chunk`` (``len(doc)`` is reported
      as the document count).

  Returns:
    The Chroma vector store built from the chunks and persisted to
    ``baseline_vector_folder``.
  """
  pieces = baseline_chunk(doc)
  print(f"Split document into {len(pieces)} sub-documents.")

  # Remove any previous store before embedding.
  folder_present = os.path.exists(baseline_vector_folder)
  if folder_present:
    shutil.rmtree(baseline_vector_folder)
    # NOTE(review): presumably gives the filesystem time to settle after the
    # delete - confirm whether the 5 s pause is still needed.
    time.sleep(5)

  # Embed the chunks and persist them to the Chroma directory.
  base_vectorstore = Chroma.from_documents(
    documents=pieces,
    embedding=OpenAIEmbeddings(),
    persist_directory=baseline_vector_folder,
  )

  print(f"{len(doc)} documents {len(pieces)} chunks sucessfully processed and saved to {baseline_vector_folder}")
  return base_vectorstore



In [None]:
# Time the whole chunk -> embed -> persist step for the baseline store.
start = time.time()
base_vectorstore = base_process_document(test_documents)
end = time.time()

print(f"Chunking&Store took {end - start:.4f} seconds to run.")

Split document into 5901 sub-documents.
30 documents 5901 chunks sucessfully processed and saved to ./Baseline_vector
Chunking&Store took 45.4330 seconds to run.


## Retrieve & Generate


In [None]:
# Re-open the persisted baseline store from disk.
embeddings = OpenAIEmbeddings()
base_vectorstore = Chroma(
    persist_directory=baseline_vector_folder,
    embedding_function=embeddings
)
# Verify the reload by counting stored entries.
# NOTE(review): `_collection` is a private attribute of the Chroma wrapper and
# may change between library versions.
total_docs = base_vectorstore._collection.count()
if total_docs > 0:
    print(f"Vectorstore contains {total_docs} documents")
else:
    print("Vectorstore is empty")


Vectorstore contains 5901 documents


In [None]:
# Retrieval / generation settings.
# top_k: number of chunks the similarity retriever returns per query.
# temp / topp: sampling temperature and top_p; RAG_base reads them as globals
# and forwards them to get_API_response.
top_k = 10
temp=0.2
topp=0.9
base_retriever=base_vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":top_k})

In [None]:
def get_API_response(client,sys_prompt,user_prompt,temp,topp,model="gpt-4o"):
  """Send one system + user message pair to the chat completions API.

  Args:
    client: Client object exposing ``chat.completions.create``.
    sys_prompt: Content of the system message.
    user_prompt: Content of the user message.
    temp: Value passed as ``temperature``.
    topp: Value passed as ``top_p``.
    model: Model name to query; defaults to ``"gpt-4o"``, the value that was
      previously hard-coded.

  Returns:
    The ``message.content`` of the first choice in the completion.
  """
  completion=client.chat.completions.create(
      model=model,
      temperature=temp,
      top_p=topp,
      messages=[
          {"role":"system","content":sys_prompt},
          {"role":"user","content":user_prompt}
      ],
  )
  return completion.choices[0].message.content

In [None]:
def retrieve_page_content(query):
  """Retrieve documents for ``query`` from the module-level ``base_retriever``.

  Args:
    query: The search string.

  Returns:
    The retrieved documents, or ``None`` when the retriever returns nothing.
  """
  # `invoke` replaces the deprecated `get_relevant_documents` retriever method.
  results = base_retriever.invoke(query)
  if not results:
    return None

  return results

In [None]:
def build_question_to_gold_answer_map(in_documents):
    """Map each document's stripped question text to its gold answer.

    Args:
        in_documents: Iterable of objects with a ``metadata`` dict; the keys
            ``question_text`` and ``gold_answer`` are read.

    Returns:
        A dict from question text (whitespace-stripped) to ``gold_answer``.
        A missing or ``None`` question is stored under ``""``; a missing gold
        answer is stored as ``{}``. Later documents overwrite earlier ones
        that share the same question.
    """
    question_to_gold = {}

    for doc in in_documents:
        # `or ""` also covers a key that is present with value None, which
        # `.get(key, "")` alone would pass straight to `.strip()`.
        question = (doc.metadata.get("question_text") or "").strip()
        question_to_gold[question] = doc.metadata.get("gold_answer", {})

    return question_to_gold


In [None]:
def RAG_base(questions,question_to_gold_map,mode='test'):
  """Run the baseline retrieve-then-generate loop over a list of questions.

  For each question, chunks are fetched with ``retrieve_page_content``, joined
  into the prompt, and sent to the model through ``get_API_response`` using the
  module-level ``temp`` and ``topp`` settings.

  Args:
    questions: Sequence of question strings.
    question_to_gold_map: Dict from stripped question text to gold answer;
      questions without an entry get ``""``.
    mode: ``'test'`` stops after the first question; any other value processes
      every question.

  Returns:
    A list with one dict per processed question, holding ``input_question``,
    ``retrieved_contexts`` (list of chunk texts), ``response`` and
    ``gold_answer``.
  """
  answer_NQ_client=OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

  run_results = []

  for i, question_curr in enumerate(questions):
    print(f"============={i+1} Question:{question_curr}=============")

    # retrieve_page_content returns None when nothing is found; fall back to an
    # empty list so the comprehension below does not raise TypeError.
    retrieved_docs = retrieve_page_content(question_curr) or []

    # The prompt needs plain text, not Document objects.
    relevant_content = [doc.page_content for doc in retrieved_docs]
    relevant_content_text = "\n\n".join(relevant_content)

    sys_prompt=""
    user_prompt=f"""
    Answer the question **directly and concisely** using only the provided context.
      - Do not repeat the question.
      - Do not include information not in the context.
      - If the answer is unclear or not found, say 'I don't have the answer.'

    Question: {question_curr}
    Relevant contents:{relevant_content_text}
    """
    response=get_API_response(answer_NQ_client,sys_prompt,user_prompt,temp,topp)

    print('-------Final Answer:-------------')
    print(f"Question: {question_curr}\n Response: {response}")

    # Collect this run's record; writing to disk is done by the caller.
    gold_answer = question_to_gold_map.get(question_curr.strip(), "")
    run_results.append({
      "input_question": question_curr,
      "retrieved_contexts": relevant_content,
      "response": response,
      "gold_answer": gold_answer
    })

    # 'test' mode is a smoke run: stop after the first question.
    if mode == 'test':
      break

  return run_results


In [None]:
# Collect the question of every loaded document, skipping documents whose
# metadata has no "question_text" key.
test_questions = [
    eachDoc.metadata["question_text"]
    for eachDoc in test_documents
    if "question_text" in eachDoc.metadata
]

print(test_questions)

['who sang the theme tune to absolutely fabulous', 'where did the annual wife carrying world championships take place', 'when was figure skating introduced to the olympics', 'who sings in the eye of the storm', 'who won the primary for governor of illinois', 'where did they film american horror story coven', 'what is the main job of the pharynx', "what film has the song don't you forget about me", 'where is archangel raphael mentioned in the bible', 'who sings the theme song to one tree hill', 'who fired the first shot of the civil war at fort sumter', 'who plays amy on the secret life of an american teenager', 'when is the second sound of the heartbeat produced', 'who plays chuck on the tv show chuck', 'which olsen twin was in full house more', 'how much money does argentina make from tourism', 'who did the french revolt against in 1789', 'who is the kicker for the new york giants', 'who played the mom in lost in space 2018', 'who recorded the song do you love me', 'how is the energy 

In [None]:
# Gold answers come from the documents loaded with metadata_func_2.
question_to_gold_map = build_question_to_gold_answer_map(test_documents_gold)

# mode='prod' (any value other than 'test') makes RAG_base process every
# question instead of stopping after the first one.
start = time.time()
baseline_run_results = RAG_base(test_questions,question_to_gold_map,mode='prod')
end = time.time()

print(f"RAG_base took {end - start:.4f} seconds to run.")



  results=base_retriever.get_relevant_documents(query)


[Document(id='37356ebd-9f87-4df9-9362-5c2896c16412', metadata={'Title': 'Untitled', 'question_text': 'who sang the theme tune to absolutely fabulous', 'start_index': 21026}, page_content="Dickson Wright </Li> </Ul> <H2> Theme song ( edit ) </H2> <P> The theme song for Absolutely Fabulous is `` This Wheel 's on Fire '' , written by Bob Dylan and Rick Danko and performed by Julie Driscoll and Saunders ' husband Adrian Edmondson . The song"), Document(id='d5e058b7-f426-474b-834d-52b081c7ee19', metadata={'Title': 'Untitled', 'question_text': 'who sang the theme tune to absolutely fabulous', 'start_index': 22231}, page_content="theme song , in 1994 , Pet Shop Boys recorded a song for Comic Relief using excerpts of dialogue from the series put to dance music . The single was attributed to `` Absolutely Fabulous produced by Pet Shop Boys '' . It peaked at number 6 in the UK"), Document(id='d0c0e10c-1be5-47e3-9ac3-b5c02857e6d8', metadata={'Title': 'Untitled', 'question_text': 'who sang the the

In [None]:
def save_test_output(run_results, output_path="./evaluation/run_results_baseline.json"):
  """Write the run results to a UTF-8 JSON file.

  Args:
    run_results: JSON-serialisable list of result records.
    output_path: Destination file; defaults to the baseline results path that
      was previously hard-coded.

  Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
  4-space indent. Prints a one-line confirmation when done.
  """
  # Create the target folder on first use; open() would otherwise raise
  # FileNotFoundError when it does not exist yet.
  parent_dir = os.path.dirname(output_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  with open(output_path, "w", encoding="utf-8") as f:
    json.dump(run_results, f, indent=4, ensure_ascii=False)

  print(f"Saved {len(run_results)} results to {output_path}")

In [None]:
# Persist the baseline run so it can be evaluated later.
save_test_output(baseline_run_results)