# Environment Setup

In [None]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
!pip install -qU "langchain[openai]" # select chat model OpenAI
!pip install -U langchain langchain-core langchain-community

In [None]:
!pip install -qU langchain-openai # select embeddings model OpenAI
!pip install -qU langchain-community # select vector store FAISS
!pip install jq
!pip install faiss-cpu

In [3]:
from openai import OpenAI
import json
import faiss
import re
import getpass
import os

In [None]:

from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import HTMLSectionSplitter
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from langchain.chains import create_extraction_chain_pydantic
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
from typing import Optional, List


In [5]:
# Prompt for the OpenAI key only when the environment does not already hold a
# non-empty value, so the secret is never written into the notebook itself.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

Enter API key for OpenAI: ··········


In [6]:
# Colab-only: mount Google Drive so files stored there are visible to this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Switch to the project folder: later cells refer to the data file and the
# vector-store folder by relative path.
%cd  /content/drive/MyDrive/ECE1508_Project/Codes

Mounted at /content/drive
/content/drive/MyDrive/ECE1508_Project/Codes


# Helpers

## Load Test Doc

In [33]:
def metadata_func(example: dict, _: dict) -> dict:
    """Build the metadata dict for one JSON record.

    Used as the ``metadata_func`` callback of ``JSONLoader``.

    Args:
        example: One raw record from the input JSON.
        _: Second argument supplied by the caller; ignored here.

    Returns:
        A new dict holding the record's ``document_url``, ``question_text``,
        ``annotations``, ``example_id`` and ``gold_answer`` (``None`` for any
        key the record lacks), plus ``Title`` copied from the record's
        ``title`` key (``"Untitled"`` when that key is absent).
    """
    copied_keys = (
        "document_url",
        "question_text",
        "annotations",
        "example_id",
        "gold_answer",
    )
    metadata = {key: example.get(key) for key in copied_keys}
    # The output key is capitalised while the source key is lowercase.
    metadata["Title"] = example.get("title", "Untitled")
    return metadata

In [34]:
def load_documents(file_path):
    """Load the records of a JSON file as documents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The result of ``JSONLoader.load()``: one document per element
        selected by the jq schema ``.[]``, with its content taken from the
        ``document_text`` key and its metadata produced by ``metadata_func``.
    """
    return JSONLoader(
        file_path=file_path,
        jq_schema=".[]",
        content_key="document_text",
        metadata_func=metadata_func,
    ).load()

## Vector Store Helpers

In [35]:
def create_faiss_vec_store(elemnts_to_emb, folder_name):
    """Embed documents into a FAISS vector store and save it to disk.

    Args:
        elemnts_to_emb: Documents to embed and index.
        folder_name: Folder passed to ``save_local`` for the saved index.

    Returns:
        The FAISS vector store built from ``elemnts_to_emb``.
    """
    embedder = OpenAIEmbeddings()
    store = FAISS.from_documents(elemnts_to_emb, embedding=embedder)
    store.save_local(folder_name)
    return store

## Retriever Helpers

In [36]:
# retrieve the top K contents
def retrieve_section(in_retriever, query, top_k):
    """Return the first ``top_k`` documents the retriever finds for ``query``.

    Args:
        in_retriever: Retriever object. Its ``invoke`` method is used when
            present; otherwise the legacy ``get_relevant_documents`` method
            is called.
        query: Query string handed to the retriever.
        top_k: Maximum number of results to keep (applied as ``results[:top_k]``).

    Returns:
        A list with at most ``top_k`` results in the retriever's order, or
        ``None`` when the retriever returns nothing.
    """
    # `get_relevant_documents` is deprecated in LangChain in favour of
    # `invoke`; keep it only as a fallback for retrievers without `invoke`.
    fetch = getattr(in_retriever, "invoke", None)
    if fetch is None:
        fetch = in_retriever.get_relevant_documents

    results = fetch(query)
    if not results:
        return None
    return results[:top_k]


In [11]:
# Run the retriever for the input query
def get_retrieve_section(in_retriever, in_query, top_k):
    """Print the query being looked up, then retrieve its top matches.

    Args:
        in_retriever: Retriever forwarded to ``retrieve_section``.
        in_query: Query string to look up.
        top_k: Maximum number of results to return.

    Returns:
        Whatever ``retrieve_section`` returns for these arguments.
    """
    print(f"Retrieving answer for query: {in_query}")
    return retrieve_section(in_retriever, in_query, top_k)


## Level 1 Helpers

**Document Chunking**

In [37]:
def get_element_chunk(split_header_list, doc_to_chunk):
    """Split one document's HTML content into sections at the given headers.

    Args:
        split_header_list: Value passed to ``HTMLSectionSplitter`` as
            ``headers_to_split_on``.
        doc_to_chunk: Object exposing ``page_content`` and ``metadata``.

    Returns:
        The result of ``HTMLSectionSplitter.split_documents`` for this document.
    """
    # Give the splitter a fresh Document that carries the same content and
    # metadata as the input.
    source_copy = Document(
        page_content=doc_to_chunk.page_content,
        metadata=doc_to_chunk.metadata,
    )
    section_splitter = HTMLSectionSplitter(headers_to_split_on=split_header_list)
    return section_splitter.split_documents([source_copy])


# Level 1 Chunking & Save to Vector Store


In [38]:
# Vector store folder path
# List the working directory to confirm we are in the project folder
!ls
# Folder where the Level 1 FAISS index is saved (read by L1_process_document)
L1_vector_folder = 'L1_vector_test'

gold_test_file_30.json	L1_vector_test		    Proposition_Sample.ipynb
L1_Processing.ipynb	L2_vector_prop		    rag_sw_ver2.ipynb
L1_vector		Proposition_Complete.ipynb  test_single_doc.json


In [39]:
# Load the test set (path is relative to the current working directory) and
# report how many documents were read.
file_path="gold_test_file_30.json"
test_documents = load_documents(file_path)
print(f"{len(test_documents)} Documents")

30 Documents


## Step 1: L1 basic chunking

Chunk all documents in the input JSON file and save the chunks into the vector database.

In [46]:
def L1_process_document(doc):
    """Chunk every document by HTML header and store all chunks in FAISS.

    Args:
        doc: Sized collection of documents; each one is split with
            ``get_element_chunk`` on ``h1``/``h2``/``h3`` headers.

    Returns:
        The vector store returned by ``create_faiss_vec_store``, which is
        also saved under the module-level ``L1_vector_folder``.
    """
    header_levels = [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]

    collected = []
    for idx, single_doc in enumerate(doc):
        pieces = get_element_chunk(header_levels, single_doc)
        collected += pieces
        print(f"Split document {idx+1} into {len(pieces)} sub-documents.")

    # Embed the chunks from every document together and save the resulting index.
    store = create_faiss_vec_store(collected, L1_vector_folder)

    print(f"{len(doc)} documents sucessfully processed and saved to {L1_vector_folder}")

    return store



In [47]:
# Chunk all loaded test documents, embed the chunks with OpenAIEmbeddings, and
# save the resulting FAISS index to L1_vector_folder.
L1_vectorstore = L1_process_document(test_documents)

Split document 1 into 14 sub-documents.
Split document 2 into 13 sub-documents.
Split document 3 into 18 sub-documents.
Split document 4 into 21 sub-documents.
Split document 5 into 14 sub-documents.
Split document 6 into 20 sub-documents.
Split document 7 into 26 sub-documents.
Split document 8 into 12 sub-documents.
Split document 9 into 25 sub-documents.
Split document 10 into 19 sub-documents.
Split document 11 into 42 sub-documents.
Split document 12 into 16 sub-documents.
Split document 13 into 8 sub-documents.
Split document 14 into 29 sub-documents.
Split document 15 into 43 sub-documents.
Split document 16 into 37 sub-documents.
Split document 17 into 27 sub-documents.
Split document 18 into 78 sub-documents.
Split document 19 into 18 sub-documents.
Split document 20 into 18 sub-documents.
Split document 21 into 15 sub-documents.
Split document 22 into 29 sub-documents.
Split document 23 into 37 sub-documents.
Split document 24 into 58 sub-documents.
Split document 25 into 12 