# Environment Setup

In [None]:
!pip install -U "langchain[openai]" langchain-core langgraph langchain-text-splitters langchain_community

In [None]:
!pip install faiss-cpu jq

In [None]:
!pip install -qU "langchain-chroma>=0.1.2"

In [None]:
# !pip install -qU langchain-openai

In [4]:
from openai import OpenAI
import json
import faiss
import re
import getpass
import os
import time
import shutil

In [None]:

from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import HTMLSectionSplitter
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from langchain.chains import create_extraction_chain_pydantic
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
from typing import Optional, List


In [6]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [7]:
# Ask for the OpenAI key interactively only when it is missing (or empty) in the
# environment, so the key never has to be typed into a notebook cell.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

Enter API key for OpenAI: ··········


In [29]:
# Colab-only: mount Google Drive so the project files are reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

# Work from the project folder; the data file and vector-store folder used later
# in this notebook are given as relative paths.
%cd  /content/drive/MyDrive/ECE1508_Project/Codes

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ECE1508_Project/Codes


# Helpers

## Load Test Doc

In [9]:
def metadata_func(example: dict, _: dict) -> dict:
    """Build the metadata dict for one JSON record.

    Args:
        example: The raw record; its ``question_text`` and ``title`` entries are read.
        _: Second positional argument supplied by the caller; it is not used.

    Returns:
        A dict with ``question_text`` (``None`` when the record has no such key)
        and ``Title`` (``"Untitled"`` when the record has no ``title`` key).
    """
    question = example.get("question_text")
    title = example.get("title", "Untitled")
    return {"question_text": question, "Title": title}

In [10]:
def load_documents(file_path):
  """Load the records of a JSON file as documents.

  Args:
    file_path: Path of the JSON file to read.

  Returns:
    Whatever ``JSONLoader.load()`` returns for the file: records are selected
    with the jq expression ``.[]``, the ``document_text`` field is used as the
    content, and ``metadata_func`` supplies the metadata.
  """
  json_loader = JSONLoader(
    file_path=file_path,
    jq_schema=".[]",
    content_key="document_text",
    metadata_func=metadata_func,
  )
  return json_loader.load()

## Vector Store Helpers

In [23]:
# def create_faiss_vec_store(elemnts_to_emb, folder_name):
#   embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
#   vectorstore = FAISS.from_documents(elemnts_to_emb, embedding=embeddings)
#   vectorstore.save_local(folder_name)
#   return vectorstore

## Retriever Helpers

In [13]:
def retrieve_section(in_retriever, query, top_k=None):
  """Fetch the documents a retriever considers relevant to ``query``.

  Args:
    in_retriever: Retriever object. Its ``invoke`` method is used when it has
      one; otherwise the older ``get_relevant_documents`` method is called.
    query: Query string handed to the retriever.
    top_k: Optional cap on the number of results returned. ``None`` (the
      default) returns everything the retriever produced.

  Returns:
    The retrieved documents (at most ``top_k`` of them), or ``None`` when
    there is nothing to return.

  Raises:
    ValueError: If ``top_k`` is negative.
  """
  if top_k is not None and top_k < 0:
    raise ValueError("top_k must be non-negative")

  # `get_relevant_documents` is deprecated in recent langchain-core releases in
  # favour of `invoke`; fall back to it only for objects without `invoke`.
  fetch = getattr(in_retriever, "invoke", None)
  if fetch is None:
    fetch = in_retriever.get_relevant_documents

  results = fetch(query)
  if not results:
    return None

  if top_k is not None:
    results = results[:top_k]

  # A cap of 0 leaves an empty list; report that the same way as "no results".
  return results or None


In [14]:
def get_retrieve_section(in_retriever,in_query,top_k):
  """Run the retriever for a query and return at most ``top_k`` results.

  Args:
    in_retriever: Retriever object forwarded to ``retrieve_section``.
    in_query: Query string to look up.
    top_k: Maximum number of results to return; ``None`` disables the cap.

  Returns:
    The retrieved results limited to ``top_k`` items, or ``None`` when
    ``retrieve_section`` returns ``None``.
  """
  print(f"Retrieving answer for query: {in_query}")
  # Pass only the retriever and the query: forwarding top_k as a third
  # positional argument raised TypeError. The cap is applied here instead.
  relevant_sections = retrieve_section(in_retriever, in_query)
  if relevant_sections is not None and top_k is not None:
    relevant_sections = relevant_sections[:top_k]
  return relevant_sections


## Level 1 Helpers

**Document Chunking**

In [18]:
def get_element_chunk(split_header_list,doc_to_chunk):
  """Split one HTML document into sections along the given headers.

  Args:
    split_header_list: Header specification passed to ``HTMLSectionSplitter``
      as ``headers_to_split_on``.
    doc_to_chunk: Document whose ``page_content`` (HTML) is split; its
      ``metadata`` is carried over to the Document handed to the splitter.

  Returns:
    The result of ``HTMLSectionSplitter.split_documents`` for this document.
  """
  # Hand the splitter a fresh Document holding the same content and metadata.
  wrapped_html = Document(
    page_content=doc_to_chunk.page_content,
    metadata=doc_to_chunk.metadata,
  )
  section_splitter = HTMLSectionSplitter(headers_to_split_on=split_header_list)
  return section_splitter.split_documents([wrapped_html])


# Level 1 Chunking & Save to Vector Store


In [46]:
# Vector store folder path
# Re-check the contents of the working directory before relying on relative paths
!ls
# Folder (relative to the working directory) where the L1 vector store is persisted
L1_vector_folder = 'L1_vector_final'
# Embedding model used both to build the store and to load it back
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


 Archive		 Evaluation.ipynb	        Proposition_Light.ipynb
 Baseline.ipynb		 gold_test_file_30.json         Proposition_Sample.ipynb
 Baseline_Use_L1.ipynb	'L1_Process_Chunk&Save.ipynb'   rag_sw_ver2.ipynb
 Baseline_vector	 L1_vector		        rag_sw_ver3.ipynb
 dense_pack		 L2_vector_prop		        test_single_doc.json
 evaluation		 Proposition_Complete.ipynb


In [32]:
# Load the test documents; the printed count is a quick sanity check on the load.
file_path="gold_test_file_30.json"
test_documents = load_documents(file_path)
print(f"{len(test_documents)} Documents")


30 Documents


## Step 1: L1 basic chunking

Chunk all documents in the input json and save into vector database

In [47]:
def L1_process_document(doc, headers_to_split_on=None, persist_directory=None, embedding=None):
  """Chunk documents by HTML headers and persist the chunks in a Chroma store.

  Args:
    doc: Sized collection of documents; each one is passed to
      ``get_element_chunk``.
    headers_to_split_on: Optional list of ``(tag, label)`` pairs that define
      the split points. Defaults to h1, h2 and h3.
    persist_directory: Optional folder for the Chroma store. Defaults to the
      notebook-level ``L1_vector_folder``.
    embedding: Optional embedding function. Defaults to the notebook-level
      ``embeddings``.

  Returns:
    The Chroma vector store built from all chunks.

  Raises:
    ValueError: If chunking produced no chunks. Any store already present in
      ``persist_directory`` is left untouched in that case.
  """
  if headers_to_split_on is None:
    headers_to_split_on = [
      ("h1", "Header 1"),
      ("h2", "Header 2"),
      ("h3", "Header 3"),
    ]

  all_chunks = []
  for idx, eachDoc in enumerate(doc):
    chunks = get_element_chunk(headers_to_split_on, eachDoc)
    all_chunks.extend(chunks)
    print(f"Split document {idx+1} into {len(chunks)} sub-documents.")

  # Validate before the destructive step below: with nothing to embed, the
  # previously persisted store must not be deleted.
  if not all_chunks:
    raise ValueError("No chunks were produced; the existing vector store was left unchanged.")

  # Resolve the notebook-level defaults at call time.
  if persist_directory is None:
    persist_directory = L1_vector_folder
  if embedding is None:
    embedding = embeddings

  # Remove any previous store folder first — presumably so chunks from an
  # earlier run are not kept alongside the new ones.
  if os.path.exists(persist_directory):
    shutil.rmtree(persist_directory)
    # NOTE(review): one-second pause after the delete is kept from the original;
    # the reason is not visible here — confirm whether it is still needed.
    time.sleep(1)

  # Embed and save to Chroma DB
  L1_vectorstore = Chroma.from_documents(documents=all_chunks,
                                         embedding=embedding,
                                         persist_directory=persist_directory)

  print(f"{len(doc)} documents {len(all_chunks)} chunks sucessfully processed and saved to {persist_directory}")

  return L1_vectorstore



In [48]:
# Chunk, embed and persist all loaded test documents (any existing store folder is replaced).
L1_vectorstore = L1_process_document(test_documents)

Split document 1 into 33 sub-documents.
Split document 2 into 20 sub-documents.
Split document 3 into 21 sub-documents.
Split document 4 into 7 sub-documents.
Split document 5 into 47 sub-documents.
Split document 6 into 41 sub-documents.
Split document 7 into 24 sub-documents.
Split document 8 into 19 sub-documents.
Split document 9 into 19 sub-documents.
Split document 10 into 13 sub-documents.
Split document 11 into 31 sub-documents.
Split document 12 into 12 sub-documents.
Split document 13 into 23 sub-documents.
Split document 14 into 42 sub-documents.
Split document 15 into 16 sub-documents.
Split document 16 into 38 sub-documents.
Split document 17 into 91 sub-documents.
Split document 18 into 13 sub-documents.
Split document 19 into 27 sub-documents.
Split document 20 into 15 sub-documents.
Split document 21 into 33 sub-documents.
Split document 22 into 13 sub-documents.
Split document 23 into 25 sub-documents.
Split document 24 into 22 sub-documents.
Split document 25 into 39 

In [49]:
# Test load: reopen the persisted store from disk and report how many entries it holds.
print(L1_vector_folder)
L1_vectorstore_testload = Chroma(
    embedding_function=embeddings,
    persist_directory=L1_vector_folder,
)
# Verify the load by counting entries in the underlying collection.
total_docs = L1_vectorstore_testload._collection.count()
print(
    f"Vectorstore contains {total_docs} documents"
    if total_docs > 0
    else "Vectorstore is empty"
)

L1_vector_final
Vectorstore contains 808 documents
