In [None]:
!docker run --name pgvector-container -e POSTGRES_USER=langchain -e POSTGRES_PASSWORD=langchain -e POSTGRES_DB=langchain -p 6024:5432 -d pgvector/pgvector:pg16

Step 1, extract data from PDF using lamma parse

In [1]:
import nest_asyncio
from dotenv import load_dotenv
import os
nest_asyncio.apply()
load_dotenv()

True

In [11]:
# bring in deps
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

# set up parser
parser = LlamaParse(
    result_type="markdown"  # "markdown" and "text" are available
)

# use SimpleDirectoryReader to parse our file
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=['data/Constitution_of_the_Republic_of_Singapore.pdf'], file_extractor=file_extractor).load_data()
print(documents)

Started parsing the file under job_id 8512c45a-70af-4b75-b5c2-8363a304a18d
[Document(id_='a693e0e6-ffe5-4913-a41b-7d2abcffc865', embedding=None, metadata={'file_path': 'data/Constitution_of_the_Republic_of_Singapore.pdf', 'file_name': 'Constitution_of_the_Republic_of_Singapore.pdf', 'file_type': 'application/pdf', 'file_size': 602809, 'creation_date': '2025-01-27', 'last_modified_date': '2025-01-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='# Constitution of the Republic of Singapore\n\n# Table of Contents\n\n# Part 1 PRELIMINARY\n\n1. Citation\n2. Interpretation\n\n# Part 2 THE REPUBLIC AND THE CONSTITUTION\n\n1. Republic of Singa

Use openai embedding to save

In [2]:
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [3]:
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

# See docker command above to launch a postgres instance with pgvector enabled.
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"  # Uses psycopg3!
collection_name = "my_docs"

vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [14]:
docs = list()
for doc in documents:
    docs.append(Document(
        page_content=doc.text,
        metadata={
            "id": doc.id_
        }
    ))

In [15]:
import pickle

# Assuming 'docs' is the variable containing the documents you want to save
with open('docs.pkl', 'wb') as file:
    pickle.dump(docs, file)


In [6]:
import pickle

# Assuming 'docs' is the variable containing the documents you want to save
with open('data/constitution.pkl', 'rb') as file:
    docs = pickle.load(file)

EOFError: Ran out of input

In [None]:
vector_store.add_documents(docs, ids=[doc.metadata["id"] for doc in docs])

In [7]:
def retrieve_top_k(query, vector_db, k=3):
    return vector_db.similarity_search(query, k=k)

In [None]:
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

In [58]:
from langgraph.prebuilt import create_react_agent
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

def decide_action(query, chat_model):
    """
    Uses an LLM to decide whether to use tools or rely on RAG.
    Returns either 'USE_TOOL' or 'USE_RAG'.
    """
    decision_prompt = f"""
    You are an intelligent assistant specializing in legal queries. Your job is to decide whether the user's query \
    requires an external search using tools (e.g., searching for the latest case laws or regulations) or can be \
    answered using the retrieved legal documents provided.
    
    Based on the query below, respond with either "USE_TOOL" or "USE_RAG" and provide a brief justification.
    
    Query: {query}
    
    Decision (USE_TOOL/USE_RAG):
    """
    decision_response = chat_model.invoke([HumanMessage(content=decision_prompt)])
    decision_text = decision_response.content.strip().split("\n")[0]
    return decision_text

def get_chatgpt_response(query, retrieved_docs):
    # Initialize model and tools
    chat_model = ChatOpenAI(temperature=0)
    search = TavilySearchResults(max_results=2)
    tools = [search]

    # Step 1: Decide action using LLM
    action = decide_action(query, chat_model)
    print(f"Decision: {action}")

    if action == "USE_TOOL":
        agent_executor = create_react_agent(chat_model, tools)
        # Use tools to perform external search
        response = agent_executor.invoke({"messages": [HumanMessage(content=query)]})
        print(f"Using Tool Response: {response}")
        return response
    elif action == "USE_RAG":
        # Use RAG (retrieved documents)
        context = "\n".join([doc.page_content for doc in retrieved_docs])
        rag_prompt = f"""
        You are a highly skilled legal expert specializing in Singaporean law, with extensive experience in drafting contracts, \
        interpreting legislation, and providing sound legal advice. Using the following retrieved legal context, provide a clear, \
        concise, and accurate response to the user's query. Where necessary, reference specific clauses or legal principles mentioned in the context.
        
        Context:
        {context}
        
        Question:
        {query}
        
        Answer:
        """
        response = chat_model.invoke([HumanMessage(content=rag_prompt)])
        print(f"Using RAG Response: {response.content}")
        return response
    else:
        raise ValueError(f"Invalid decision: {action}")

In [None]:
query = "Use the internet, tell me what is the weather"

# Step 3: Retrieve top-k relevant documents
retrieved_docs = retrieve_top_k(query, vector_store, k=3)

# Step 4: Get ChatGPT response
response = get_chatgpt_response(query, retrieved_docs)

print("ChatGPT Response:")
print(response)

Decision: USE_TOOL
Using Tool Response: {'messages': [HumanMessage(content='Use the internet, tell me what is the weather', additional_kwargs={}, response_metadata={}, id='77ee37f4-4ac3-4e01-9b15-d5ace27766e3'), AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_42ZtWHgJXgXarYVckMyar52g', 'function': {'arguments': '{"query":"current weather"}', 'name': 'tavily_search_results_json'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 91, 'total_tokens': 111, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-39105fd8-884c-4ff6-82d3-59858a9717fc-0', tool_calls=[{'name': 'tavily_search_results_json', 'args': {'query': 'current weather