In [None]:
# Colab setup: install the notebook's dependencies (quiet, upgrade-to-latest).
# NOTE(review): `langchain` and `langchain-chroma` are each requested twice, and
# every package is unpinned with -U, so each run resolves to whatever is newest.
# The notebook imports legacy module paths (e.g. `langchain.text_splitter`,
# `langchain.embeddings`, `langchain_core.pydantic_v1`) -- confirm they still
# exist in the resolved versions, or pin known-good versions here.
!pip install -qU  langchain langchain_core huggingface_hub kagglehub chromadb langchain-groq
!pip install -qU  langchain langchain_community langchain_huggingface langchain-chroma taipy
!pip install -qU  gradio pypdf tiktoken sentence_transformers langgraph pandas matplotlib jq
!pip install -qU "langchain-chroma>=0.1.2" pinecone langchain_pinecone

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from google.colab import userdata
import os
def _export_colab_secret(env_name, secret_name):
    """Copy the Colab secret `secret_name` into the environment as `env_name`."""
    os.environ[env_name] = userdata.get(secret_name)


# Credentials come from Colab's secret store so no key is written in the notebook.
# NOTE(review): no TAVILY_API_KEY is exported here although web_search() uses
# TavilySearchResults -- presumably it must be provided some other way; confirm.
_export_colab_secret("PINECONE_API_KEY", 'pinecone_key')
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
_export_colab_secret("LANGCHAIN_API_KEY", 'langchai_api_key')
os.environ["LANGCHAIN_PROJECT"] = 'Law GPT'
_export_colab_secret("GROQ_API_KEY", 'groq_api_key')
_export_colab_secret("HUGGINGFACEHUB_API_TOKEN", 'HF_TOKEN')

# Silence all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [None]:
import kagglehub
import os

# Download the latest version of the dataset; kagglehub returns the local
# directory that holds the files.
path = kagglehub.dataset_download("ayeshajadoon/pakistan-law-data")

print("Path to dataset files:", path)

# os.listdir() yields entries in arbitrary order. Later cells index and slice
# `file_paths` positionally (file_paths[0], file_paths[:30]), so sort the names
# to make those selections reproducible across runs and machines.
files = sorted(os.listdir(path))
print("Files in dataset directory:", files)


file_paths = [os.path.join(path, file) for file in files]
print("Full paths to dataset files:", file_paths)

In [None]:
import json
import pandas as pd
import os

# Preview the dataset's JSON file. The file is selected by extension rather than
# by position, because nothing guarantees that the first directory entry is the
# JSON file.
json_candidates = [p for p in file_paths if p.lower().endswith(".json")]

if json_candidates:
    json_file_path = json_candidates[0]

    # Load the JSON file into a pandas DataFrame
    df = pd.read_json(json_file_path)

    # Display the head of the DataFrame
    display(df.head())
elif file_paths:
    print(f"No JSON file found among: {file_paths}")
else:
    print("No files found in file_paths.")


In [None]:
from langchain_community.document_loaders import JSONLoader, CSVLoader, PyPDFLoader

def data_loader(file_paths):
    """Load every supported file into a flat list of LangChain documents.

    Args:
        file_paths: Either one path (``str`` or ``os.PathLike``) or an iterable
            of paths. Supported extensions, matched case-insensitively, are
            ``.csv``, ``.json`` and ``.pdf``.

    Returns:
        list: The documents returned by each loader's ``load()``, concatenated
        in input order. Unsupported files are reported on stdout and skipped.
    """
    # A bare string is itself iterable: without this guard a single path would
    # be walked character by character and every character reported as an
    # unsupported file.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    all_documents = []
    for file_path in file_paths:
        file_path = os.fspath(file_path)
        lowered = file_path.lower()  # so ".PDF" / ".Json" are recognised too
        if lowered.endswith('.csv'):
            loader = CSVLoader(file_path)
        elif lowered.endswith('.json'):
            # jq_schema='.' selects the whole JSON document; text_content=False
            # lets non-string content through.
            loader = JSONLoader(file_path, jq_schema='.', text_content=False)
        elif lowered.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        else:
            print(f"Unsupported file type: {file_path}")
            continue
        all_documents.extend(loader.load())
    return all_documents

# Smoke-test the loader on the first 30 files. Only a summary is printed:
# leaving the call as the cell's last expression would dump the repr of every
# loaded document into the notebook output.
sample_documents = data_loader(file_paths[:30])
print(f"Loaded {len(sample_documents)} documents from {len(file_paths[:30])} files")

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Chunking configuration read by text_splitter() at call time.
chunk_size = 500
chunk_overlap = 50


def text_splitter(data):
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split (whatever ``split_documents`` accepts).

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` configured
        with the module-level ``chunk_size`` and ``chunk_overlap``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

In [None]:
## Vector db
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from pinecone import Pinecone
from uuid import uuid4


# Pinecone client shared by vector_database(). The key is read from the
# environment; it is None when PINECONE_API_KEY is not set.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

def vector_database(chunks, upsert=False):
    """Open the ``lawindex`` Pinecone index as a LangChain vector store.

    Args:
        chunks: Document chunks that may be uploaded to the index.
        upsert: When True (and ``chunks`` is non-empty) the chunks are embedded
            and added to the index. Defaults to False, which matches the
            previous behaviour where the upload call was commented out and the
            index was only opened for querying.

    Returns:
        PineconeVectorStore bound to the index and the MiniLM embedding model.
    """
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    index = pc.Index("lawindex")
    vector_store = PineconeVectorStore(index=index, embedding=embeddings)
    # Uploading is opt-in so that re-running the notebook does not add the same
    # chunks to the index again.
    if upsert and chunks:
        vector_store.add_documents(documents=chunks)
    return vector_store

In [None]:
## Retriever
def Retriever(file):
    """Build the similarity-threshold retriever used by the graph.

    Args:
        file: Forwarded unchanged to ``data_loader``.

    Returns:
        The object returned by ``as_retriever`` on the store that
        ``vector_database`` produces, configured to return up to 5 matches
        with a score threshold of 0.4.
    """
    chunks = text_splitter(data_loader(file))
    return vector_database(chunks).as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 5, "score_threshold": 0.4},
    )

# Build the shared retriever once; the `retrieve` graph node reads this global.
retriever = Retriever(file_paths)

In [None]:
from typing import Any, Dict, TypedDict

from langchain_core.messages import BaseMessage

class AgentState(TypedDict):
  """Graph state: one ``keys`` dict that each node reads and returns anew."""

  # `Any` from typing is the "any value" annotation; the lowercase builtin
  # `any` is a function, not a type.
  keys: Dict[str, Any]

In [None]:
from re import template
from langchain import hub
from langchain_core.output_parsers import StrOutputParser , PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel , Field
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.tools import tool
from langchain.output_parsers.openai_tools import PydanticToolsParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.chat_models import init_chat_model

# Chat model shared by the generation, grading and query-rewriting nodes below,
# resolved through the "groq" provider.
# NOTE(review): presumably authenticated via the GROQ_API_KEY exported in the
# credentials cell -- confirm.
llm = init_chat_model("deepseek-r1-distill-llama-70b", model_provider="groq")

#nodes

def retrieve(state):
  """Graph node: fetch documents for the question from the global retriever.

  Args:
      state: Graph state whose ``keys`` dict contains ``question``.

  Returns:
      A new state whose ``keys`` hold only ``documents`` and ``question``.
  """
  print("----RETRIEVE----")
  state_dict = state["keys"]
  question = state_dict["question"]
  # `invoke` is the Runnable entry point for retrievers; it replaces the
  # deprecated `get_relevant_documents`, which is not guaranteed to exist in
  # the unpinned releases this notebook installs.
  documents = retriever.invoke(question)
  return {"keys": {"documents": documents , "question": question}}



# Attach File

def attach_file(state, user_file):
    """Append the text of a user-supplied file to the state's documents.

    Args:
        state: Graph state whose ``keys`` dict contains ``question`` and,
            optionally, ``documents``.
        user_file: One path (``str`` / ``os.PathLike``) or an iterable of paths.

    Returns:
        A new state whose ``keys`` hold ``documents`` (the previous documents
        plus one document with the file's combined text) and ``question``.
        Nothing is appended when the file yields no content.
    """
    print("----File Attachment----")
    state_dict = state["keys"]
    question = state_dict["question"]
    # Copy so the caller's list is not mutated, and tolerate a state that has
    # no documents yet.
    documents = list(state_dict.get("documents") or [])

    # data_loader iterates over its argument, so a bare path must be wrapped;
    # otherwise the string would be walked character by character.
    if isinstance(user_file, (str, os.PathLike)):
        user_file = [os.fspath(user_file)]

    file_content = data_loader(user_file)
    if file_content:
        file_text = " ".join(doc.page_content for doc in file_content)
        documents.append(Document(page_content=file_text))
    return {"keys": {"documents": documents , "question": question}}

def generation(state):
  """Graph node: answer the question from the documents held in the state.

  Args:
      state: Graph state whose ``keys`` dict contains ``question`` and
          ``documents`` (objects exposing ``page_content``).

  Returns:
      A new state whose ``keys`` hold ``generation`` (the model's answer as a
      string), ``documents`` and ``question``.
  """
  print("----GENERATION----")
  state_dict = state["keys"]
  question = state_dict["question"]
  documents = state_dict["documents"]

  #prompt
  # The ```legal fence and the final quoted caveat are closed before the
  # "User Query" section so the query and the retrieved text are not read as
  # part of the response-structure example.
  prompt = ChatPromptTemplate.from_template("""You are LAW GPT, a legal research assistant with a distinctive scholarly persona. Adopt these characteristics:

      1. **Persona**: A 19th-century legal scholar reincarnated as an AI. Use subtle Victorian-era formalism mixed with modern clarity. Signature phrases: "In the matter of...", "Wherefore we observe...", "The jurisprudence suggests..."

      2. **Information Integration**:
        - Always ground responses in retrieved legal documents. Add refrences from Pakistan Law acts
        - Cite sources using Bluebook-style abbreviations (e.g., 347 U.S. 483) as much  possible
        - When referencing statutes, include: <Jurisdiction> <Code> § <Section> (<Year>)

      3. **Response Structure**:
        ```legal
        [Emblematic Header]
        (e.g., "IN THE MATTER OF [USER'S QUERY BRIEF]")

        [Context Bridge]
        Connect query to historical legal evolution ("This question echoes the doctrinal shift in...")

        [Retrieved Authority]
        Present 2-3 most relevant provisions with pinpoint citations:
        • <Source 1> [Relevance explanation]
        • <Source 2> [Contrasting interpretation]

        [Modern Application]
        Apply principles to user's specific circumstances with hypothetical:
        "Consider a scenario where [user's situation]... Under [Cited Authority], the outcome would likely turn on..."

        [Scholarly Caveat]
        "We note jurisprudential tension in..." + "Consult local counsel regarding..."
        ```

      User Query:
      {question}

      Retrieved Legal Documents:
      {context}
      """
  )

  # Give the model the documents' text only. Passing the list itself would
  # inject the Python repr of every Document object into the prompt.
  context = "\n\n".join(doc.page_content for doc in documents)

  rag_chain = prompt | llm | StrOutputParser()
  #generation
  answer = rag_chain.invoke({"context": context , "question": question})
  return {"keys": {"generation": answer , "documents" : documents , "question": question}}

def grade_documents(state):
  """Graph node: keep only the documents the model grades as relevant.

  Args:
      state: Graph state whose ``keys`` dict contains ``question`` and
          ``documents`` (objects exposing ``page_content``).

  Returns:
      A new state whose ``keys`` hold ``documents`` (those graded "yes"),
      ``question`` and ``search``. ``search`` is "yes" when at least one
      document was graded not relevant or when no document survived grading,
      otherwise "no".
  """
  print("----Check Relevance----")
  state_dict= state["keys"]
  question = state_dict["question"]
  documents = state_dict["documents"]

  class grade(BaseModel):
    """ check the relevance documents"""

    binary_score: str = Field(
        description=("Check binary score 'yes' or 'no' ")
    )

  #prompt
  prompt = PromptTemplate(
      template= """You are a grader accessing relevance of retrieved documents to the user question.\n
      Here is the retrieved documnets:\n{context}.\n
      Here is the user Question:{question}.\n
      If the documents contain keyword(s) or semantic meaning relative to the user question ,grade them as relevant.\n
      Give a relevance score 'yes' or 'no' score for all documents to indicate that weather all documents are relevant.\n
      your response should be in json format:""",
      input_variables=["context" , "question"]
  )

  chain= prompt | llm.with_structured_output(grade,include_raw=True)

  search = "no"
  filtered_docs=[]
  for d in documents:
    result = chain.invoke({"context": d.page_content , "question": question})
    # With include_raw=True the chain returns a dict with "raw", "parsed" and
    # "parsing_error" keys, not a tuple; the graded object is under "parsed"
    # (None when parsing failed). A bare object is accepted as well.
    parsed = result.get("parsed") if isinstance(result, dict) else result
    verdict = getattr(parsed, "binary_score", None)
    if verdict is None:
      # Structured output could not be parsed for this document.
      print("----Could not parse relevance score, skipping document")
      continue
    # Tolerate "Yes" / " yes " style answers.
    if str(verdict).strip().lower() == "yes":
      filtered_docs.append(d)
    else:
      print("----Documents are not relevant")
      search = "yes"

  # If retrieval returned nothing, or nothing could be graded as relevant,
  # fall back to the web-search branch rather than generating without context.
  if not filtered_docs:
    search = "yes"
  return {"keys": {"documents": filtered_docs , "question": question , "search": search}}

In [None]:
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.schema import Document

def translate_query(state):
    """Graph node: ask the model to rephrase the question for retrieval.

    Args:
        state: Graph state whose ``keys`` dict contains ``question`` and
            ``documents``.

    Returns:
        A new state whose ``keys`` hold the rewritten ``question`` and the
        unchanged ``documents``.
    """
    print("----Translate Query----")
    keys = state["keys"]
    original_question, documents = keys["question"], keys["documents"]

    rewrite_prompt = PromptTemplate(
        template=""" You are generating question that is well optimized for retrieval.\n
      Look at the input and try to reason about underlying semantic intent / meanings.\n
      Here is the initial question:
      \n-------\n
      {question}
      \n-------\n
      Formulate an improved question:""",
        input_variables=["question"],
    )

    # prompt -> model -> plain string
    rewriter = rewrite_prompt | llm | StrOutputParser()
    improved_question = rewriter.invoke({"question": original_question})
    return {"keys": {"question": improved_question, "documents": documents}}

def web_search(state):
  """Graph node: add one web-search result document to the state.

  Args:
      state: Graph state whose ``keys`` dict contains ``question`` and
          ``documents``.

  Returns:
      A new state whose ``keys`` hold ``documents`` (a copy of the previous
      list, plus one document with the joined result snippets when the search
      produced any) and ``question``.
  """
  print("----Web Search----")
  state_dict = state["keys"]
  question = state_dict["question"]
  # Work on a copy so the incoming state's list is not mutated in place.
  documents = list(state_dict["documents"])

  tavily = TavilySearchResults(max_results=1)
  tavily_search= tavily.invoke(question)

  # NOTE(review): the tool appears to report failures by returning the error
  # text instead of a list of result dicts; indexing that string with
  # ["content"] would raise. Treat a string as "no results".
  if isinstance(tavily_search, str):
    print(f"----Web search returned no usable results: {tavily_search}")
    return {"keys": {"documents": documents , "question": question}}

  snippets = [d["content"] for d in tavily_search if isinstance(d, dict) and d.get("content")]
  if snippets:
    documents.append(Document(page_content="\n".join(snippets)))
  return {"keys": {"documents": documents , "question": question}}

def decide(state):
  """Conditional edge: choose the branch that follows document grading.

  Args:
      state: Graph state whose ``keys`` dict contains ``search``.

  Returns:
      "translate" when ``search`` is "yes", otherwise "generation".
  """
  print("----Decide----")
  # Only the `search` flag drives the decision; the question and documents are
  # not needed here, so they are no longer looked up.
  search = state["keys"]["search"]

  if search == "yes":
    print("----DECISION: Translate Query and Search the Web")
    return "translate"
  else:
    print("----DECISION: Generation")
    return "generation"



In [None]:
from langgraph.graph import StateGraph ,END

graph = StateGraph(AgentState)

# Register every node under the name the edges below refer to.
for node_name, node_fn in (
    ("retrieve", retrieve),
    ("generation", generation),
    ("grade_documents", grade_documents),
    ("translate_query", translate_query),
    ("web_search", web_search),
):
    graph.add_node(node_name, node_fn)

# Flow: retrieve -> grade_documents -> (translate_query -> web_search ->)? generation -> END
graph.set_entry_point("retrieve")
graph.add_edge("retrieve", "grade_documents")

# decide() returns "translate" or "generation"; map each label to its node.
graph.add_conditional_edges(
    "grade_documents",
    decide,
    {"translate": "translate_query", "generation": "generation"},
)

for source, target in (
    ("translate_query", "web_search"),
    ("web_search", "generation"),
    ("generation", END),
):
    graph.add_edge(source, target)

app = graph.compile()

In [None]:
# Run one sample question through the compiled graph and print each node's
# output as it is produced.
inputs = "what is pakistan law about public tax?"
initial_state = {"keys": {"question": inputs}}
for step in app.stream(initial_state, {"recursion_limit": 150}):
    for node, node_output in step.items():
        print(f"{node}: {node_output}")

In [None]:
def law_gpt_interface(question: str, file):
    """
    Processes a user query and an optional file attachment using the RAG pipeline.

    Args:
        question: The user's text query.
        file: The uploaded file from Gradio (can be None). Either a path string
            or an object exposing the path as ``.name``.

    Returns:
        The generated response as a string.
    """
    inputs = {"keys": {"question": question}}

    if file is None:
        response = ""
        # Stream the response from the LangGraph app
        for output in app.stream(inputs, {"recursion_limit": 150}):
            for key, value in output.items():
                if key == "generation":
                    # Extract the generated text from the output
                    if "keys" in value and "generation" in value["keys"]:
                        response += value["keys"]["generation"]
        return response

    # A file is attached. Previously attach_file() was called with no arguments,
    # which raised a TypeError on every upload. Seeding the graph input with the
    # file would not help either: the `retrieve` node returns a fresh "keys"
    # dict holding only its own documents and the question. So the node
    # functions are run here in the same order as the graph, with the
    # attachment added after grading (so it is not filtered out) and right
    # before generation.
    file_path = file if isinstance(file, str) else getattr(file, "name", file)

    state = grade_documents(retrieve(inputs))
    if decide(state) == "translate":
        state = web_search(translate_query(state))
    # attach_file hands its argument to data_loader, which iterates over it,
    # so the single path is wrapped in a list.
    state = attach_file(state, [file_path])
    return generation(state)["keys"]["generation"]

In [None]:
import gradio as gr

# Stylesheet for the app, kept separate from the layout code.
LAW_GPT_CSS = """
    body { font-family: sans-serif; }
    .gradio-container { max-width: 2000px; margin: auto; padding: 20px; background-color: #f0f0f0; border-radius: 10px; }
    h1 { text-align: center; color: #0056b3; }
    .input-box { border: 1px solid #ccc; padding: 10px; border-radius: 5px; background-color: #fff; }
    .output-box { border: 1px solid #ccc; padding: 10px; border-radius: 5px; background-color: #fff; min-height: 200px; }
"""

with gr.Blocks(css=LAW_GPT_CSS, fill_height=True) as demo:
    # Header
    gr.HTML("<h1 style='color: #0056b3;'>🏛️ Law GPT Agent ⚖️</h1>")
    gr.Markdown("Your personal legal research assistant. Ask a question or upload a document.")

    # Inputs: the question and an optional supporting file, side by side.
    with gr.Row():
        query_input = gr.Textbox(
            label="Enter your legal question:",
            placeholder="e.g., What is the law on contracts in Pakistan?",
        )
        file_input = gr.File(label="Upload relevant document (optional)")

    submit_button = gr.Button("Get Legal Insight")

    # Read-only box that receives the string returned by law_gpt_interface.
    output_text = gr.Textbox(
        label="Legal Insight:",
        interactive=False,
        lines=10,
        autoscroll=True,
    )

    submit_button.click(
        fn=law_gpt_interface,
        inputs=[query_input, file_input],
        outputs=output_text,
    )

demo.launch(debug=True)