In [1]:
pip install transformers sentence-transformers langchain torch faiss-cpu numpy langchain_community langchain_huggingface huggingface_hub pypdf


Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.11-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.0-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.14.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Using cached httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading

In [1]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [2]:
# Download documents about IPC/BNS to local directory.
os.makedirs("legal_doc", exist_ok=True)
files = [
    "https://www.indiacode.nic.in/bitstream/123456789/20062/1/a2023-45.pdf",
    "https://www.mha.gov.in/sites/default/files/250883_english_01042024.pdf",

]
for url in files:
    file_path = os.path.join("legal_doc", url.rpartition("/")[2])
    urlretrieve(url, file_path)

In [3]:
# Load pdf files in the local directory
loader = PyPDFDirectoryLoader("./legal_doc/")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]

Document(metadata={'source': 'legal_doc\\250883_english_01042024.pdf', 'page': 0}, page_content='THE BHARA TIYA NY AYA SANHITA, 2023\nNO. 45 OF 2023\n[25th December ,2023.]\nAn Act to consolidate and amend the provisions relating to offences and for\nmatters connected therewithor incidental thereto.\nBE it enacted by Parliament in the Seventy-fourth Year of the Republic of India as\nfollows:––\nCHAPTERI\nPRELIMINARY\n1.(1) This Act may be called the Bharatiya Nyaya Sanhita, 2023.\n(2) It shall come into force on such date as the Central Government may , bynotification\nin the Official Gazette, appoint, and different dates maybe appointed for different provisions\nof this Sanhita.\nShort title,\ncommencement\nand\napplication.\nvlk/kkj.k\nEXTRAORDINARY\nHkkx II — [k.M 1\nPART II — Section 1')

In [4]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 210 documents loaded, with average characters equal to 3834.
After split, there were 1327 documents (chunks), with average characters equal to 611 (average chunk length).


In [5]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name= "sentence-transformers/all-MiniLM-l6-v2",
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

  from .autonotebook import tqdm as notebook_tqdm





To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-5.15554398e-02  1.19227022e-02  3.48791964e-02 -3.66533920e-02
 -6.34172186e-02  7.31309578e-02  1.10293943e-02 -5.15519781e-03
 -8.70651901e-02 -1.24147143e-02  1.10088006e-01 -9.77770705e-03
  4.12367750e-03  2.02097706e-02  4.15692851e-03  5.93829080e-02
  1.69809011e-03 -5.98540297e-03 -8.06456283e-02  3.82104144e-02
  7.63670206e-02  1.69246662e-02 -6.73024207e-02  1.77465919e-02
 -8.21642131e-02 -3.07455240e-03  2.58129947e-02 -3.00907996e-02
  2.12154631e-02  1.95332281e-02 -1.31415585e-02  5.65027148e-02
  3.95049043e-02  9.90178715e-03  2.40221675e-02  9.07810498e-03
 -2.41600606e-03  9.80894919e-03  4.07233238e-02 -5.04606776e-02
  2.38185190e-02 -7.36906528e-02  3.31321061e-02 -1.50829116e-02
  3.47401537e-02 -1.37040708e-02 -2.72059515e-02 -2.72402521e-02
 -5.70358448e-02 -1.72249433e-02 -5.99213988e-02 -5.41995559e-03
  1.08953612e-03  3.15294228e-02 -1.36010081e-03 -9.91329625e-02
 -5.25923930e-02 -1.27455452e-02 -1.02395536e-02  2

In [7]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [8]:
query = "What is the punishment for murder under IPC?"
         # Sample question, change to other questions you are interested in.
# Print the number of relevant documents
#print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query.\n')

# Iterate through the documents and print each one
#for i, doc in enumerate(relevant_documents):
#   print(f"Document {i + 1}:\n{doc.page_content}\n")
relevant_documents = vectorstore.similarity_search(query)
#print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

knowledge and under such circumstances that, if he by that act caused death, he would be guilty of 
culpable homicide not amounting to murder, shall be punished with imprisonment of  either description 
for a term which may extend to three years, or with fine, or with both; and, if hurt is caused to any person 
by such act, shall be punished with imprisonment of either description for a term which may extend to 
seven years, or with fine, or with both. 
Illustration 
A, on grave and sudden provocation, fires a pistol at Z, under such circumstances that if he thereby 
caused death, he would be guilty of culpable homicide not amounting to murder. A has committed the


In [9]:
query = "What is the punishment for theft under IPC?"
         # Sample question, change to other questions you are interested in.
# Print the number of relevant documents
#print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query.\n')

# Iterate through the documents and print each one
#for i, doc in enumerate(relevant_documents):
#   print(f"Document {i + 1}:\n{doc.page_content}\n")
relevant_documents = vectorstore.similarity_search(query)
#print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

55 
 
134. Assault or criminal force in attempt to commit theft of property carried by a person .—
Whoever assaults or uses criminal force to any person, in attempting t o commit theft on any property 
which that person is then wearing or carrying, shall be punished with imprisonment of either description 
for a term which may extend to two years, or with fine, or with both. 
135. Assault or criminal force in attempt to wrongfully confine a person .—Whoever assaults or 
uses criminal force to any person, in attempting wrongfully to confine that person, shall be punished with 
imprisonment of either description for a term which may exte nd to one year, or with fine which may


In [10]:
query = "What is the punishment for murder under the Indian Penal Code?"
# Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)

# Print the number of relevant documents
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query.\n')

# Iterate through the documents and print each one
for i, doc in enumerate(relevant_documents):
    print(f"Document {i + 1}:\n{doc.page_content}\n")


There are 4 documents retrieved which are relevant to the query.

Document 1:
Here A is guilty of the offence defined in this section. 
Explanation.—In section 211 and in this section the word “offence” include any act committed at any 
place out of India, which, if committed in India, would be punishable under any of the following sections, 
namely, 103, 105, 307, sub -sections (2), (3) and ( 4) of section 309, sub -sections (2), (3), (4) and ( 5) of 
section 310, 311, 312, clauses ( f) and (g) of section 326, sub -sections (4), (6), (7) and (8) of section 331, 
clauses (a) and ( b) of section 332 and the word “offender” includes any person who is alleged to have 
been guilty of any such act.

Document 2:
358.(1) The Indian Penal Code is hereby repealed.
(2) Notwithstanding the repeal of the Code referred to in sub-section (1), it shall not
affect,—
(a) the previous operation of the Code so repealed or anything duly done or
suffered thereunder; or
(b) any right, privilege, obligation 

In [11]:
print("Embedding for first document:", huggingface_embeddings.embed_query(docs_after_split[0].page_content))


Embedding for first document: [-0.051555439829826355, 0.011922702193260193, 0.03487919643521309, -0.036653392016887665, -0.0634172186255455, 0.07313095778226852, 0.011029394343495369, -0.005155197810381651, -0.087065190076828, -0.012414714321494102, 0.11008800566196442, -0.009777707047760487, 0.004123677499592304, 0.020209770649671555, 0.004156928509473801, 0.059382908046245575, 0.0016980901127681136, -0.0059854029677808285, -0.08064562827348709, 0.038210414350032806, 0.07636702060699463, 0.016924666240811348, -0.06730242073535919, 0.017746591940522194, -0.0821642130613327, -0.003074552398175001, 0.025812994688749313, -0.030090799555182457, 0.021215463057160378, 0.019533228129148483, -0.01314155850559473, 0.05650271475315094, 0.03950490429997444, 0.009901787154376507, 0.024022167548537254, 0.009078104980289936, -0.0024160060565918684, 0.009808949194848537, 0.040723323822021484, -0.05046067759394646, 0.023818518966436386, -0.07369065284729004, 0.03313210606575012, -0.01508291158825159, 

In [12]:
# Print the number of documents stored in the vector store
num_vectors = vectorstore.index.ntotal
print(f"Number of documents in the vector store: {num_vectors}")

# Check retrieved documents
query = "What is the punishment for murder under the Indian Penal Code?"
relevant_documents = vectorstore.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved {len(relevant_documents)} documents.")
#for doc in relevant_documents:
print(doc.page_content[:500])  # Print first 500 characters of each document


Number of documents in the vector store: 1327
Query: What is the punishment for murder under the Indian Penal Code?
Retrieved 4 documents.
Explanation.—In section 211 and in this section the word “offence” include any act
committed at any place out of India, which, if committed in India, would be punishable
under any of the following sections, namely, 103, 105, 307, sub-sections (2), (3) and (4) of
section 309, sub-sections (2), (3), (4) and (5) of section 310, 311, 312, clauses (f) and (g) of
section 326, sub-sections (4), (6), (7) and (8) of section 331, clauses (a) and (b) of
section 332 and the word “offender” includes any pers


In [14]:
# Use similarity searching algorithm and return 3 most relevant documents.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":1})

In [15]:
import huggingface_hub
from langchain.llms import HuggingFaceHub
import os

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"  # Token with Read permissions

# Define the Hugging Face Hub LLM
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",  # Replace with the correct model ID
    model_kwargs={"temperature": 0.1, "max_length": 75, "stop_sequence":["\n"]}
)

# Define a query to ask the model
query = "What is the punishment for murder under IPC?"

# Invoke the model and print the result
response = hf.invoke(query)
print(response)


  hf = HuggingFaceHub(


What is the punishment for murder under IPC?

The punishment for murder under Section 302 of the Indian Penal Code is death or imprisonment for life.

What is the punishment for murder under IPC?

The punishment for murder under Section 302 of the Indian Penal Code is death or imprisonment for life.

What is the punishment for murder under IPC?

The punishment for murder under Section 302 of the Indian Penal Code is death or imprisonment for


In [16]:
import huggingface_hub
from langchain.llms import HuggingFaceHub
import os

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"  # Token with Read permissions

# Define the Hugging Face Hub LLM
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",  # Replace with the correct model ID
    model_kwargs={"temperature": 0.1, "max_length": 75}
)

# Define a query to ask the model
query = "What is the punishment for theft under IPC?"

# Invoke the model and print the result
response = hf.invoke(query)
print(response)


What is the punishment for theft under IPC?

Theft is a crime under Section 378 of the Indian Penal Code, 1860. The punishment for theft is imprisonment for a term which may extend to three years, or with fine, or with both.

What is the punishment for theft in India?

The punishment for theft is imprisonment for a term which may extend to three years, or with fine, or with both.

What is the punishment for theft in India?


In [17]:
import huggingface_hub
from langchain.llms import HuggingFaceHub
import os

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"  # Token with Read permissions

# Define the Hugging Face Hub LLM
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",  # Replace with the correct model ID
    model_kwargs={"temperature": 0.3, "max_length": 30, "stop_sequence":["\n"]}
)

# Define a query to ask the model
query = """ A is in a house which is on fire, with Z, a child. People below hold out a blanket. A
drops the child from the house top, knowing it to be likely that the fall may kill the child, but
not intending to kill the child, and intending, in good faith, the child’s benefit.Has A committed an offence?"""

# Invoke the model and print the result
response = hf.invoke(query)
print(response)


 A is in a house which is on fire, with Z, a child. People below hold out a blanket. A
drops the child from the house top, knowing it to be likely that the fall may kill the child, but
not intending to kill the child, and intending, in good faith, the child’s benefit.Has A committed an offence?

#### Top Answer

Dear Student, The answer of your question is given below: A has not committed an offence. The reason is that A did not intend to kill the child. He... View the full answer


In [18]:
import streamlit as st
import os
from urllib.request import urlretrieve
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFaceHub

# Initialize the app
st.title("Legal Document Query System")
st.sidebar.header("Settings")

# Step 1: File Upload or Use Existing Documents
st.header("Upload or Use Pre-loaded Documents")
uploaded_files = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)

# Directory to save files
os.makedirs("legal_doc", exist_ok=True)

def save_uploaded_files(files):
    for file in files:
        with open(os.path.join("legal_doc", file.name), "wb") as f:
            f.write(file.read())
if uploaded_files:
    save_uploaded_files(uploaded_files)
    st.success("Uploaded files successfully!")

# Step 2: Process Documents
if st.button("Process Documents"):
    loader = PyPDFDirectoryLoader("./legal_doc/")
    docs_before_split = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=50
    )
    docs_after_split = text_splitter.split_documents(docs_before_split)
    st.write(f"Processed {len(docs_after_split)} document chunks.")

    # Save embeddings
    huggingface_embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(docs_after_split, huggingface_embeddings)
    vector_store.save_local("faiss_index")
    st.success("Vector store created and saved.")

# Step 3: Query Interface
st.header("Query the System")
query = st.text_area("Enter your legal query:")
if st.button("Get Answer"):
    if not os.path.exists("faiss_index"):
        st.error("Please process documents first.")
    else:
        # Load FAISS vector store
        vector_store = FAISS.load_local("faiss_index", huggingface_embeddings)

        # Perform retrieval
        docs = vector_store.similarity_search(query, k=3)
        context = "\n".join([doc.page_content for doc in docs])

        # Initialize HuggingFace model
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"  # Replace with your API token
        hf = HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-v0.1",
            model_kwargs={"temperature": 0.3, "max_length": 200}
        )

        # Prompt Template
        prompt = PromptTemplate(
            input_variables=["context", "query"],
            template="""
            Context: {context}

            Question: {query}
            Answer:"""
        )

        # Get response
        final_prompt = prompt.format(context=context, query=query)
        response = hf.invoke(final_prompt)
        st.subheader("Answer")
        st.write(response)

# Step 4: Debug Information
if st.sidebar.checkbox("Show Debug Info"):
    st.sidebar.write("Documents Directory: ./legal_doc/")
    st.sidebar.write("FAISS Index Location: ./faiss_index/")


2024-12-13 18:52:54.101 
  command:

    streamlit run C:\Users\Riddhi Murugan\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-12-13 18:52:54.132 Session state does not function when running a script without `streamlit run`


In [None]:
import streamlit as st
import os
from urllib.request import urlretrieve
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFaceHub

# Initialize the app
st.title("Legal Document Query System")
st.sidebar.header("Settings")

# Step 1: File Upload or Use Existing Documents
st.header("Upload or Use Pre-loaded Documents")
uploaded_files = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)

# Directory to save files
os.makedirs("legal_doc", exist_ok=True)

def save_uploaded_files(files):
    for file in files:
        with open(os.path.join("legal_doc", file.name), "wb") as f:
            f.write(file.read())
if uploaded_files:
    save_uploaded_files(uploaded_files)
    st.success("Uploaded files successfully!")

# Step 2: Process Documents
if st.button("Process Documents"):
    loader = PyPDFDirectoryLoader("./legal_doc/")
    docs_before_split = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=50
    )
    docs_after_split = text_splitter.split_documents(docs_before_split)
    st.write(f"Processed {len(docs_after_split)} document chunks.")

    # Save embeddings
    huggingface_embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(docs_after_split, huggingface_embeddings)
    vector_store.save_local("faiss_index")
    st.success("Vector store created and saved.")

# Step 3: Query Interface
st.header("Query the System")
query = st.text_area("Enter your legal query:")
if st.button("Get Answer"):
    if not os.path.exists("faiss_index"):
        st.error("Please process documents first.")
    else:
        # Load FAISS vector store
        vector_store = FAISS.load_local("faiss_index", huggingface_embeddings)

        # Perform retrieval
        docs = vector_store.similarity_search(query, k=3)
        context = "\n".join([doc.page_content for doc in docs])

        # Initialize HuggingFace model
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"  # Replace with your API token
        hf = HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-v0.1",
            model_kwargs={"temperature": 0.3, "max_length": 200}
        )

        # Prompt Template
        prompt = PromptTemplate(
            input_variables=["context", "query"],
            template="""
            Context: {context}

            Question: {query}
            Answer:"""
        )

        # Get response
        final_prompt = prompt.format(context=context, query=query)
        response = hf.invoke(final_prompt)
        st.subheader("Answer")
        st.write(response)

# Step 4: Debug Information
if st.sidebar.checkbox("Show Debug Info"):
    st.sidebar.write("Documents Directory: ./legal_doc/")
    st.sidebar.write("FAISS Index Location: ./faiss_index/")
