## Analysis

In [5]:
import os

In [20]:
files = os.listdir("../scraped_data/stateacts")

In [21]:
import json
import re
import matplotlib.pyplot as plt

In [23]:
doc_count = 0
for file in files:
    with open(f"../scraped_data/stateacts/{file}","r",encoding="utf-8") as f:
        x = json.load(f)
    years = list(x.keys())[:-1]
    doc_len = [ (doc["doc_id"]) for year in years for doc in x[year]]
    doc_count += len(doc_len)
doc_count

14225

In [12]:
years = list(x.keys())[:-1]
doc_len = [ (year,doc["doc_id"]) for year in years for doc in x[year]]


In [13]:
len(doc_len)

9

## Random Docs Selections from JSON


In [3]:
import json
import numpy as np
import random

In [4]:
def randomly_select_docs(json_file,return_json_filename,sampling_size):
    with open(json_file, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    random_dict = {}
    years = list(raw_data.keys())[:-1]
    np.random.shuffle(years)
    for year in years:
        k = min(sampling_size, len(raw_data[year]))
        random_dict[year] = random.sample(raw_data[year],k)
    with open(f"./{return_json_filename}.json","w",encoding="utf-8") as f:
        json.dump(random_dict,f,indent=4,ensure_ascii=False)

## JSON to Documents

In [1]:
from datapreprocessing.json_to_docs import convert_json_to_docs

In [2]:
list_of_documents = convert_json_to_docs("../scraped_data/stateacts/assam-act.json")

Processing: 100%|██████████| 94/94 [00:00<00:00, 220.15it/s]


## Chunking

In [3]:
from datapreprocessing.docs_to_chunks import convert_docs_to_chunks

In [4]:
final_chunks = convert_docs_to_chunks(list_of_documents)

Processing: 100%|██████████| 4705/4705 [00:01<00:00, 2962.08it/s]


In [5]:
len(final_chunks)

9199

## Context Enrichment

### Merge Shorter Chunks

In [7]:
from datapreprocessing.chunks_to_chunks import merge_shorter_chunks

In [None]:
final_chunks = merge_shorter_chunks(final_chunks,chunk_overlap=250,threshold=1000)

Before Processing docs:  [25, 31, 113, 359, 512, 531, 653, 669, 725, 757, 762, 780, 792, 817, 852, 884, 888, 952, 990, 1021, 1610, 1659, 1661, 1663, 1810, 1864, 1958, 1961, 1963, 1972, 1983, 2009, 2014, 2038, 2120, 2133, 2342, 2351, 2354, 2441, 2461, 2463, 2560, 2572, 2612, 2617, 2624, 2699, 2715, 2725, 2743, 2751, 2757, 2775, 2830, 2857, 2866, 2877, 2879, 3064, 3074, 3119, 3131, 3233, 3273, 3302, 3305, 3517, 3546, 3563, 3577, 3617, 3637, 3670, 3681, 3693, 3708, 3732, 3762, 3840, 3846, 3860, 3889, 3938, 3955, 3964, 3969, 3977, 3981, 3983, 4011, 4032, 4046, 4076, 4185, 4213, 4256, 4262, 4273, 4275, 4354, 4362, 4370, 4424, 4444, 4472, 4561, 4660, 4677, 4690, 4703, 4968, 5017, 5039, 5059, 5075, 5125, 5175, 5229, 5297, 5314, 5336, 5354, 5371, 5391, 5403, 5413, 5423, 5432, 5494, 5508, 5521, 5579, 5585, 5606, 5619, 5638, 5657, 5675, 5717, 5725, 5740, 5746, 5754, 5825, 5849, 5860, 5988, 6052, 6110, 6151, 6263, 6269, 6362, 6425, 6447, 6466, 6469, 6536, 6577, 6608, 6613, 6645, 6649, 6730, 6751,

Merging: 100%|██████████| 208/208 [00:00<00:00, 41387.89it/s]

Before Processing docs:  []





In [None]:
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
import random
import time
import pprint

pp = pprint.PrettyPrinter(width=80)


context_enrichment_template = PromptTemplate.from_template("""
You are a legal summarization assistant. The text below is a chunk from an Indian legal {doctype} document. 

Instructions (follow strictly):
1. Identify all Section/Rule numbers mentioned in the chunk.  
2. For each Section/Rule, prepare a structured summary with the format shown below.  
3. Capture **all subsections and clauses** (do not stop midway). If the chunk is truncated, mark the summary as [Incomplete].  
4. Always separate "Key Conditions/Grounds" (what must or must not be done) from "Enforcement or Penalty Provisions" (consequences).  
5. Use precise statutory style: short, numbered, and formal.  
6. Do not invent missing information. If absent, write "Not specified."  
7. Always include all years, degrees, or fees mentioned.  
8. Mark each line with [Actor], [Condition], [Enforcement], or [Note] where appropriate.  

Structured Summary Format:
- Section Number:  
- Heading (exact text from the chunk):  
- Actors (Institutions / Authorities / Professions involved):  
- Degrees / Courses:  
- Key Conditions / Grounds (numbered list):  
- Enforcement or Penalty Provision:  
- Notes:  

Here’s the chunk:
{page_content}

Metadata:
Title: {title}  
Source: {source}  
Link: {link}  
Jurisdiction: {jurisdiction}  
Type: {doctype}
""")

metadata_template = PromptTemplate.from_template("""
This chunk is extracted from the legal document: {title}, cited by {cited_by}.
Source: {source} | Jurisdiction: {jurisdiction} | Type: {doctype}
Document link: {link}
""")

llm = ChatOllama(model="llama3.2:3b")

context_enrichment_chain = context_enrichment_template | llm

In [None]:
def inject_context_to_chunks(chunk):
    metadata = {"link":chunk.metadata["link"],
                "title":chunk.metadata["title"],
                "source":chunk.metadata["source"],
                "jurisdiction":chunk.metadata["jurisdiction"],
                "doctype":chunk.metadata["doctype"],
                "cited_by":chunk.metadata["cited_by"]}
    header_1 = metadata_template.format(**metadata)
    del metadata["cited_by"]
    metadata["page_content"]=chunk.page_content
    header_2 = "".join(context_enrichment_chain.invoke(metadata).content.split("\n\n")[1:])
    return header_1+"\n\nSummary of the Chunk:\n"+header_2+"\n\nDocument Fragment:\n"+chunk.page_content
    

In [None]:
pp.pprint(inject_context_to_chunks(final_chunks[random.randint(0,10000)]))

('\n'
 'This chunk is extracted from the legal document: Rules of Procedure and '
 'Conduct of Business in Lok Sabha, cited by 0.\n'
 'Source: Union of India - Act | Jurisdiction: Central | Type: Rule\n'
 'Document link: https://indiankanoon.org/doc/82079480/\n'
 '\n'
 '\n'
 'Summary of the Chunk:\n'
 '**Section 61**- Heading:\n'
 '  - Time for taking up motion.\n'
 '- Actors:\n'
 '  - Speaker, members concerned, Minister\n'
 '- Degrees / Courses:\n'
 '  - Not specified\n'
 '- Key Conditions / Grounds:\n'
 "  1. The motion 'that the House do now adjourn' shall be taken up at 16.00 "
 'hours or at an earlier hour if the Speaker so directs.\n'
 '- Enforcement or Penalty Provision:\n'
 '  - If less than fifty members rise in favour of leave being granted, the '
 'Speaker informs that the member does not have the leave of the House.\n'
 '- Notes:\n'
 '  [Incomplete]**Section 62**- Heading:\n'
 '  - Closure of debate.\n'
 '- Actors:\n'
 '  - Speaker\n'
 '- Degrees / Courses:\n'
 '  - Not sp

In [55]:
from pprint import pprint

for i in range(10):
    chunk = final_chunks[random.randint(0, 9000)]
    chunk_metadata = chunk.metadata
    text = context_enrichment_chain.invoke({
        "page_content": chunk.page_content,
        "title": chunk_metadata["title"],
        "source": chunk_metadata["source"],
        "link": chunk_metadata["link"],
        "jurisdiction": chunk_metadata["jurisdiction"],
        "doctype": chunk_metadata["doctype"]
    }).content

    print(f"\n--- iteration {i} ---\n")
    print(text+"\n")  # readable output
    pp.pprint(chunk.page_content)



--- iteration 0 ---

Here are the summaries for each Section/Rule:

**1. Section 41**

- Heading: Subordinate Officer to the Chairman
- Actors: Council, Nagar Panchayat
- Degrees / Courses:
- Not specified
- Key Conditions / Grounds:
[Actor] Hayat or Council, [Condition] as the case may be,
shall be subordinate to him.
[Actor] State Government and Nagar Panchayat or Council, [Condition] as the case may be, shall appoint an Executive Officer.
[Actor] If any particular Nagar Panchayat or Council, [Condition] as the case may be, does not make such appointment, [Actor] State Government may appoint any person as such officer.

- Enforcement or Penalty Provision:
Not specified

**2. Section 42**

- Heading: Main functions of the Executive Officer
- Actors: Executive Officer, Nagar Panchayat or Council
- Degrees / Courses:
- Not specified
- Key Conditions / Grounds:
[Actor] The Executive Officer shall,
1. [Condition] subject to the general control of the Chairperson.
2. [Condition] perform a

In [120]:
context_enrichment_chain = context_enrichment_prompt | llm
text = context_enrichment_chain.invoke({"page_content":chunk.page_content,
                       "title":chunk_metadata["title"],
                       "source":chunk_metadata["source"],
                       "link":chunk_metadata["link"],
                       "jurisdiction":chunk_metadata["jurisdiction"],
                       "doctype":chunk_metadata["doctype"]}).content

In [121]:
text.split("\n\n")[-1]

'The Indian Partnership Act, 1932, is an Act to define and amend the law relating to partnership. It is enacted as follows: 1. Short title, extent and commencement. (1) This Act may be called the Indian Partnership Act, 1932. (2) It extends to the whole of India except the State of Jammu and Kashmir. (3) It shall come into force on the 1st day of October, 1932, except section 69, which shall come into force on the 1st day of October, 1933. 2. Definitions. In this Act, unless there is anything repugnant in the subject or context,— (a) an “act of a firm” means any act or omission by all the partners, or by any partner or agent of the firm which gives rise to a right enforceable by or against the firm; (b) “business” includes every trade, occupation and profession; (c) “prescribed” means prescribed by rules made under this Act; (d) “third party” used in relation to a firm or to a partner therein means any person who is not a partner in the firm; and (e) expressions used but not defined in

## Embedding

In [93]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_ollama import OllamaEmbeddings
import uuid
import time
from tqdm import tqdm
qdrant_url = "http://localhost:6333"
EMBED_DIMENSION=768
DISTANCE="Cosine"
COLLECTION_NAME="Union_law_docs"
deletecollection = True

In [18]:
## Make Sure the Docker is up and running (docker run -p 6333:6333 qdrant/qdrant)

client = QdrantClient(url=qdrant_url,prefer_grpc=False)

## Embedding Model
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

if deletecollection:
    ## Create or reset collection
    if client.collection_exists(COLLECTION_NAME):
        client.delete_collection(COLLECTION_NAME)

    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=EMBED_DIMENSION, distance=DISTANCE)
        )

KeyboardInterrupt: 

### Testing Embedding time for different batches

In [None]:
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Different chunk sizes to test
chunk_sizes = [1200, 1800]

results = {}

for chunk_size in chunk_sizes:
    print(f"\nTesting with chunk_size={chunk_size}")

    # Re-split documents with this chunk size
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base",
        chunk_size=chunk_size,
        chunk_overlap=100  # keep same overlap or adjust if you like
    )
    chunks = text_splitter.split_documents(list_of_documents)  # assuming final_chunks = original docs

    # Just test first batch (to keep fair)
    batch = chunks[:32]   # or any fixed batch size
    texts = [doc.page_content for doc in batch]

    start = time.time()
    embeddings = embedding_model.embed_documents(texts)
    end = time.time()

    elapsed = end - start
    results[chunk_size] = elapsed
    print(f"Chunk size {chunk_size}: {elapsed:.2f} seconds")

print("\nSummary:", results)



Testing with chunk_size=1200
Chunk size 1200: 1.36 seconds

Testing with chunk_size=1800
Chunk size 1800: 1.44 seconds

Summary: {1200: 1.359623908996582, 1800: 1.4435272216796875}


In [None]:
texts =[doc.page_content for doc in final_chunks]
start = time.time()
batches = [final_chunks[i:i + 256] for i in range(0, len(final_chunks), 64)]

texts = [doc.page_content for doc in batches[0]]
metadatas = [doc.metadata for doc in batches[0]]

embeddings = embedding_model.embed_documents(texts)
end = time.time()
end-start


2.954740047454834

### Testing Upserting time for different batches

In [None]:
start = time.time()
points = [
    {
        "id": str(uuid.uuid4()),
        "vector": vector,
        "payload": {**metadata, "page_content": text}
    }

    for vector, metadata, text in zip(embeddings, metadatas, texts)
]

client.upsert(collection_name="collection_1", points=points)
end = time.time()
end-start

0.07278108596801758

### Continued

In [19]:
import time
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams

def embed_documents_to_qdrant(
    client,
    documents,
    embedding_model,
    batch_size=32,               # GPU embedding batch size
    max_workers=8,                # Threads for parallel embedding
    collection_name=COLLECTION_NAME,
    vector_dim=EMBED_DIMENSION,
    distance=DISTANCE,
    qdrant_url="http://localhost:6333"  # Use HTTP, gRPC optional
):

    # Split documents into batches
    batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]

    # -------------------------------
    # Function to embed + prepare points
    # -------------------------------
    def process_batch(batch, batch_id):
        texts = [doc.page_content for doc in batch]
        metadatas = [doc.metadata for doc in batch]

        # Embed documents (GPU efficient)
        embeddings = embedding_model.embed_documents(texts)

        # Prepare points for Qdrant
        points = [
            {
                "id": str(uuid.uuid4()),
                "vector": vector,
                "payload": {**metadata, "page_content": text}
            }
            for vector, metadata, text in zip(embeddings, metadatas, texts)
        ]

        # Upsert to Qdrant
        client.upsert(collection_name=collection_name, points=points)

        return len(batch), batch_id

    # -------------------------------
    # Run threaded batches
    # -------------------------------
    start = time.time()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_batch, batch, i) for i, batch in enumerate(batches)]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Embedding batches"):
            count, batch_id = future.result()

    end = time.time()
    print(f"\n✅ Completed {len(documents)} chunks in {end - start:.2f} seconds")


In [20]:
embed_documents_to_qdrant(
    client,
    final_chunks,
    embedding_model,
)

Embedding batches: 100%|██████████| 315/315 [04:29<00:00,  1.17it/s]


✅ Completed 10072 chunks in 269.44 seconds





## Retriever

In [None]:
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
qdrant_url = "http://localhost:6333"
COLLECTION_NAME="Union_law_docs"

In [None]:

## Intialize Client
client = QdrantClient(
    url=qdrant_url,
    prefer_grpc=False 
)

#Create or Load VectorStore
vectorstore = QdrantVectorStore(
    client=client,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name=COLLECTION_NAME
)

In [None]:
#Create Retriever
retriever = vectorstore.as_retriever()

In [None]:
query="What is the penalty for me if I kill a wild animal"

In [None]:
retriever.invoke(query)


[Document(metadata={'_id': '0822205b-732c-4176-a1b9-faf3f76355f7', '_collection_name': 'Union_law_docs'}, page_content='Saving. - Nothing in this Act shall be deemed to apply to the capture or killing of a wild animal by any person in defence of himself or any other person, or to the capture or killing of any wild bird or animal in bona fide defence of property. The Schedule (i) Bustards, ducks, floricans, jungle fowl, partridges, peafowl, pheasants, pigeons quail, sand-grouse, painted snipe, spurfowl, wood-cock, nerons, egrets, rollers and kingfishers. (ii) Antelopes, asses, bison, buffaloes, deer, gazelles, goats, hares, oxen, rhinoceroses and sheep.'),
 Document(metadata={'_id': '16358dda-a79f-47c6-8fb1-f6d7ac0c1522', '_collection_name': 'Union_law_docs'}, page_content='7. Penalty for contravening section 3.-Whoever, in contravention of section 3, kills, injures or captures, or attempts to kill, injure or capture, any wild elephant, shall be punished with fine which may extend to fi