## Analysis

In [5]:
import os

In [20]:
files = os.listdir("../scraped_data/stateacts")

In [21]:
import json
import re
import matplotlib.pyplot as plt

In [23]:
doc_count = 0
for file in files:
    with open(f"../scraped_data/stateacts/{file}","r",encoding="utf-8") as f:
        x = json.load(f)
    years = list(x.keys())[:-1]
    doc_len = [ (doc["doc_id"]) for year in years for doc in x[year]]
    doc_count += len(doc_len)
doc_count

14225

In [12]:
years = list(x.keys())[:-1]
doc_len = [ (year,doc["doc_id"]) for year in years for doc in x[year]]


In [13]:
len(doc_len)

9

## Make IndiaLaws docs as a single file

In [36]:
import os
import json
path="../scraped_data/indialaws"
files = os.listdir(path)

In [40]:
from datapreprocessing.json_to_docs import convert_json_to_docs
india_law_docs = []
for file in files:
    india_law_docs.extend(convert_json_to_docs(path+"/"+file))

Processing: 100%|██████████| 52/52 [00:00<00:00, 856.26it/s]
Processing: 100%|██████████| 31/31 [00:00<00:00, 381.33it/s]
Processing: 100%|██████████| 166/166 [00:05<00:00, 32.43it/s]
Processing: 100%|██████████| 9/9 [00:00<00:00, 592.24it/s]


In [42]:
len(india_law_docs)

55153

## Random Docs Selections from JSON


In [3]:
import json
import numpy as np
import random

In [4]:
def randomly_select_docs(json_file,return_json_filename,sampling_size):
    with open(json_file, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    random_dict = {}
    years = list(raw_data.keys())[:-1]
    np.random.shuffle(years)
    for year in years:
        k = min(sampling_size, len(raw_data[year]))
        random_dict[year] = random.sample(raw_data[year],k)
    with open(f"./{return_json_filename}.json","w",encoding="utf-8") as f:
        json.dump(random_dict,f,indent=4,ensure_ascii=False)

## JSON to Documents

In [19]:
from datapreprocessing.json_to_docs import convert_json_to_docs
import os
import json
path="../scraped_data/indialaws"
files = os.listdir(path)

In [20]:
documents = []
for file in files:
    documents.extend(convert_json_to_docs(path+"/"+file))

Processing: 100%|██████████| 52/52 [00:00<00:00, 890.04it/s]
Processing: 100%|██████████| 31/31 [00:00<00:00, 369.69it/s]
Processing: 100%|██████████| 166/166 [00:05<00:00, 31.95it/s]
Processing: 100%|██████████| 9/9 [00:00<00:00, 633.31it/s]


## Chunking

In [21]:
from datapreprocessing.docs_to_chunks import convert_docs_to_chunks

In [22]:
chunks = convert_docs_to_chunks(documents)

Processing: 100%|██████████| 55153/55153 [00:19<00:00, 2835.36it/s]


## Context Enrichment

### Merge Shorter Chunks

In [23]:
from datapreprocessing.chunks_to_chunks import merge_shorter_chunks

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
chunks = merge_shorter_chunks(chunks,chunk_overlap=250,threshold=1000)

Before Processing docs:  [548, 557, 560, 603, 613, 618, 620, 622, 634, 641, 654, 656, 660, 670, 682, 685, 689, 705, 711, 741, 790, 815, 832, 837, 850, 860, 863, 866, 876, 881, 887, 891, 893, 901, 904, 906, 909, 924, 941, 944, 953, 957, 960, 963, 970, 974, 976, 981, 989, 993, 996, 1002, 1004, 1017, 1027, 1043, 1056, 1061, 1070, 1101, 1142, 1434, 1470, 1488, 1495, 1502, 1507, 1512, 1558, 1580, 1692, 1726, 1995, 2006, 2016, 2080, 2111, 2125, 2142, 2152, 2165, 2178, 2215, 2219, 2285, 2317, 2328, 2339, 2352, 2362, 2402, 2419, 2428, 2452, 2462, 2490, 2516, 2543, 2629, 2640, 2661, 2684, 2739, 2744, 2752, 2766, 2769, 2774, 2777, 2799, 2802, 2815, 2817, 2878, 3122, 3198, 3202, 3233, 3256, 3299, 3301, 3303, 3328, 3376, 3436, 3722, 3728, 3737, 3740, 3747, 3787, 3802, 3816, 3825, 3834, 3839, 3890, 3935, 3982, 4091, 4137, 4173, 4182, 4195, 4235, 4243, 4252, 4261, 4295, 4307, 4312, 4356, 4361, 4431, 4537, 4542, 4559, 4594, 4618, 4641, 5696, 5751, 5758, 5817, 5826, 6020, 6052, 6081, 6095, 6180, 6184,

Merging: 100%|██████████| 2147/2147 [00:00<00:00, 7902.08it/s] 


Before Processing docs:  []


### Add Chunk Header

In [25]:
from datapreprocessing.chunks_to_chunks import inject_context_to_chunks_parallel , process_chunk

In [26]:
chunks = inject_context_to_chunks_parallel(chunks)

Processing: 100%|██████████| 106002/106002 [01:17<00:00, 1372.90it/s]


## Embedding

In [5]:
from langchain_ollama import OllamaEmbeddings
from datapreprocessing.chunks_to_embedding import embed_documents_to_qdrant

In [10]:
qdrant_url = "http://localhost:6333"
EMBED_DIMENSION=768
DISTANCE="Cosine"
COLLECTION_NAME="india_laws_2"

embedding_model = OllamaEmbeddings(model="nomic-embed-text")

In [11]:
embed_documents_to_qdrant(
    chunks[3:6],
    embedding_model,
    collection_name=COLLECTION_NAME,
    vector_dim=EMBED_DIMENSION,
    distance=DISTANCE
)

Embedding batches: 100%|██████████| 1/1 [00:00<00:00,  3.36it/s]


✅ Completed 3 chunks in 0.30 seconds





In [18]:
from qdrant_client import QdrantClient

client = QdrantClient("http://localhost:6333")

def copy_points(src, dst):
    scroll, _ = client.scroll(src, limit=100)
    while scroll:
        client.upsert(dst, points=scroll)
        scroll, _ = client.scroll(src, limit=100, offset=scroll[-1].id)

copy_points("india_laws_2", "india_laws_merged")
copy_points("india_laws_1", "india_laws_merged")


ValidationError: 3 validation errors for PointsList
points.0
  Input should be a valid dictionary or instance of PointStruct [type=model_type, input_value=Record(id='47fecafe-3477-...=None, order_value=None), input_type=Record]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type
points.1
  Input should be a valid dictionary or instance of PointStruct [type=model_type, input_value=Record(id='51953ae7-532e-...=None, order_value=None), input_type=Record]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type
points.2
  Input should be a valid dictionary or instance of PointStruct [type=model_type, input_value=Record(id='589ae6f6-0093-...=None, order_value=None), input_type=Record]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type

## Retriever

In [287]:
from typing import List
from langchain.schema import Document
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import OllamaEmbeddings
import numpy as np
import pickle
import json
from sentence_transformers import CrossEncoder
qdrant_url = "http://localhost:6333"
COLLECTION_NAME="india_laws"

## Intialize Client
client = QdrantClient(
    url=qdrant_url,
    prefer_grpc=False 
)

collection_info = client.get_collection(collection_name=COLLECTION_NAME)

embedding_model = OllamaEmbeddings(model="nomic-embed-text")

top_k = 50

### querying vectorstore

In [286]:
query = "I Recorded a music in my own studio but i found out that it was leaked by a bug placed in my studio, i dont know who did it but i have evidence that i record that music first how can i file a complaint for greivance" 

In [278]:
points = vectorstore.client.query_points(
    collection_name="india_laws",
    query=embedding_model.embed_query(query),
    limit=top_k,
    with_payload=True,  # ✅ ensures full payload
).points


In [None]:
for i in points[:30]:
    print(i.payload["page_content"])

In [291]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [292]:
pairs = [[query,point.payload["page_content"]] for point in points ] 

In [294]:
scores = cross_encoder.predict(pairs)

In [297]:
scored_docs = sorted(zip(points, scores), key=lambda x: x[1], reverse=True)

In [298]:
scored_docs

[(ScoredPoint(id='ac135aab-d9d3-4419-a5b7-d9237ffaeaee', version=638, score=0.6134422, payload={'doc_id': 'af58dac4-e9cb-4e3e-9943-d1cbe17bae70', 'year': '1957', 'link': 'https://indiankanoon.org/doc/1136195/', 'title': 'The Copyright Act, 1957', 'cited_to': 0, 'cited_by': 3366, 'tags': 'Shipping, Transport, Intellectual Property, Telecom', 'jurisdiction': 'Central', 'source': 'Union of India - Act', 'doctype': 'Act', 'pageno': 15, 'chunk_id': 20443, 'page_content': '\nThis chunk is extracted from the legal document: The Copyright Act, 1957, cited by 3366.\nSource: Union of India - Act | Jurisdiction: Central | Type: Act\nDocument link: https://indiankanoon.org/doc/1136195/\ntags: Shipping, Transport, Intellectual Property, Telecom\nkeywords:order, full, may pas, explanation .—, respect, paid, sound recording shall, five calendar year, lite, statutory licence, payment, including full detail, register, made, order ex, prescribed, owner, may, broadcast, making, cover version, copyright b

In [299]:
[print(point.payload["page_content"]) for point,score in scored_docs[:10] ]


This chunk is extracted from the legal document: The Copyright Act, 1957, cited by 3366.
Source: Union of India - Act | Jurisdiction: Central | Type: Act
Document link: https://indiankanoon.org/doc/1136195/
tags: Shipping, Transport, Intellectual Property, Telecom
keywords:order, full, may pas, explanation .—, respect, paid, sound recording shall, five calendar year, lite, statutory licence, payment, including full detail, register, made, order ex, prescribed, owner, may, broadcast, making, cover version, copyright board may, inquiry, potential circulation, pursuance, existing stock, broadcasting, mean, may deem fit, way, 31d, performance, work, literary, satisfied, parte directing, royalty, prima facie, minimum, complaint, inspect, broadcasting organisation desirous, first sound recording, cease, duly authorised agent, general order, make, sound recording, musical work, communicating, sound recording .—, accordance, particular language, expiration, dialect, copy, right, account relat

[None, None, None, None, None, None, None, None, None, None]