## Analysis

In [5]:
import os

In [20]:
files = os.listdir("../scraped_data/stateacts")

In [21]:
import json
import re
import matplotlib.pyplot as plt

In [23]:
# Tally the documents found across every state-acts JSON file.
doc_count = 0
for file in files:
    with open(f"../scraped_data/stateacts/{file}","r",encoding="utf-8") as f:
        x = json.load(f)
    # Every key except the last one is iterated as a bucket of documents.
    years = [*x.keys()][:-1]
    doc_len = [entry["doc_id"] for key in years for entry in x[key]]
    doc_count = doc_count + len(doc_len)
doc_count

14225

In [12]:
# NOTE(review): `x` is not defined in this cell; it is whatever JSON object an
# earlier cell loaded last, so the result depends on execution order.
# All keys except the last one are used.
years = list(x.keys())[:-1]
# Pair each document id with the key it was found under.
doc_len = [ (year,doc["doc_id"]) for year in years for doc in x[year]]


In [13]:
len(doc_len)

9

## Combine IndiaLaws Docs into a Single List

In [36]:
import os
import json
path="../scraped_data/indialaws"
files = os.listdir(path)

In [40]:
from datapreprocessing.json_to_docs import convert_json_to_docs
# Gather the documents converted from every file under `path` into one flat list.
india_law_docs = []
for file in files:
    india_law_docs += convert_json_to_docs(f"{path}/{file}")

Processing: 100%|██████████| 52/52 [00:00<00:00, 856.26it/s]
Processing: 100%|██████████| 31/31 [00:00<00:00, 381.33it/s]
Processing: 100%|██████████| 166/166 [00:05<00:00, 32.43it/s]
Processing: 100%|██████████| 9/9 [00:00<00:00, 592.24it/s]


In [42]:
len(india_law_docs)

55153

## Random Document Selection from JSON


In [3]:
import json
import numpy as np
import random

In [4]:
def randomly_select_docs(json_file,return_json_filename,sampling_size,seed=None):
    """Write a random per-key sample of documents from a JSON file to a new JSON file.

    The input JSON is loaded as a mapping; every key except the last one is
    treated as a bucket (a "year") holding a list of documents. The last key
    is skipped (presumably it holds non-year metadata -- TODO confirm against
    the scraper output).

    Args:
        json_file: Path of the UTF-8 JSON file to read.
        return_json_filename: Output name without extension; the sample is
            written to "./<return_json_filename>.json" in the current
            working directory.
        sampling_size: Maximum number of documents to keep per key. Keys with
            fewer documents keep all of them.
        seed: Optional seed. When given, both the key shuffle and the
            per-key sampling use a private ``random.Random(seed)``, so the
            same input always produces the same output file. When ``None``
            (the default) the global NumPy / ``random`` state is used,
            exactly as before.

    Returns:
        None. The result is only written to disk.

    Raises:
        ValueError: If ``sampling_size`` is negative (raised by the sampler).
    """
    with open(json_file, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    if seed is None:
        # Unseeded call: keep using the global generators so callers that
        # seed them globally see no change.
        shuffle, sample = np.random.shuffle, random.sample
    else:
        # Seeded call: a private generator makes the run reproducible and
        # leaves the global RNG state untouched.
        rng = random.Random(seed)
        shuffle, sample = rng.shuffle, rng.sample

    years = list(raw_data.keys())[:-1]
    # The shuffle decides the key order of the output file.
    shuffle(years)

    random_dict = {}
    for year in years:
        # Never ask for more documents than the bucket holds.
        k = min(sampling_size, len(raw_data[year]))
        random_dict[year] = sample(raw_data[year],k)

    with open(f"./{return_json_filename}.json","w",encoding="utf-8") as f:
        json.dump(random_dict,f,indent=4,ensure_ascii=False)

## JSON to Documents

In [43]:
from datapreprocessing.json_to_docs import convert_json_to_docs
import os
import json
path="../scraped_data/indialaws"
files = os.listdir(path)

In [44]:
# Same conversion as the earlier `india_law_docs` cell: flatten the documents
# converted from every file under `path` into a single list.
documents = []
for file in files:
    documents += convert_json_to_docs(f"{path}/{file}")

Processing: 100%|██████████| 52/52 [00:00<00:00, 925.56it/s]
Processing: 100%|██████████| 31/31 [00:00<00:00, 359.45it/s]
Processing: 100%|██████████| 166/166 [00:05<00:00, 32.72it/s]
Processing: 100%|██████████| 9/9 [00:00<00:00, 524.68it/s]


## Chunking

In [45]:
from datapreprocessing.docs_to_chunks import convert_docs_to_chunks

In [46]:
chunks = convert_docs_to_chunks(documents)

Processing: 100%|██████████| 55153/55153 [00:17<00:00, 3082.70it/s]


## Context Enrichment

### Merge Shorter Chunks

In [47]:
from datapreprocessing.chunks_to_chunks import merge_shorter_chunks

In [48]:
# NOTE(review): this rebinds `chunks` to the merged result, so re-running the
# cell feeds already-merged chunks back into merge_shorter_chunks. Re-run the
# chunking cell first if the original chunks are needed.
chunks = merge_shorter_chunks(chunks,chunk_overlap=250,threshold=1000)

Before Processing docs:  [548, 557, 560, 603, 613, 618, 620, 622, 634, 641, 654, 656, 660, 670, 682, 685, 689, 705, 711, 741, 790, 815, 832, 837, 850, 860, 863, 866, 876, 881, 887, 891, 893, 901, 904, 906, 909, 924, 941, 944, 953, 957, 960, 963, 970, 974, 976, 981, 989, 993, 996, 1002, 1004, 1017, 1027, 1043, 1056, 1061, 1070, 1101, 1142, 1434, 1470, 1488, 1495, 1502, 1507, 1512, 1558, 1580, 1692, 1726, 1995, 2006, 2016, 2080, 2111, 2125, 2142, 2152, 2165, 2178, 2215, 2219, 2285, 2317, 2328, 2339, 2352, 2362, 2402, 2419, 2428, 2452, 2462, 2490, 2516, 2543, 2629, 2640, 2661, 2684, 2739, 2744, 2752, 2766, 2769, 2774, 2777, 2799, 2802, 2815, 2817, 2878, 3122, 3198, 3202, 3233, 3256, 3299, 3301, 3303, 3328, 3376, 3436, 3722, 3728, 3737, 3740, 3747, 3787, 3802, 3816, 3825, 3834, 3839, 3890, 3935, 3982, 4091, 4137, 4173, 4182, 4195, 4235, 4243, 4252, 4261, 4295, 4307, 4312, 4356, 4361, 4431, 4537, 4542, 4559, 4594, 4618, 4641, 5696, 5751, 5758, 5817, 5826, 6020, 6052, 6081, 6095, 6180, 6184,

Merging: 100%|██████████| 2147/2147 [00:00<00:00, 8005.27it/s] 

Before Processing docs:  []





### Add Chunk Header

In [50]:
from datapreprocessing.chunks_to_chunks import inject_context_to_chunks_parallel , process_chunk

In [51]:
chunks = inject_context_to_chunks_parallel(chunks)

Processing: 100%|██████████| 106002/106002 [01:09<00:00, 1520.46it/s]


## Embedding

In [57]:
from langchain_ollama import OllamaEmbeddings
from datapreprocessing.chunks_to_embedding import embed_documents_to_qdrant

In [58]:
# Configuration for the embedding / vector-store step.
# NOTE(review): `qdrant_url` is not passed to embed_documents_to_qdrant in the
# call below; presumably the helper has its own default URL -- confirm.
qdrant_url = "http://localhost:6333"
# Vector size passed as `vector_dim`; presumably matches the output size of
# the embedding model configured below -- TODO confirm.
EMBED_DIMENSION=768
DISTANCE="Cosine"
COLLECTION_NAME="Union_law_docs"

# Embedding model handed to embed_documents_to_qdrant.
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

In [None]:
# Embed the enriched chunks and store them in the Qdrant collection named by
# COLLECTION_NAME. The helper comes from datapreprocessing.chunks_to_embedding;
# NOTE(review): no Qdrant URL is passed here -- confirm which server it targets.
embed_documents_to_qdrant(
    chunks,
    embedding_model,
    collection_name=COLLECTION_NAME,
    vector_dim=EMBED_DIMENSION,
    distance=DISTANCE
)

Embedding batches: 100%|██████████| 315/315 [04:29<00:00,  1.17it/s]


✅ Completed 10072 chunks in 269.44 seconds





## Retriever

In [None]:
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
qdrant_url = "http://localhost:6333"
COLLECTION_NAME="Union_law_docs"

In [None]:

# Initialize the Qdrant client against `qdrant_url` (gRPC not preferred).
client = QdrantClient(
    url=qdrant_url,
    prefer_grpc=False 
)

# Wrap the existing collection in a LangChain vector store.
# NOTE(review): `OllamaEmbeddings` is imported in the Embedding section, not in
# this section's import cell, and a new instance is built here instead of
# reusing `embedding_model`; the model name must match the one used at
# indexing time.
vectorstore = QdrantVectorStore(
    client=client,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name=COLLECTION_NAME
)

In [None]:
#Create Retriever
retriever = vectorstore.as_retriever()

In [None]:
query="What is the penalty for me if I kill a wild animal"

In [None]:
retriever.invoke(query)


[Document(metadata={'_id': '0822205b-732c-4176-a1b9-faf3f76355f7', '_collection_name': 'Union_law_docs'}, page_content='Saving. - Nothing in this Act shall be deemed to apply to the capture or killing of a wild animal by any person in defence of himself or any other person, or to the capture or killing of any wild bird or animal in bona fide defence of property. The Schedule (i) Bustards, ducks, floricans, jungle fowl, partridges, peafowl, pheasants, pigeons quail, sand-grouse, painted snipe, spurfowl, wood-cock, nerons, egrets, rollers and kingfishers. (ii) Antelopes, asses, bison, buffaloes, deer, gazelles, goats, hares, oxen, rhinoceroses and sheep.'),
 Document(metadata={'_id': '16358dda-a79f-47c6-8fb1-f6d7ac0c1522', '_collection_name': 'Union_law_docs'}, page_content='7. Penalty for contravening section 3.-Whoever, in contravention of section 3, kills, injures or captures, or attempts to kill, injure or capture, any wild elephant, shall be punished with fine which may extend to fi