In [7]:
!pip install chromadb


Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-win_amd64.whl.metadata (262 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp310-cp310-win_amd64.whl.metadata (4.9 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.31.1-py3-none-any.whl.metadata (1.6 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  D

In [8]:
import chromadb
print(chromadb.__version__)


0.6.3


In [9]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121


In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Using device: {device}")

# Load InLegalBERT Model
model_name = "law-ai/InLegalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()


  from .autonotebook import tqdm as notebook_tqdm


✅ Using device: cuda


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [2]:
import chromadb

# Initialize ChromaDB (persistent mode)
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Create collection
collection = chroma_client.get_or_create_collection("legal_docs")


In [3]:
import numpy as np

def get_embedding(text):
    """Encodes the input text using InLegalBERT and returns a fixed-size embedding."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Extract CLS token
    return embedding.flatten()


In [4]:
import pandas as pd
import glob
from tqdm import tqdm

# ✅ Get all batch files (assuming they are named like processed_batch_0.parquet, processed_batch_1.parquet, etc.)
batch_files = sorted(glob.glob("processed_batch_*.parquet"))
print(f"📂 Found {len(batch_files)} batch files!")

# ✅ Process each batch file separately
for batch_id, batch_file in enumerate(batch_files):
    print(f"\n🚀 Processing {batch_file} ({batch_id + 1}/{len(batch_files)})...")

    # Load batch (only one at a time)
    batch_df = pd.read_parquet(batch_file)

    all_embeddings = []
    all_texts = []
    all_ids = []

    for i, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Embedding Batch {batch_id + 1}"):
        text = row["cleaned_text"]
        pos = row["pos_tags"]
        ner = row["named_entities"]

        # Combine text + POS + NER
        combined_text = f"{text} POS: {' '.join([t[1] for t in pos])} NER: {' '.join([t[1] for t in ner])}"
        
        # Compute embedding
        embedding = get_embedding(combined_text)
        
        # Store for ChromaDB
        all_embeddings.append(embedding.tolist())
        all_texts.append(text)
        all_ids.append(f"batch_{batch_id}_{i}")

    # Store batch in ChromaDB
    collection.add(ids=all_ids, documents=all_texts, embeddings=all_embeddings)

    print(f"✅ {batch_file} stored in ChromaDB successfully!")

print("\n🎉 All batches processed & stored successfully in ChromaDB!")

📂 Found 12 batch files!

🚀 Processing processed_batch_0.parquet (1/12)...


Embedding Batch 1: 100%|██████████| 1000/1000 [01:19<00:00, 12.58it/s]


✅ processed_batch_0.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_1.parquet (2/12)...


Embedding Batch 2: 100%|██████████| 1000/1000 [01:27<00:00, 11.49it/s]


✅ processed_batch_1.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_10.parquet (3/12)...


Embedding Batch 3: 100%|██████████| 1000/1000 [01:33<00:00, 10.75it/s]


✅ processed_batch_10.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_11.parquet (4/12)...


Embedding Batch 4: 100%|██████████| 970/970 [01:14<00:00, 12.97it/s]


✅ processed_batch_11.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_2.parquet (5/12)...


Embedding Batch 5: 100%|██████████| 1000/1000 [01:29<00:00, 11.17it/s]


✅ processed_batch_2.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_3.parquet (6/12)...


Embedding Batch 6: 100%|██████████| 1000/1000 [01:22<00:00, 12.05it/s]


✅ processed_batch_3.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_4.parquet (7/12)...


Embedding Batch 7: 100%|██████████| 1000/1000 [01:35<00:00, 10.51it/s]


✅ processed_batch_4.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_5.parquet (8/12)...


Embedding Batch 8: 100%|██████████| 1000/1000 [01:35<00:00, 10.52it/s]


✅ processed_batch_5.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_6.parquet (9/12)...


Embedding Batch 9: 100%|██████████| 1000/1000 [01:23<00:00, 11.93it/s]


✅ processed_batch_6.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_7.parquet (10/12)...


Embedding Batch 10: 100%|██████████| 1000/1000 [01:59<00:00,  8.36it/s]


✅ processed_batch_7.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_8.parquet (11/12)...


Embedding Batch 11: 100%|██████████| 1000/1000 [01:19<00:00, 12.64it/s]


✅ processed_batch_8.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_9.parquet (12/12)...


Embedding Batch 12: 100%|██████████| 1000/1000 [01:25<00:00, 11.72it/s]


✅ processed_batch_9.parquet stored in ChromaDB successfully!

🎉 All batches processed & stored successfully in ChromaDB!


In [5]:
print(f"✅ Total documents in ChromaDB: {collection.count()}")


✅ Total documents in ChromaDB: 11970


In [6]:
def search_similar_cases(query_text, top_k=5):
    """Finds the most similar cases in ChromaDB given a query."""
    query_embedding = get_embedding(query_text).tolist()
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )

    print("\n🔍 **Top Similar Cases:**")
    for i, doc in enumerate(results["documents"][0]):
        print(f"{i+1}. {doc[:200]}...")  # Show first 200 characters of case text

# Example query
search_similar_cases("tenant eviction under Section 39 of Delhi Rent Control Act")



🔍 **Top Similar Cases:**
1. JUDGMENT C.K. Mahajan, J. 1. This Civil Revision raises an interesting question of law with regard to landlord-tenant relationship in the post Rent Act era. 2. Premises No. L-3 Kanchanjunga, 18 Barakh...
2. JUDGMENT C.K. Mahajan, J. 1. This Civil Revision raises an interesting question of law with regard to landlord-tenant relationship in the post Rent Act era. 2. Premises No. L-3 Kanchanjunga, 18 Barakh...
3. PETITIONER: B. BANERJEE Vs. RESPONDENT: ANITA PAN DATE OF JUDGMENT20/11/1974 BENCH: KRISHNAIYER, V.R. BENCH: KRISHNAIYER, V.R. BEG, M. HAMEEDULLAH GOSWAMI, P.K. CITATION: 1975 AIR 1146 1975 SCR (2) 77...
4. JUDGMENT M.M. Kumar, J. 1. This is tenant's petition filed under Section 15(6) of the Haryana Urban (Control of Rent and Eviction) Act, 1973, challenging order of reversal dated 11.3.1999, passed by t...
5. IN THE COURT OF SHRI M. P. SINGH: SENIOR CIVIL JUDGE : RENT CONTROLLER: KARKARDOOMA COURTS (EAST), DELHI Suit No. 291/11 Unique Case ID No. 024

In [1]:
import pandas as pd
import glob
from tqdm import tqdm
import chromadb
from transformers import AutoTokenizer, AutoModel
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ Load InLegalBERT tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT")

# ✅ Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="legal_sentences")


In [3]:
def decode_tokens(tokenized_sentence):
    """Convert tokenized sentence back to readable text."""
    return tokenizer.decode(tokenized_sentence, skip_special_tokens=True)

def get_embedding(text):
    """Generate embeddings using InLegalBERT."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # Extract CLS token embedding


In [4]:
# ✅ Get all batch files (processed_batch_*.parquet)
batch_files = sorted(glob.glob("processed_batch_*.parquet"))
print(f"📂 Found {len(batch_files)} batch files!")

# ✅ Process each batch file separately
for batch_id, batch_file in enumerate(batch_files):
    print(f"\n🚀 Processing {batch_file} ({batch_id + 1}/{len(batch_files)})...")

    # ✅ Load batch
    batch_df = pd.read_parquet(batch_file)

    all_sentence_embeddings = []
    all_sentence_texts = []
    all_sentence_ids = []

    # ✅ Process each case in the batch
    for i, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Embedding Sentences in Batch {batch_id + 1}"):
        case_id = f"batch_{batch_id}_{i}"  # ✅ Unique identifier for case-level mapping
        tokenized_sentences = row["tokenized_text"]  # ✅ List of tokenized sentences

        # ✅ Process each tokenized sentence
        for idx, tokenized_sentence in enumerate(tokenized_sentences):
            decoded_text = decode_tokens(tokenized_sentence)  # Convert back to text
            embedding = get_embedding(decoded_text)  # Compute embedding
            
            # ✅ Unique identifier (matches whole-case embedding structure)
            sentence_id = f"{case_id}_{idx}"

            all_sentence_embeddings.append(embedding.tolist())
            all_sentence_texts.append(decoded_text)
            all_sentence_ids.append(sentence_id)

    # ✅ Store batch of sentence embeddings in ChromaDB
    collection.add(ids=all_sentence_ids, documents=all_sentence_texts, embeddings=all_sentence_embeddings)

    print(f"✅ Sentence embeddings from {batch_file} stored in ChromaDB successfully!")

print("\n🎉 All sentence embeddings processed & stored successfully in ChromaDB!")


📂 Found 12 batch files!

🚀 Processing processed_batch_0.parquet (1/12)...


Embedding Sentences in Batch 1: 100%|██████████| 1000/1000 [08:52<00:00,  1.88it/s]


✅ Sentence embeddings from processed_batch_0.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_1.parquet (2/12)...


Embedding Sentences in Batch 2: 100%|██████████| 1000/1000 [05:13<00:00,  3.19it/s]


✅ Sentence embeddings from processed_batch_1.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_10.parquet (3/12)...


Embedding Sentences in Batch 3: 100%|██████████| 1000/1000 [05:42<00:00,  2.92it/s]


✅ Sentence embeddings from processed_batch_10.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_11.parquet (4/12)...


Embedding Sentences in Batch 4: 100%|██████████| 970/970 [12:17<00:00,  1.31it/s]


✅ Sentence embeddings from processed_batch_11.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_2.parquet (5/12)...


Embedding Sentences in Batch 5: 100%|██████████| 1000/1000 [08:06<00:00,  2.06it/s]


✅ Sentence embeddings from processed_batch_2.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_3.parquet (6/12)...


Embedding Sentences in Batch 6: 100%|██████████| 1000/1000 [06:13<00:00,  2.67it/s]


✅ Sentence embeddings from processed_batch_3.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_4.parquet (7/12)...


Embedding Sentences in Batch 7: 100%|██████████| 1000/1000 [06:20<00:00,  2.63it/s]


✅ Sentence embeddings from processed_batch_4.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_5.parquet (8/12)...


Embedding Sentences in Batch 8: 100%|██████████| 1000/1000 [07:18<00:00,  2.28it/s]


✅ Sentence embeddings from processed_batch_5.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_6.parquet (9/12)...


Embedding Sentences in Batch 9: 100%|██████████| 1000/1000 [08:58<00:00,  1.86it/s]


✅ Sentence embeddings from processed_batch_6.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_7.parquet (10/12)...


Embedding Sentences in Batch 10: 100%|██████████| 1000/1000 [17:19<00:00,  1.04s/it]


✅ Sentence embeddings from processed_batch_7.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_8.parquet (11/12)...


Embedding Sentences in Batch 11: 100%|██████████| 1000/1000 [1:17:23<00:00,  4.64s/it]    


✅ Sentence embeddings from processed_batch_8.parquet stored in ChromaDB successfully!

🚀 Processing processed_batch_9.parquet (12/12)...


Embedding Sentences in Batch 12: 100%|██████████| 1000/1000 [06:57<00:00,  2.39it/s]


✅ Sentence embeddings from processed_batch_9.parquet stored in ChromaDB successfully!

🎉 All sentence embeddings processed & stored successfully in ChromaDB!


In [3]:
import chromadb
import torch
from transformers import AutoModel, AutoTokenizer

# ✅ Load InLegalBert tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBert")
model = AutoModel.from_pretrained("law-ai/InLegalBert")

# ✅ Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Update with actual path
case_collection = chroma_client.get_collection("legal_docs")  # Whole-case embeddings
sentence_collection = chroma_client.get_collection("legal_sentences")  # Sentence embeddings

def embed_text(text):
    """Generate embeddings using InLegalBert"""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # Extract CLS token

def retrieve_dynamic_sentences(query, top_k_cases=5, max_tokens=3000):
    """
    Retrieves top relevant cases and extracts the most relevant sentences while keeping the token count under max_tokens.
    """
    # 🔹 Convert query to embedding using InLegalBert (matches stored format)
    query_embedding = embed_text(query)

    # 🔹 Retrieve the top K most relevant cases
    case_results = case_collection.query(query_embeddings=[query_embedding], n_results=top_k_cases)
    
    retrieved_cases = case_results["documents"][0]  # Extract top case texts
    case_ids = case_results["ids"][0]  # Extract their unique identifiers

    print(f"🔍 Retrieved {len(retrieved_cases)} relevant cases!")

    # 🔹 Retrieve sentences from the selected cases
    all_retrieved_sentences = []
    total_tokens = 0

    for case_id in case_ids:
        sentence_results = sentence_collection.query(query_embeddings=[query_embedding], n_results=10, where={"case_id": case_id})
        sentences = sentence_results["documents"][0]  # Extract relevant sentences

        for sent in sentences:
            sent_tokens = len(tokenizer.tokenize(sent))

            if total_tokens + sent_tokens <= max_tokens:
                all_retrieved_sentences.append(sent)
                total_tokens += sent_tokens
            else:
                break  # Stop if token limit is reached

        if total_tokens >= max_tokens:
            break  # Stop if token limit is reached across cases

    print(f"✅ Retrieved {len(all_retrieved_sentences)} sentences within {total_tokens} tokens.")
    return all_retrieved_sentences


# ✅ Example Query
query = "What are the legal provisions for forming a new state in India?"
retrieved_sentences = retrieve_dynamic_sentences(query, top_k_cases=5, max_tokens=3000)

print("\n🔹 Final Retrieved Sentences:\n")
for i, sent in enumerate(retrieved_sentences, 1):
    print(f"{i}. {sent}")


🔍 Retrieved 5 relevant cases!
✅ Retrieved 0 sentences within 0 tokens.

🔹 Final Retrieved Sentences:



In [4]:
print("🔍 Checking stored sentence count in ChromaDB...")
print(f"Total sentences stored: {sentence_collection.count()}")


🔍 Checking stored sentence count in ChromaDB...
Total sentences stored: 11970


In [6]:
query = "What are the legal provisions for forming a new state in India?"
query_embedding = embed_text(query)
sentence_results = sentence_collection.query(query_embeddings=[query_embedding], n_results=10)
print("\n📌 Retrieved Sentences (Without Filtering):", sentence_results["documents"])



📌 Retrieved Sentences (Without Filtering): [["petitioner : ramchandra keshav adke & ors vs. respondent : govind joti chavare and ors. date of judgment04 / 03 / 1975 bench : sarkaria, ranjit singh bench : sarkaria, ranjit singh chandrachud, y. v. gupta, a. c. citation : 1975 air 915 1975 scr ( 3 ) 839 1975 scc ( 1 ) 559 act : bombay tenancy act ( 67 of 1948 ) - - s. 5 ( 3 ) - - scope of. headnote : section 5 ( 3 ) ( b ) of the bombay tenancy act enacts that a tenant may terminate the tenancy at any time by surrendering his interest as a tenant in favour of the landlord provided that such surrender shall be in writing and shall be verified before the mamlatdar in the manner prescribed. rule 2 - a of the rules states that the mamlatdar, when verifying a surrender of a tenancy by a tenant, shall satisfy himself after such inquiry as he thinks fit, that the tenant understands the nature and consequences of the surrender and also that it is voluntary, and shall endorse his findings in that 