### RAG Pipeline - Data Ingestion to Vector DB

In [1]:
import os
import pandas as pd
import numpy as np

from pathlib import Path
from typing import List, Dict, Any, Tuple
import chromadb
from chromadb.config import Settings
import uuid
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv

from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage

load_dotenv()

True

In [2]:
def process_cases_from_csv(csv_file_path):
    """Load legal cases from a CSV file into ``Document`` objects.

    The CSV must provide the columns ``id``, ``title`` and ``content``. Every
    row with usable text becomes one ``Document`` whose ``page_content`` is the
    ``content`` cell and whose metadata carries the case id and title.

    Rows whose ``content`` is missing or blank are skipped: pandas reads empty
    cells as float NaN, which is not text and must not be embedded as if it
    were. A missing ``title`` is stored as an empty string so the metadata
    value is always a string.

    Args:
        csv_file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        List of ``Document`` objects in CSV row order.

    Raises:
        KeyError: If a required column is absent from the CSV.
    """
    required_columns = ('id', 'title', 'content')

    # Using pandas for better control over metadata
    df = pd.read_csv(csv_file_path)

    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise KeyError(f"CSV is missing required column(s): {missing_columns}")

    all_documents = []
    skipped_rows = 0

    # Iterating the three columns in lockstep keeps each column's own dtype.
    for case_id, title, content in zip(df['id'], df['title'], df['content']):
        if pd.isna(content) or not str(content).strip():
            skipped_rows += 1
            continue

        document = Document(
            page_content=str(content),
            metadata={
                'case_id': str(case_id),
                'title': '' if pd.isna(title) else str(title),
                'source_file': 'cases_csv',
                'file_type': 'case',
                'document_type': 'legal_case'
            }
        )
        all_documents.append(document)

    if skipped_rows:
        print(f"Skipped {skipped_rows} rows with missing or empty content")
    print(f"Processed {len(all_documents)} cases from CSV")
    return all_documents

In [None]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split case documents into overlapping text chunks.

    Each chunk carries the source document's metadata plus the keys
    ``case_id``, ``case_title``, ``document_type`` and ``source``. The
    enrichment is applied to a copy of each metadata dict, so the documents
    passed in are left unmodified.

    Args:
        documents: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict).
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        List of chunk documents produced by the text splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    texts = []
    metadatas = []
    for doc in documents:
        # Copy first: updating doc.metadata directly would leak the extra keys
        # back into the caller's documents.
        enriched = dict(doc.metadata)
        enriched.update({
            'case_id': doc.metadata.get('case_id', ''),
            'case_title': doc.metadata.get('title', ''),
            'document_type': 'legal_case',
            'source': 'csv_database'
        })
        texts.append(doc.page_content)
        metadatas.append(enriched)

    # Hand the splitter the texts and the enriched metadata copies; it builds
    # the chunk documents from these.
    split_docs = text_splitter.create_documents(texts, metadatas=metadatas)
    print(f"Split {len(texts)} documents into {len(split_docs)} chunks")

    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

### Embedding

In [None]:
class EmbeddingManager:
    """Loads a SentenceTransformer model and encodes texts into embeddings."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model immediately.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model, logging progress; re-raises on failure."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            The array returned by the model's ``encode`` call.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: a plain truthiness test would defer
        # to the model object's own __bool__/__len__, if it defines one, and
        # could report a loaded model as missing.
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


# Shared instance used by the ingestion and retrieval cells below.
# Constructing it loads the default embedding model right away.
embedding_manager = EmbeddingManager()

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


### VectorStore

In [None]:
class VectorStore:
    """Wrapper around a persistent ChromaDB collection of document chunks.

    ``collection`` is the Chroma collection handle; other code queries it
    directly.
    """

    def __init__(self, collection_name: str = "legal_cases", persist_directory: str = "./chroma_db"):
        """Record the settings and open (or create) the collection.

        Args:
            collection_name: Name of the Chroma collection.
            persist_directory: Directory where Chroma persists its data.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persist directory, the client and the collection.

        Logs and re-raises any exception raised along the way.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # No distance metric is configured here, so the collection uses
            # Chroma's default.
            # NOTE(review): confirm which metric that is before changing how
            # callers convert distances into similarity scores.
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Legal case embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray, batch_size: int = 5000):
        """Add documents and their embeddings to the collection in batches.

        Every stored item gets a fresh id of the form
        ``doc_<8 hex chars>_<index>`` and a copy of the document's metadata
        extended with ``doc_index`` (position in ``documents``) and
        ``content_length`` (``len(page_content)``).

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.
            batch_size: Maximum number of documents per ``collection.add`` call.

        Raises:
            ValueError: If the number of documents and embeddings differ, or if
                ``batch_size`` is not a positive integer.
            Exception: Whatever ``collection.add`` raises (logged, then re-raised).
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        # A negative step would make the batching loop a silent no-op and zero
        # would fail inside range(); reject both up front with a clear message.
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        total_documents = len(documents)
        print(f"Adding {total_documents} documents to vector store in batches of {batch_size}...")

        total_added = 0

        for batch_number, batch_start in enumerate(range(0, total_documents, batch_size), start=1):
            batch_docs = documents[batch_start:batch_start + batch_size]
            batch_embeddings = embeddings[batch_start:batch_start + batch_size]
            # Global positions of this batch's documents within ``documents``.
            positions = range(batch_start, batch_start + len(batch_docs))

            ids = [f"doc_{uuid.uuid4().hex[:8]}_{position}" for position in positions]
            metadatas = [
                {**doc.metadata, 'doc_index': position, 'content_length': len(doc.page_content)}
                for position, doc in zip(positions, batch_docs)
            ]
            documents_text = [doc.page_content for doc in batch_docs]
            # Plain nested lists of Python floats for the collection API.
            embeddings_list = np.asarray(batch_embeddings).tolist()

            try:
                self.collection.add(
                    ids=ids,
                    embeddings=embeddings_list,
                    metadatas=metadatas,
                    documents=documents_text
                )
                total_added += len(batch_docs)
                print(f"Successfully added batch {batch_number}: {len(batch_docs)} documents (Total: {total_added}/{total_documents})")

            except Exception as e:
                print(f"Error adding batch {batch_number} to vector store: {e}")
                raise

        print(f"Completed! Total documents in collection: {self.collection.count()}")


# Shared store instance used by the ingestion and retrieval cells below.
# Constructing it opens (or creates) the default "legal_cases" collection under ./chroma_db.
vectorstore = VectorStore()

Vector store initialized. Collection: legal_cases
Existing documents in collection: 0


In [8]:
# Location of the cases CSV. Override it with the CASES_CSV_PATH environment
# variable (for example in .env); the fallback is the original local path.
CASES_CSV_PATH = os.environ.get("CASES_CSV_PATH", r"V:\RAG\data\710edited.csv")

# The collection is persisted on disk and add_documents assigns fresh random
# ids, so running this cell against an already populated collection would
# store every chunk a second time. Only ingest into an empty collection.
existing_count = vectorstore.collection.count()
if existing_count:
    print(
        f"Collection already holds {existing_count} documents; skipping ingestion. "
        f"Remove '{vectorstore.persist_directory}' to rebuild it from the CSV."
    )
else:
    # Process cases from CSV
    case_documents = process_cases_from_csv(CASES_CSV_PATH)

    # Split into chunks
    chunks = split_documents(case_documents)

    # Generate embeddings
    texts = [doc.page_content for doc in chunks]
    embeddings = embedding_manager.generate_embeddings(texts)

    # Store in vector database with batch processing
    vectorstore.add_documents(chunks, embeddings, batch_size=5000)

Processed 710 cases from CSV
Split 710 documents into 43713 chunks

Example chunk:
Content: J.B. PARDIWALA, J.
For the convenience of the exposition, this judgment is divided in the following

parts:

8, CONSTITUENT ASSEMB. DEB., (May 30, 1949) 431.
W.P. (C) No. 1239 of 2023                 ...
Metadata: {'case_id': '82729634', 'title': 'The State Of Tamil Nadu vs The Governor Of Tamilnadu on 8 April, 2025', 'source_file': 'cases_csv', 'file_type': 'case', 'document_type': 'legal_case', 'case_title': 'The State Of Tamil Nadu vs The Governor Of Tamilnadu on 8 April, 2025', 'source': 'csv_database'}
Generating embeddings for 43713 texts...


Batches:   0%|          | 0/1367 [00:00<?, ?it/s]

Generated embeddings with shape: (43713, 384)
Adding 43713 documents to vector store in batches of 5000...
Successfully added batch 1: 5000 documents (Total: 5000/43713)
Successfully added batch 2: 5000 documents (Total: 10000/43713)
Successfully added batch 3: 5000 documents (Total: 15000/43713)
Successfully added batch 4: 5000 documents (Total: 20000/43713)
Successfully added batch 5: 5000 documents (Total: 25000/43713)
Successfully added batch 6: 5000 documents (Total: 30000/43713)
Successfully added batch 7: 5000 documents (Total: 35000/43713)
Successfully added batch 8: 5000 documents (Total: 40000/43713)
Successfully added batch 9: 3713 documents (Total: 43713/43713)
Completed! Total documents in collection: 43713


### Retriever Pipeline From VectorStore

In [12]:
class RAGRetriever:
    """Query-time retrieval: embed the question, search the store, filter by score."""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.embedding_manager = embedding_manager
        self.vector_store = vector_store

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.3):
        """Return the stored chunks most similar to ``query``.

        The query is embedded, the collection is asked for ``top_k`` neighbours,
        and each returned distance ``d`` is mapped to a score ``1 - d / 2``.
        Only hits whose score is at least ``score_threshold`` are kept.

        NOTE(review): the ``1 - d / 2`` mapping presumably relies on a specific
        distance metric and on unit-length embeddings; confirm against the
        collection's configuration.

        Args:
            query: Natural-language question.
            top_k: Number of neighbours requested from the collection.
            score_threshold: Minimum score a hit needs to be returned.

        Returns:
            List of dicts with the keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the unfiltered result list). Empty when nothing qualifies or
            when the search raises.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Embed the query text
        query_vector = self.embedding_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_vector.tolist()],
                n_results=top_k
            )

            # Guard clause: nothing came back for the single query we sent.
            if not (results['documents'] and results['documents'][0]):
                print("No documents found in results")
                return []

            documents = results['documents'][0]
            metadatas = results['metadatas'][0]
            distances = results['distances'][0]
            ids = results['ids'][0]

            # Log every raw distance and its score before filtering.
            print("=== Similarity Scores ===")
            for position, distance in enumerate(distances, start=1):
                print(f"Document {position}: distance={distance:.4f}, similarity={1 - (distance / 2):.4f}")

            retrieved_docs = []
            hits = zip(ids, documents, metadatas, distances)
            for rank, (doc_id, document, metadata, distance) in enumerate(hits, start=1):
                similarity_score = 1 - (distance / 2)
                if similarity_score >= score_threshold:
                    retrieved_docs.append({
                        'id': doc_id,
                        'content': document,
                        'metadata': metadata,
                        'similarity_score': similarity_score,
                        'distance': distance,
                        'rank': rank
                    })

            print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            return retrieved_docs

        except Exception as e:
            # Best effort: report the failure and fall back to "no results".
            print(f"Error during retrieval: {e}")
            return []

# Build the retriever on top of the shared vector store and embedding manager.
# Re-run this line after editing RAGRetriever so the instance uses the updated class.
rag_retriever = RAGRetriever(vectorstore, embedding_manager)

### RAG Pipeline - VectorDB To LLM Output Generation

In [17]:
class GroqLLM:
    """Wrapper around ``ChatGroq`` that answers questions from retrieved context."""

    # Prompt text used by generate_response; {context} and {question} are filled per call.
    PROMPT_TEMPLATE = """You are a legal AI assistant specializing in case law analysis. Use the following legal case context to answer the question accurately and concisely.

Context:
{context}

Question: {question}

Answer: Provide a clear, informative legal answer based strictly on the context above. If the context doesn't contain relevant information, state that clearly."""

    def __init__(self, model_name: str = "llama-3.1-8b-instant", api_key: str = None,
                 temperature: float = 0.1, max_tokens: int = 1024):
        """Create the chat client.

        Args:
            model_name: Groq model identifier.
            api_key: Groq API key; falls back to the ``GROQ_API_KEY``
                environment variable when omitted.
            temperature: Sampling temperature passed to ``ChatGroq``.
            max_tokens: Completion token limit passed to ``ChatGroq``.

        Raises:
            ValueError: If no API key is supplied or found in the environment.
        """
        self.model_name = model_name
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")

        if not self.api_key:
            raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable")

        self.temperature = temperature
        self.max_tokens = max_tokens

        self.llm = ChatGroq(
            groq_api_key=self.api_key,
            model_name=self.model_name,
            temperature=self.temperature,
            max_tokens=self.max_tokens
        )

        # Built once here instead of on every generate_response call.
        self.prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template=self.PROMPT_TEMPLATE
        )

        print(f"Initialized Groq LLM with model: {self.model_name}")

    def generate_response(self, query: str, context: str) -> str:
        """Generate response using retrieved context.

        Args:
            query: The user's question.
            context: Retrieved text the answer should be based on.

        Returns:
            The model's answer text, or a string starting with
            ``"Error generating response: "`` if formatting or the model call
            raises.
        """
        try:
            formatted_prompt = self.prompt_template.format(context=context, question=query)
            messages = [HumanMessage(content=formatted_prompt)]
            response = self.llm.invoke(messages)
            return response.content
        except Exception as e:
            # Callers get the failure as text rather than an exception.
            return f"Error generating response: {str(e)}"

# Create the shared LLM client; without an API key we warn and carry on with
# groq_llm = None so later cells can check for it.
try:
    groq_llm = GroqLLM(model_name="llama-3.1-8b-instant", api_key=os.environ.get("GROQ_API_KEY"))
except ValueError as missing_key_error:
    print(f"Warning: {missing_key_error}")
    groq_llm = None
else:
    print("Groq LLM initialized successfully!")

Initialized Groq LLM with model: llama-3.1-8b-instant
Groq LLM initialized successfully!


In [19]:
def rag_simple(query, retriever, llm, top_k=3, score_threshold=0.5):
    """Answer ``query`` from retrieved context in one retrieve-then-generate step.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=..., score_threshold=...)``
            returns a list of dicts with a ``'content'`` entry.
        llm: Wrapper exposing the chat model as ``llm.llm`` (its ``invoke`` is
            called with a single ``HumanMessage``).
        top_k: Number of chunks requested from the retriever.
        score_threshold: Minimum score passed to the retriever.

    Returns:
        The model's answer text; a fixed notice when no context was retrieved;
        or ``"Error: <message>"`` if the model call raises.
    """
    hits = retriever.retrieve(query, top_k=top_k, score_threshold=score_threshold)

    # Blank-line separated chunk texts; empty when nothing was retrieved.
    context = "\n\n".join(hit['content'] for hit in (hits or []))
    if not context:
        return "No relevant legal context found to answer the question."

    prompt = (
        "Use the following legal case context to answer the question concisely:\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Answer:"
    )

    try:
        reply = llm.llm.invoke([HumanMessage(content=prompt)])
    except Exception as e:
        return f"Error: {str(e)}"
    return reply.content

# Smoke-test the full pipeline; skipped when the LLM could not be initialised.
if groq_llm:
    answer = rag_simple(
        query="What is the main legal issue in this case?",
        retriever=rag_retriever,
        llm=groq_llm,
        top_k=5,
        score_threshold=0.5,
    )
    print(answer)

Retrieving documents for query: 'What is the main legal issue in this case?'
Top K: 5, Score threshold: 0.5
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (1, 384)
=== Similarity Scores ===
Document 1: distance=0.8641, similarity=0.5680
Document 2: distance=0.8655, similarity=0.5673
Document 3: distance=0.9002, similarity=0.5499
Document 4: distance=0.9121, similarity=0.5440
Document 5: distance=0.9145, similarity=0.5428
Retrieved 5 documents (after filtering)
The main legal issue in this case is whether the Magistrate was justified in exercising his inherent jurisdiction under Section 482 Cr.P.C. to prevent abuse of the process of court and to secure the ends of justice, particularly in a situation where the civil proceedings are pending and the dispute is primarily of a civil nature.
