In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA



In [7]:
## Read the ppdfs from the folder
from langchain_community.document_loaders import UnstructuredPDFLoader


loader = UnstructuredPDFLoader(r"C:\Users\swanu\anaconda3\Scripts\RAG_GnSR\GnSR and Manual\KRCL G & SR 2020.pdf",mode="elements")
docs = loader.load()

from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re
from typing import List, Optional, Tuple

def process_legal_documents(documents: List[Document]) -> List[Document]:
    """Process loaded PDF documents into structured legal chunks with reference extraction."""
    
    # Step 1: Group by rule number with enhanced pattern matching
    def group_by_rule(docs: List[Document]) -> List[Document]:
        # Pattern matches all rule types:
        # - S.R.2.24, G.R.3.15
        # - Standard rules: 2.03, 3.01
        # - Rule 4.05, Article 5.02, §6.01
        rule_pattern = re.compile(
            r"^(?:(?:S\.R\.|G\.R\.|Rule|Article|Section|§)?\s*)?"
            r"((?:[SG]\.R\.)?\d{1,2}\.\d{2}(?:\.\d+)?|\d{1,2}\-\d{2})", 
            re.IGNORECASE
        )
        grouped = []
        current_rule = None
        current_metadata = {}
        buffer = []

        for doc in docs:
            text = doc.page_content.strip()
            if not text:
                continue

            if not current_metadata:
                current_metadata = doc.metadata.copy()

            match = rule_pattern.match(text)
            if match:
                if current_rule and buffer:
                    grouped.append(create_rule_document(buffer, current_rule, current_metadata))
                current_rule = match.group(1)
                # Normalize rule format
                current_rule = current_rule.replace(' ', '')  # Remove spaces in S.R./G.R.
                buffer = [text]
                current_metadata = doc.metadata.copy()
            else:
                buffer.append(text)

        if current_rule and buffer:
            grouped.append(create_rule_document(buffer, current_rule, current_metadata))

        return grouped

    def create_rule_document(content: List[str], rule: str, metadata: dict) -> Document:
        """Create a rule document with proper metadata."""
        rule_type = "SR" if rule.startswith('S.R.') else \
                   "GR" if rule.startswith('G.R.') else "Standard"
        
        metadata = metadata.copy()
        metadata.update({
            "rule": rule,
            "rule_type": rule_type,
            "source": metadata.get("source", "KRCL G & SR 2020.pdf"),
            "document_type": "legal_rule"
        })
        return Document(
            page_content="\n".join(content),
            metadata=metadata
        )

    # Step 2: Group the documents by rule
    grouped_docs = group_by_rule(documents)

    # Step 3: Merge small rules intelligently
    def merge_small_rules(docs: List[Document], min_length: int = 100) -> List[Document]:
        merged = []
        previous_doc = None

        for i, doc in enumerate(docs):
            # Handle first document case
            if i == 0 and len(doc.page_content) < min_length:
                if len(docs) > 1 and docs[i+1].metadata.get('rule_type') == doc.metadata.get('rule_type'):
                    # Merge forward with next document
                    docs[i+1].page_content = doc.page_content + "\n\n" + docs[i+1].page_content
                    docs[i+1].metadata["combined_rules"] = (
                        doc.metadata["rule"] + "; " + 
                        docs[i+1].metadata.get("combined_rules", docs[i+1].metadata["rule"])
                    )
                    continue
                merged.append(doc)
                continue

            if previous_doc and len(doc.page_content) < min_length:
                # Only merge if same rule type
                if previous_doc.metadata.get('rule_type') == doc.metadata.get('rule_type'):
                    previous_doc.page_content += "\n\n" + doc.page_content
                    previous_doc.metadata["combined_rules"] = (
                        previous_doc.metadata.get("combined_rules", previous_doc.metadata["rule"]) + 
                        f"; {doc.metadata['rule']}"
                    )
                    continue
                else:
                    merged.append(previous_doc)
                    previous_doc = doc
                    continue
            else:
                if previous_doc:
                    merged.append(previous_doc)
                previous_doc = doc

        if previous_doc:
            merged.append(previous_doc)

        return merged

    merged_docs = merge_small_rules(grouped_docs)

    # Step 4: Split long rules with metadata preservation and reference extraction
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        length_function=len,
        keep_separator=True
    )

    final_documents = []
    for doc in merged_docs:
        try:
            splits = splitter.split_documents([doc])
            
            # Extract references from each split
            reference_pattern = re.compile(
                r"\b(S\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\s+of\s+(G\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\b"
            )
            
            for chunk_idx, split in enumerate(splits, 1):
                # Extract references
                references = reference_pattern.findall(split.page_content)
                if references:
                    split.metadata["references"] = [f"{sr} of {gr}" for sr, gr in references]
                
                # Merge metadata with priority to split-specific values
                split.metadata = {
                    **doc.metadata,
                    **split.metadata,  # Preserves extracted references
                    "chunk_id": f"{doc.metadata['rule']}_chunk_{chunk_idx}",
                    "total_chunks": len(splits),
                    "chunk_number": chunk_idx
                }
            
            final_documents.extend(splits)
        except Exception as e:
            print(f"Error splitting rule {doc.metadata['rule']}: {str(e)}")
            doc.metadata.update({
                "chunk_id": f"{doc.metadata['rule']}_chunk_1",
                "total_chunks": 1,
                "chunk_number": 1,
                "references": extract_references(doc.page_content)  # Extract even if not split
            })
            final_documents.append(doc)

    return final_documents

def extract_references(text: str) -> List[str]:
    """Extract cross-references from text."""
    reference_pattern = re.compile(
        r"\b(S\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\s+of\s+(G\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\b"
    )
    references = reference_pattern.findall(text)
    return [f"{sr} of {gr}" for sr, gr in references]

# Process documents
final_documents = process_legal_documents(docs)

final_documents[0]

Document(metadata={'source': 'C:\\Users\\swanu\\anaconda3\\Scripts\\RAG_GnSR\\GnSR and Manual\\KRCL G & SR 2020.pdf', 'coordinates': {'points': ((15.0, 164.29199284452147), (15.0, 197.29004359452148), (161.778320230542, 197.29004359452148), (161.778320230542, 164.29199284452147)), 'system': 'PixelSpace', 'layout_width': 510.0, 'layout_height': 680.0}, 'file_directory': 'C:\\Users\\swanu\\anaconda3\\Scripts\\RAG_GnSR\\GnSR and Manual', 'filename': 'KRCL G & SR 2020.pdf', 'languages': ['eng'], 'last_modified': '2025-06-24T23:09:43', 'page_number': 6, 'filetype': 'application/pdf', 'category': 'Title', 'element_id': '6c056a3a6296695a75b3afd832abdfa3', 'rule': '1.01', 'rule_type': 'Standard', 'document_type': 'legal_rule', 'chunk_id': '1.01_chunk_1', 'total_chunks': 1, 'chunk_number': 1}, page_content='1.01 1.02 Definitions 1.03 Classification of stations\nShort title and commencement\nCHAPTER II\nRULES APPLYING TO RAILWAY SERVANTS GENERALLY')

In [8]:
len(final_documents)

1695

In [10]:
## Embedding Using Huggingface
from langchain.embeddings import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},  # This enables GPU acceleration
)

In [11]:
import  numpy as np
print(np.array(huggingface_embeddings.embed_query(final_documents[0].page_content)))
print(np.array(huggingface_embeddings.embed_query(final_documents[0].page_content)).shape)

[-1.54984603e-02 -1.32178171e-02  9.28688720e-02  1.21875275e-02
 -2.52666008e-02  1.15616992e-01  5.61870774e-03  3.02802511e-02
 -1.37848243e-01 -5.20090945e-02 -4.14693728e-02  5.04869677e-04
  3.97413708e-02 -6.86438382e-02 -4.11508232e-02 -3.02156173e-02
  4.04950157e-02 -2.42434852e-02  3.48446928e-02  3.09494361e-02
  7.56424367e-02  1.07497722e-02 -9.58742797e-02  4.69163731e-02
 -5.06225834e-03 -4.95175608e-02 -2.27279272e-02  9.62005407e-02
  1.49883777e-02 -4.47839824e-03 -6.16348088e-02  5.39363101e-02
  1.13636062e-01  5.06605543e-02  4.51007998e-03  4.06036153e-02
  3.07764746e-02 -1.54581247e-02  2.92374920e-02  6.98646083e-02
 -4.87504564e-02 -9.59559903e-02 -6.63275942e-02  1.08579598e-01
  6.43876418e-02  2.46755537e-02  1.74940974e-02 -9.55154225e-02
 -1.26703352e-01 -1.73531962e-03 -3.29455733e-02  2.66789533e-02
  3.45032779e-03  9.94323716e-02 -4.55959961e-02 -1.46388300e-02
 -4.57849689e-02  8.48745089e-03  8.64446349e-03  8.00068304e-02
 -6.13888241e-02 -3.89190

In [12]:
## VectorStore Creation
vectorstore=FAISS.from_documents(final_documents[:120],huggingface_embeddings)

In [14]:
## Query using Similarity Search
query="WHAT IS Train Parting?"
relevant_docments=vectorstore.similarity_search(query)

print(relevant_docments[0].page_content)

16.01 Knowledge of signals 16.02 Supply and care of equipment 16.03 Road traffic 16.04 Gateman to observe passing trains 16.05 Channel for flange of wheels 16.06 Defects at level crossings 16.07 Obstructions at level crossings 16.08 Parting of a train 16.09 Trespassing 16.10 Transfer of charge of gate 16.11 Height gauges
336xvii 336 337 352 355
355
364 375 375 375 375 376 376 376 376
377
377 378 378 379 379 381 381 397 401 401 403
404-413
404
404 404 406 409 410 410 410 412 413 413 413
xviii
CHAPTER XVII


In [15]:
retriever=vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":3})
print(retriever)

tags=['FAISS', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002F03B206810> search_kwargs={'k': 3}


In [16]:
import os
os.environ['HUGGINGFACEHUB_API_TOKEN']="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"

The Hugging Face Hub is an platform with over 350k models, 75k datasets, and 150k demo apps (Spaces), all open source and publicly available, in an online platform where people can easily collaborate and build ML together.

In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load FLAN-T5
model_id = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Prepare query
query = "What is the protection of trains?"

inputs = tokenizer(query, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=150)

# Decode and print response
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

rails


In [None]:
#Hugging Face models can be run locally through the HuggingFacePipeline class.
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300}
)

llm = hf 
llm.invoke(query)

In [22]:
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [24]:
prompt=PromptTemplate(template=prompt_template,input_variables=["context","question"])

In [25]:
retrievalQA=RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt":prompt}
)

In [28]:
query="""DIFFERENCES IN THE
UNINSURED RATE BY STATE
IN 2022"""

In [29]:
# Call the QA chain with our query.
result = retrievalQA.invoke({"query": query})
print(result['result'])


Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

comparison of ACS and CPS ASEC measures 
of health insurance coverage, refer to < www.
census.gov/topics/health/health-insurance/
guidance.html >.
9 Respondents may have more than one 
health insurance coverage type at the time 
of interview. As a result, adding the total 
number of people with private coverage and 
the total number with public coverage will 
sum to more than the total number with any 
coverage.• From 2021 to 2022, nine states 
reported increases in private 
coverage, while seven reported 
decreases (Appendix Table B-2). 
DIFFERENCES IN THE 
UNINSURED RATE BY STATE 
IN 2022
In 2022, uninsured rates at the 
time of interview ranged across 
states from a low of 2.4 percent 
in Massachusetts to a high of 16.6 
percent in Texas, compared to the 
national rate of 8.0 percent.10 Ten 
of the 15 states with uninsured 
10 The uninsured rates in the Distr