# Retrieval and Chain


In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader

# Load the rule book PDF; mode="elements" is passed so the loader returns
# individual elements rather than one document for the whole file
# (assumed from the Unstructured loader's documented modes — confirm).
pdf_path = "KRCL G & SR 2020.pdf"
loader = UnstructuredPDFLoader(pdf_path, mode="elements")
docs = loader.load()

In [2]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re
from typing import List, Optional, Tuple

def process_legal_documents(
    documents: List[Document],
    *,
    min_rule_length: int = 100,
    chunk_size: int = 800,
    chunk_overlap: int = 150,
    default_source: str = "KRCL G & SR 2020.pdf",
) -> List[Document]:
    """Turn loaded PDF elements into rule-level chunks with cross-references.

    The pipeline has three stages:

    1. Group consecutive elements under the rule heading that precedes them.
    2. Merge rules shorter than ``min_rule_length`` characters into a
       neighbouring rule of the same ``rule_type``.
    3. Split every rule into overlapping chunks and tag each chunk with its
       rule, its position, and any "S.R.x of G.R.y" cross-references.

    Elements that appear before the first recognised rule heading are
    discarded (there is no rule to attach them to).

    Args:
        documents: Elements in reading order; each needs ``page_content``
            and a ``metadata`` dict.
        min_rule_length: Rules whose text is shorter than this many
            characters are merged into a neighbour of the same rule type.
        chunk_size: Maximum chunk length handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks of one rule.
        default_source: ``source`` metadata value used when the element
            that starts a rule does not carry one.

    Returns:
        The chunk documents. Every chunk's metadata contains ``rule``
        (e.g. ``"4.13"`` or ``"S.R.2.24"``), ``rule_type`` (``"SR"``,
        ``"GR"`` or ``"Standard"``), ``source``, ``document_type``,
        ``chunk_id``, ``total_chunks`` and ``chunk_number``;
        ``combined_rules`` is present on merged rules and ``references``
        on chunks that contain cross-references.
    """

    # Heading = optional keyword, optional S.R./G.R. prefix, rule number.
    # The prefix has its own capture group: if it is folded into the optional
    # keyword alternative it gets consumed *outside* the captured text, every
    # rule is then classified "Standard", and S.R./G.R. rules that share a
    # number end up with identical `rule` / `chunk_id` values.
    rule_pattern = re.compile(
        r"^(?:Rule|Article|Section|§)?\s*"
        r"(?:(?P<prefix>[SG])\.\s*R\.\s*)?"
        r"(?P<number>\d{1,2}\.\d{2}(?:\.\d+)?|\d{1,2}-\d{2})",
        re.IGNORECASE,
    )

    def parse_rule_heading(text: str) -> Optional[str]:
        """Return the canonical rule id opening *text*, or None if there is none."""
        match = rule_pattern.match(text)
        if match is None:
            return None
        prefix = match.group("prefix")
        number = match.group("number")
        # Canonical form has no spaces and an upper-case prefix: "S.R.2.24".
        return f"{prefix.upper()}.R.{number}" if prefix else number

    def create_rule_document(content: List[str], rule: str, metadata: dict) -> Document:
        """Join the buffered element texts into one Document for *rule*."""
        if rule.startswith("S.R."):
            rule_type = "SR"
        elif rule.startswith("G.R."):
            rule_type = "GR"
        else:
            rule_type = "Standard"

        metadata = metadata.copy()  # never mutate the loader's metadata dict
        metadata.update({
            "rule": rule,
            "rule_type": rule_type,
            "source": metadata.get("source", default_source),
            "document_type": "legal_rule",
        })
        return Document(page_content="\n".join(content), metadata=metadata)

    def group_by_rule(docs: List[Document]) -> List[Document]:
        """Collect consecutive elements under the rule heading that precedes them."""
        grouped = []
        current_rule = None
        current_metadata = {}
        buffer = []

        for doc in docs:
            text = doc.page_content.strip()
            if not text:
                continue

            if not current_metadata:
                current_metadata = doc.metadata.copy()

            rule = parse_rule_heading(text)
            if rule is None:
                # Continuation of the current rule (or preamble text, which is
                # dropped when the first heading replaces the buffer).
                buffer.append(text)
                continue

            # A new heading closes the rule collected so far.
            if current_rule and buffer:
                grouped.append(create_rule_document(buffer, current_rule, current_metadata))
            current_rule = rule
            buffer = [text]
            current_metadata = doc.metadata.copy()

        if current_rule and buffer:
            grouped.append(create_rule_document(buffer, current_rule, current_metadata))

        return grouped

    def same_rule_type(left: Document, right: Document) -> bool:
        """True when both rule documents carry the same ``rule_type``."""
        return left.metadata.get("rule_type") == right.metadata.get("rule_type")

    def merge_small_rules(docs: List[Document], min_length: int = 100) -> List[Document]:
        """Fold rules shorter than *min_length* into a same-type neighbour.

        The documents in *docs* are modified in place; they are the freshly
        built rule documents, not the caller's input elements.
        """
        merged = []
        previous_doc = None

        for i, doc in enumerate(docs):
            is_short = len(doc.page_content) < min_length

            # The first rule has no predecessor, so a short one is merged
            # forward into the rule that follows it.
            if i == 0 and is_short:
                next_doc = docs[1] if len(docs) > 1 else None
                if next_doc is not None and same_rule_type(next_doc, doc):
                    next_doc.page_content = doc.page_content + "\n\n" + next_doc.page_content
                    next_doc.metadata["combined_rules"] = (
                        doc.metadata["rule"] + "; "
                        + next_doc.metadata.get("combined_rules", next_doc.metadata["rule"])
                    )
                    continue
                merged.append(doc)
                continue

            # Any other short rule is appended to the rule before it, but only
            # when both are of the same type.
            if previous_doc is not None and is_short and same_rule_type(previous_doc, doc):
                previous_doc.page_content += "\n\n" + doc.page_content
                previous_doc.metadata["combined_rules"] = (
                    previous_doc.metadata.get("combined_rules", previous_doc.metadata["rule"])
                    + f"; {doc.metadata['rule']}"
                )
                continue

            if previous_doc is not None:
                merged.append(previous_doc)
            previous_doc = doc

        if previous_doc is not None:
            merged.append(previous_doc)

        return merged

    # Steps 1-2: group the elements by rule, then merge the short rules.
    grouped_docs = group_by_rule(documents)
    merged_docs = merge_small_rules(grouped_docs, min_length=min_rule_length)

    # Step 3: split long rules, preserving metadata and extracting references.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        keep_separator=True,
    )

    final_documents = []
    for doc in merged_docs:
        rule = doc.metadata["rule"]
        try:
            splits = splitter.split_documents([doc])
        except Exception as e:
            # Best effort: a rule that cannot be split is kept as one chunk
            # rather than being lost from the index.
            print(f"Error splitting rule {rule}: {str(e)}")
            doc.metadata.update({
                "chunk_id": f"{rule}_chunk_1",
                "total_chunks": 1,
                "chunk_number": 1,
                "references": extract_references(doc.page_content),
            })
            final_documents.append(doc)
            continue

        for chunk_idx, split in enumerate(splits, 1):
            # Rule-level metadata first, then whatever the splitter attached.
            chunk_metadata = {**doc.metadata, **split.metadata}
            references = extract_references(split.page_content)
            if references:
                chunk_metadata["references"] = references
            chunk_metadata.update({
                "chunk_id": f"{rule}_chunk_{chunk_idx}",
                "total_chunks": len(splits),
                "chunk_number": chunk_idx,
            })
            split.metadata = chunk_metadata

        final_documents.extend(splits)

    return final_documents

def extract_references(text: str) -> List[str]:
    """Return every "S.R.x of G.R.y" cross-reference found in *text*.

    Each hit is rendered as ``"<S.R. rule> of <G.R. rule>"`` with a single
    space on either side of "of", in order of appearance. An empty list is
    returned when nothing matches.
    """
    cross_reference = re.compile(
        r"\b(S\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\s+of\s+(G\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\b"
    )
    return [
        f"{hit.group(1)} of {hit.group(2)}"
        for hit in cross_reference.finditer(text)
    ]

# Run the loaded PDF elements through process_legal_documents; the returned
# chunk list is bound to `documents` for the cells that follow.
documents = process_legal_documents(docs)


In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# Embedding model used to build the index. Any later cell that reloads the
# saved index has to embed queries with this same model.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")


In [9]:
# Build the FAISS vector store from the processed chunks and the embedding model.
db = FAISS.from_documents(documents, embeddings)

In [10]:
# Persist the index under "faiss_index_krcl" so it can be reloaded without
# re-processing the PDF.
db.save_local("faiss_index_krcl")

### Loading the database later

In [8]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings  # or whichever embeddings you used

# The embedding model MUST be the one the index was built with: the indexing
# cell uses "BAAI/bge-small-en-v1.5". A different model places queries in a
# different vector space than the stored chunks, so similarity search returns
# wrong rules. NOTE(review): the load itself may still succeed with a
# mismatched model, which makes the error easy to miss.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# SECURITY: the saved index includes a pickle file, and unpickling can execute
# arbitrary code. Keep this override only for an index directory that was
# written by this notebook and has not been modified by anyone else.
db = FAISS.load_local(
    "faiss_index_krcl",
    embeddings,
    allow_dangerous_deserialization=True,
)

In [11]:
# Retrieval smoke test: run one query against the index and print the text of
# the first hit returned.
query = "what is LIMIT OF SPEED WITH ENGINE TENDER FOREMOST"
retireved_results = db.similarity_search(query)
best_match = retireved_results[0]
print(best_match.page_content)

4.13
LIMIT OF SPEED WITH ENGINE TENDER FOREMOST:-
(1)
(a) A passenger train or a mixed train shall not be drawn outside station limit by a steam engine running tender foremost, except-
(I)
under a written order issued by the authorized officer:- or
(II)
in a case of unavoidable necessity, to be established by the Loco Pilot.
(b) When any such train is so drawn, the speed shall not exceed 25 kilometers and hour, or such higher speed, not exceeding 40 kilometers an hour, as may be authorised by approved special instructions.
(2)
in case of unavoidable necessity, goods trains may run with steam engins tender foremost at a speed not exceeding 25 kilometers an hour or such higher speed, which shall, In no Circumstances, exceed 40 kilometres an hour, as may be laid down by special instructions.


## Models Used

In [14]:
from langchain_community.llms import Ollama

# LLM used for answer generation: the "mistral" model via the Ollama wrapper.
# NOTE(review): presumably requires a local Ollama server with the model pulled — confirm.
llm = Ollama(model="mistral")
llm  # bare expression so the notebook displays the configured object

  llm = Ollama(model="mistral")


Ollama(model='mistral')

In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import ChatPromptTemplate

# Answering prompt with two placeholders: {context} and {input}.
# The template instructs the model to answer only from the supplied context,
# to cite rule numbers, and to say so explicitly when the answer is absent.
prompt = ChatPromptTemplate.from_template("""
You are an expert assistant answering questions about railway operations and safety rules from the *KRCL General and Subsidiary Rules 2020* manual.

Your task is to provide accurate, section-based answers **only** from the provided context. Do not guess. Do not invent rules. If the answer cannot be found in the context, clearly state: "The information is not available in the provided rules."

Warning: Your response may be used by railway staff in real operational scenarios. Wrong or misleading answers may lead to **accidents or disciplinary action**. Respond carefully and precisely.

Instructions:
- Identify the correct rule number and section from the context.
- Provide the **exact wording** or an accurate summary.
- Format the rule reference like this: *Rule 2.03 — Knowledge of Rules*.
- If multiple rules apply, cite each.
- If the answer is not in the context, **say so clearly**.

I'll tip you $1000 dollars everytime you give correct answer.

<context>
{context}
</context>

Question: {input}

Answer (with rule reference):
""")


In [15]:
from langchain.chains.combine_documents import create_stuff_documents_chain

# Combine the LLM and the prompt into a chain that answers from supplied
# documents (they fill the prompt's {context} placeholder).
document_chain=create_stuff_documents_chain(llm,prompt)

In [16]:
"""
Retrievers: A retriever is an interface that returns documents given
 an unstructured query. It is more general than a vector store.
 A retriever does not need to be able to store documents, only to 
 return (or retrieve) them. Vector stores can be used as the backbone
 of a retriever, but there are other types of retrievers as well. 
 https://python.langchain.com/docs/modules/data_connection/retrievers/   
"""

# Expose the FAISS store as a retriever; no arguments are passed, so the
# library's default search settings apply.
retriever=db.as_retriever()
retriever  # bare expression so the notebook displays the retriever


VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000020C3FBF19D0>, search_kwargs={})

In [17]:
"""
Retrieval chain: This chain takes in a user inquiry, which is then
passed to the retriever to fetch relevant documents. Those documents 
(and original inputs) are then passed to an LLM to generate a response
https://python.langchain.com/docs/modules/chains/
"""
from langchain.chains import create_retrieval_chain
# Wire the retriever to the document chain so one call both retrieves and answers.
retrieval_chain=create_retrieval_chain(retriever,document_chain)

In [None]:
# Ask a question; the query text goes under the "input" key, matching the
# prompt's {input} placeholder.
response = retrieval_chain.invoke({"input": "HEAD LIGHT, MARKER LIGHTS AND SPEEDOMETER"})


In [20]:
# Show the generated answer stored under the "answer" key of the response.
response['answer']

" *Rule 4.14(1) — A train shall not be worked at night or in thick, foggy or tempestous weather impairing visibility or in long tunnels, unless the engine carries an electric head light of an approved design and, in addition, two oil or electric white marker lights.*\n*Rule 4.14(3) — The electric head light on the engine shall be fitted with a switch to dim the light and shall be dimmed when the train remains stationary at a station.*\n*Rule A.6 from SR 4.65(3) — A Track Maintenance Machine must be equipped with prescribed head light and tail light, marker light and flasher light as per GR GR4.14 to 4.16 and SR's thereto.*"