In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader

loader = UnstructuredPDFLoader("KRCL G & SR 2020.pdf", mode="elements")
docs = loader.load()

In [None]:
docs

In [None]:
doc

In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re
from typing import List, Optional, Tuple

def process_legal_documents(documents: List[Document]) -> List[Document]:
    """Process loaded PDF documents into structured legal chunks with reference extraction."""
    
    # Step 1: Group by rule number with enhanced pattern matching
    def group_by_rule(docs: List[Document]) -> List[Document]:
        # Pattern matches all rule types:
        # - S.R.2.24, G.R.3.15
        # - Standard rules: 2.03, 3.01
        # - Rule 4.05, Article 5.02, §6.01
        rule_pattern = re.compile(
            r"^(?:(?:S\.R\.|G\.R\.|Rule|Article|Section|§)?\s*)?"
            r"((?:[SG]\.R\.)?\d{1,2}\.\d{2}(?:\.\d+)?|\d{1,2}\-\d{2})", 
            re.IGNORECASE
        )
        grouped = []
        current_rule = None
        current_metadata = {}
        buffer = []

        for doc in docs:
            text = doc.page_content.strip()
            if not text:
                continue

            if not current_metadata:
                current_metadata = doc.metadata.copy()

            match = rule_pattern.match(text)
            if match:
                if current_rule and buffer:
                    grouped.append(create_rule_document(buffer, current_rule, current_metadata))
                current_rule = match.group(1)
                # Normalize rule format
                current_rule = current_rule.replace(' ', '')  # Remove spaces in S.R./G.R.
                buffer = [text]
                current_metadata = doc.metadata.copy()
            else:
                buffer.append(text)

        if current_rule and buffer:
            grouped.append(create_rule_document(buffer, current_rule, current_metadata))

        return grouped

    def create_rule_document(content: List[str], rule: str, metadata: dict) -> Document:
        """Create a rule document with proper metadata."""
        rule_type = "SR" if rule.startswith('S.R.') else \
                   "GR" if rule.startswith('G.R.') else "Standard"
        
        metadata = metadata.copy()
        metadata.update({
            "rule": rule,
            "rule_type": rule_type,
            "source": metadata.get("source", "KRCL G & SR 2020.pdf"),
            "document_type": "legal_rule"
        })
        return Document(
            page_content="\n".join(content),
            metadata=metadata
        )

    # Step 2: Group the documents by rule
    grouped_docs = group_by_rule(documents)

    # Step 3: Merge small rules intelligently
    def merge_small_rules(docs: List[Document], min_length: int = 100) -> List[Document]:
        merged = []
        previous_doc = None

        for i, doc in enumerate(docs):
            # Handle first document case
            if i == 0 and len(doc.page_content) < min_length:
                if len(docs) > 1 and docs[i+1].metadata.get('rule_type') == doc.metadata.get('rule_type'):
                    # Merge forward with next document
                    docs[i+1].page_content = doc.page_content + "\n\n" + docs[i+1].page_content
                    docs[i+1].metadata["combined_rules"] = (
                        doc.metadata["rule"] + "; " + 
                        docs[i+1].metadata.get("combined_rules", docs[i+1].metadata["rule"])
                    )
                    continue
                merged.append(doc)
                continue

            if previous_doc and len(doc.page_content) < min_length:
                # Only merge if same rule type
                if previous_doc.metadata.get('rule_type') == doc.metadata.get('rule_type'):
                    previous_doc.page_content += "\n\n" + doc.page_content
                    previous_doc.metadata["combined_rules"] = (
                        previous_doc.metadata.get("combined_rules", previous_doc.metadata["rule"]) + 
                        f"; {doc.metadata['rule']}"
                    )
                    continue
                else:
                    merged.append(previous_doc)
                    previous_doc = doc
                    continue
            else:
                if previous_doc:
                    merged.append(previous_doc)
                previous_doc = doc

        if previous_doc:
            merged.append(previous_doc)

        return merged

    merged_docs = merge_small_rules(grouped_docs)

    # Step 4: Split long rules with metadata preservation and reference extraction
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        length_function=len,
        keep_separator=True
    )

    final_documents = []
    for doc in merged_docs:
        try:
            splits = splitter.split_documents([doc])
            
            # Extract references from each split
            reference_pattern = re.compile(
                r"\b(S\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\s+of\s+(G\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\b"
            )
            
            for chunk_idx, split in enumerate(splits, 1):
                # Extract references
                references = reference_pattern.findall(split.page_content)
                if references:
                    split.metadata["references"] = [f"{sr} of {gr}" for sr, gr in references]
                
                # Merge metadata with priority to split-specific values
                split.metadata = {
                    **doc.metadata,
                    **split.metadata,  # Preserves extracted references
                    "chunk_id": f"{doc.metadata['rule']}_chunk_{chunk_idx}",
                    "total_chunks": len(splits),
                    "chunk_number": chunk_idx
                }
            
            final_documents.extend(splits)
        except Exception as e:
            print(f"Error splitting rule {doc.metadata['rule']}: {str(e)}")
            doc.metadata.update({
                "chunk_id": f"{doc.metadata['rule']}_chunk_1",
                "total_chunks": 1,
                "chunk_number": 1,
                "references": extract_references(doc.page_content)  # Extract even if not split
            })
            final_documents.append(doc)

    return final_documents

def extract_references(text: str) -> List[str]:
    """Extract cross-references from text."""
    reference_pattern = re.compile(
        r"\b(S\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\s+of\s+(G\.R\.\d{1,2}\.\d{2}(?:\.\d+)?)\b"
    )
    references = reference_pattern.findall(text)
    return [f"{sr} of {gr}" for sr, gr in references]

# Process documents
documents = process_legal_documents(docs)

# Display results
print(f"\n✅ Successfully processed {len(processed_docs)} legal document chunks\n")
print("Sample chunks with references:")
for i, doc in enumerate(processed_docs[:5]):
    print(f"\n--- Chunk {i+1} ---")
    print(f"Rule: {doc.metadata.get('rule')} ({doc.metadata.get('rule_type')})")
    print(f"Chunk: {doc.metadata.get('chunk_number')}/{doc.metadata.get('total_chunks')}")
    if 'combined_rules' in doc.metadata:
        print(f"Combined with: {doc.metadata['combined_rules']}")
    if 'references' in doc.metadata:
        print(f"References: {', '.join(doc.metadata['references'])}")
    content_preview = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
    print(f"\n{content_preview}\n")
    print("-" * 50)

In [None]:
documents

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")


In [None]:
db = FAISS.from_documents(documents, embeddings)

In [None]:
query = "Rules of Reporting S & T Gear failures"
retireved_results=db.similarity_search(query)
print(retireved_results[0].page_content)

## Accidental


In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader

loader = UnstructuredPDFLoader("Accident Manaul - 2021_1.pdf", mode="elements")
docs = loader.load()

In [2]:
import re
from typing import List, Optional
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load PDF
loader = UnstructuredPDFLoader("Accident Manaul - 2021_1.pdf", mode="elements")
documents = loader.load()

# Regex patterns
chapter_pattern = re.compile(r"^CHAPTER\s*[-–]?\s*\d+\s*(.*)", re.IGNORECASE)
rule_pattern = re.compile(r"^(\d{3})\.\s+(.+)$")

def create_rule_document(content: List[str], rule: str, title: str, chapter: Optional[str], metadata: dict) -> Document:
    """Creates a Document from rule section."""
    metadata = metadata.copy()
    metadata.update({
        "rule": rule,
        "rule_title": title,
        "chapter": chapter or "Unknown",
        "document_type": "accident_manual_rule",
        "source": metadata.get("source", "Accident Manual 2021")
    })
    return Document(page_content="\n".join(content), metadata=metadata)

def split_and_merge_chunks(documents: List[Document]) -> List[Document]:
    """Handles merging small chunks and splitting large ones."""
    final_chunks = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
    pending_small = None

    for doc in documents:
        content = doc.page_content.strip()

        if len(content) < 100:
            if pending_small:
                # Merge small chunks
                pending_small.page_content += "\n\n" + content
                pending_small.metadata["rule"] += f"; {doc.metadata['rule']}"
                pending_small.metadata["rule_title"] += f"; {doc.metadata['rule_title']}"
            else:
                pending_small = doc
            continue

        elif len(content) > 1000:
            # Split large chunk
            splits = splitter.split_documents([doc])
            for i, split in enumerate(splits, 1):
                split.metadata.update({
                    "chunk_id": f"{doc.metadata['rule']}_chunk_{i}",
                    "chunk_number": i,
                    "total_chunks": len(splits)
                })
            final_chunks.extend(splits)
        else:
            if pending_small:
                final_chunks.append(pending_small)
                pending_small = None
            doc.metadata.update({
                "chunk_id": f"{doc.metadata['rule']}_chunk_1",
                "chunk_number": 1,
                "total_chunks": 1
            })
            final_chunks.append(doc)

    if pending_small:
        final_chunks.append(pending_small)

    return final_chunks

def process_documents(documents: List[Document]) -> List[Document]:
    """Extract rules and chapters, then chunk intelligently."""
    current_chapter = None
    current_metadata = {}
    current_rule = None
    current_title = None
    buffer = []
    grouped_docs = []

    for doc in documents:
        lines = doc.page_content.splitlines()
        for line in lines:
            line = line.strip()
            if not line:
                continue

            if chapter_match := chapter_pattern.match(line):
                current_chapter = chapter_match.group(1).strip().title()
                continue

            if rule_match := rule_pattern.match(line):
                if buffer:
                    grouped_docs.append(create_rule_document(
                        buffer, current_rule, current_title, current_chapter, current_metadata))
                current_rule = rule_match.group(1)
                current_title = rule_match.group(2).strip()
                current_metadata = doc.metadata.copy()
                buffer = [line]
            else:
                buffer.append(line)

    if buffer:
        grouped_docs.append(create_rule_document(
            buffer, current_rule, current_title, current_chapter, current_metadata))

    return split_and_merge_chunks(grouped_docs)

def preview_chunks(processed_docs: List[Document], n=5):
    for i, doc in enumerate(processed_docs[:n]):
        print(f"\n--- Chunk {i+1} ---")
        print(f"Rule: {doc.metadata.get('rule')} - {doc.metadata.get('rule_title')}")
        print(f"Chapter: {doc.metadata.get('chapter')}")
        print(f"Chunk: {doc.metadata.get('chunk_number')}/{doc.metadata.get('total_chunks')}")
        print(doc.page_content[:300] + ("..." if len(doc.page_content) > 300 else ""))
        print("-" * 50)

# === Main Processing ===
processed_docs = process_documents(documents)
preview_chunks(processed_docs)



--- Chunk 1 ---
Rule: None - None
Chapter: Classification Of Accidents
Chunk: 1/33
KONKAN RAILWAY CORPORATION LIMITED
ACCIDENT MANUAL
APRIL 2021
SAFETY ORGANISATION
FOREWORD
The Accident Manual is a compendium of all instructions, rules and regulations and guidelines issued from time to time on the subject of Railway Accidents.
This New Accident Manual is brought-out after reviewi...
--------------------------------------------------

--- Chunk 2 ---
Rule: None - None
Chapter: Classification Of Accidents
Chunk: 2/33
Accident Manual should be gone through by all the Railway officials, staff who are required to deal with train operations directly or indirectly and those who have to maintain the Railway Assets. All Railway officials should be fully aware and conversant with the provisions of the Accident Manual, G...
--------------------------------------------------

--- Chunk 3 ---
Rule: None - None
Chapter: Classification Of Accidents
Chunk: 3/33
To Railwaymen
This Manual brings toget

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")


In [4]:
db = FAISS.from_documents(documents, embeddings)
db.save_local("faiss_manual_krcl")