In [1]:
from langchain_community.document_loaders import PyPDFLoader
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import List
import os
from dotenv import load_dotenv
import textwrap
import npttf2utf
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with an actionable message when the key is missing. os.environ only
# accepts str values, so copying a missing key back from os.getenv() would raise
# an opaque TypeError instead of explaining what is wrong.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it or add it to a .env file before running this notebook."
    )

# Chat model shared by every extraction chain in this notebook (temperature=0).
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

In [None]:
from pydantic import BaseModel, Field
from typing import List, Literal

class Chunk(BaseModel):
    # Schema for one extracted chunk. The Field descriptions below are what the
    # output parser turns into format instructions for the LLM, so they are
    # effectively prompt text: edit them with care.
    # NOTE(review): documented with comments rather than a class docstring because
    # Pydantic presumably emits a model docstring as the schema "description",
    # which would change the generated prompt; confirm before converting.

    # Running number of the chunk within one extraction.
    chunk_id: int = Field(description="Sequential identifier (1, 2, 3...)")
    # Only the literal value "citizenship" is accepted.
    document_type: Literal["citizenship"] = Field(description="Type of document, e.g., 'citizenship'")
    # Dafa (section) number, kept as a string.
    section: str = Field(description="दफा नम्बर (e.g., '3', '4')")
    # Upadafa (subsection) numbers; defaults to an empty list.
    subsection: List[str] = Field(
        default_factory=list,
        description="List of उपदफा नम्बर; empty list if not present"
    )
    # Each tag must be one of the six literals below.
    # NOTE(review): the description asks for at most 2 tags, but no length
    # constraint is declared on the field.
    tag: List[Literal[
        "eligibility & requirements",
        "procedure",
        "recommendation",
        "legal",
        "special case",
        "correction & modification"
    ]] = Field(description="List of 2 tags at maximum best describing the content")
    # Cross-referenced Dafa/Upadafa numbers; defaults to an empty list.
    references: List[str] = Field(
        default_factory=list,
        description="List of mentioned Dafa or Upadafa numbers; empty if none"
    )
    # Provenance fields: both are required free-form strings.
    source_type:str = Field(description= "Type of source for this chunk")
    source_link:str = Field(description = "Source link")
    # Chunk body text.
    content: str = Field(description="Full Nepali Unicode text of the chunk")

class ChunkList(BaseModel):
    # Top-level wrapper: the whole extraction result is one object whose
    # "chunks" key holds the list of Chunk records.
    chunks: List[Chunk] = Field(description="List of all structured chunks extracted from the document")


In [4]:
# Parser setup: a JSON output parser built from the ChunkList schema. Its
# get_format_instructions() text is injected into the prompt templates as
# {format_instructions}.
# NOTE(review): main() treats the parsed result as a plain dict (response.get),
# so schema constraints are presumably not validated here; confirm.
parser = JsonOutputParser(pydantic_object=ChunkList)

In [5]:
def load_pdf(path, map_path="map.json", from_font="Preeti"):
    """Load a PDF and return its text converted from a legacy Nepali font to Unicode.

    Args:
        path: Location of the PDF file (str or path-like; converted with str()).
        map_path: Font-mapping JSON file handed to npttf2utf.FontMapper.
            Defaults to "map.json", resolved against the current working directory.
        from_font: Name of the legacy font the extracted text is encoded in.
            Defaults to "Preeti".

    Returns:
        The text of all pages, joined with a blank line between pages and
        mapped to Unicode.
    """
    loader = PyPDFLoader(str(path))
    pages = loader.load()

    # One blank line between pages keeps page boundaries visible in the text.
    raw_text = "\n\n".join(page.page_content for page in pages)

    # Convert legacy-font text (Preeti by default) to Unicode; HTML (un)escaping stays off.
    mapper = npttf2utf.FontMapper(map_path)
    return mapper.map_to_unicode(
        raw_text,
        from_font=from_font,
        unescape_html_input=False,
        escape_html_output=False,
    )

In [12]:
# Extraction prompt for the Nepal Citizenship Act (150-word grouping threshold).
# {context} receives the document text; {format_instructions} is pre-filled from
# the output parser so the model sees the target JSON schema.
prompt = PromptTemplate(
    template=textwrap.dedent("""
    You are an expert legal document analyzer specialized in Nepali legal and administrative documents.
    Your task is to extract, chunk, and structure information from the Nepal Citizenship Act document
    into a JSON-ready schema suitable for vector database insertion.

    1. CHUNK CREATION:
    - If a dafa (section) is shorter than 150–200 words → create a single chunk for the entire dafa.
    - Group multiple उपदफा together into a single chunk if their combined length does not exceed 150 words.
    - NEVER break within an upadafa — always complete the current upadafa before creating a new chunk.

    2. SCHEMA FIELDS:
    Each extracted chunk must include the following fields strictly following the instruction:
    - chunk_id: Sequential integer identifier (1, 2, 3…)
    - document_type: "citizenship" (for this document)
    - section: Dafa number (e.g., "3", "4", "5")
    - subsection: List of Upadafa numbers of the content (e.g., "1", "2"); use [] if none
    - tag: Assign maximum of two tags from the list below based on content meaning:
        * "eligibility & requirements" (योग्यता र आवश्यक कागजातहरू): Age, relationship, residence, or document requirements
        * "procedure" (प्रक्रिया): Steps for obtaining citizenship, application process, or office procedures
        * "recommendation" (सिफारिस/मुचुल्का): Recommendations, certificates, or identification from ward or authority
        * "legal" (कानुनी व्यवस्था र दण्ड): Legal provisions, rules, penalties, or implementation clauses
        * "special case" (विशेष अवस्था): Exceptional or unusual citizenship circumstances
        * "correction & modification" (संशोधन र सुधार): Name, surname, or birth-date corrections and related modifications
    - references: List any mentioned or referenced Dafa or Upadafa numbers, e.g. ["2(3)", "5(1)"]; use [] if none.
    - source_type:"Citizenship_Act"
    - source_link:"https://moha.gov.np/en/post/citizenship-act-2063"
    - content: The full Nepali text of the chunk (do not summarize, translate, or remove anything).

    3. DOCUMENT STRUCTURE UNDERSTANDING:
    - The document follows Dafa (section) and Upadafa (subsection) numbering (e.g., “दफा 5(1)”).
    - Identify all dafa and upadafa correctly.
    - Track cross-references between different Dafas (e.g., “दफा 2(3) अनुसार”) or implicit reference and include them in the `references` field.
    - The document text may contain improperly encoded Unicode characters; fix them when generating the response.

    DOCUMENT TEXT:
    {context}

    {format_instructions}

    Return ONLY valid JSON following the exact schema above.
    """),
    input_variables=["context"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

# Extraction prompt for the Nepal Citizenship Act with a tighter 100-word grouping
# threshold, explicit handling of "क/ख/ग" list markers, and tracking of upadafa
# references within the same dafa.
# {context} receives the document text; {format_instructions} is pre-filled from
# the output parser so the model sees the target JSON schema.
prompt_multi = PromptTemplate(
    template=textwrap.dedent("""
    You are an expert legal document analyzer specialized in Nepali legal and administrative documents.
    Your task is to extract, chunk, and structure information from the Nepal Citizenship Act document
    into a JSON-ready schema suitable for vector database insertion.

    1. CHUNK CREATION:
    - If a dafa (section) is shorter than 100 words → create a single chunk for the entire dafa.
    - Group multiple उपदफा together into a single chunk if their combined length does not exceed 100 words.
    - NEVER break within an upadafa — always complete the current upadafa before creating a new chunk.

    2. SCHEMA FIELDS:
    Each extracted chunk must include the following fields strictly following the instruction:
    - chunk_id: Sequential integer identifier (1, 2, 3…)
    - document_type: "citizenship" (for this document)
    - section: Dafa number (e.g., "3", "4", "5")
    - subsection: List of Upadafa numbers of the content (e.g., "1", "2"); use [] if none
    - tag: Assign maximum of two tags from the list below based on content meaning:
        * "eligibility & requirements" (योग्यता र आवश्यक कागजातहरू): Age, relationship, residence, or document requirements
        * "procedure" (प्रक्रिया): Steps for obtaining citizenship, application process, or office procedures
        * "recommendation" (सिफारिस/मुचुल्का): Recommendations, certificates, or identification from ward or authority
        * "legal" (कानुनी व्यवस्था र दण्ड): Legal provisions, rules, penalties, or implementation clauses
        * "special case" (विशेष अवस्था): Exceptional or unusual citizenship circumstances
        * "correction & modification" (संशोधन र सुधार): Name, surname, or birth-date corrections and related modifications
    - references: List any mentioned or referenced Dafa or Upadafa numbers, e.g. ["2(3)", "5(1)"]; use [] if none.
    - source_type:"Citizenship_Act"
    - source_link:"https://moha.gov.np/en/post/citizenship-act-2063"
    - content: The full Nepali text of the chunk (do not summarize, translate, or remove anything).

    3. DOCUMENT STRUCTURE UNDERSTANDING:
    - The document follows Dafa (section) and Upadafa (subsection) numbering (e.g., “दफा 5(1)”).
    - "क","ख","ग" these are not subsections. Subsections and sections are numerals.
    - Identify all dafa and upadafa correctly.
    - Track cross-references between different Dafas (e.g., “दफा 2(3) अनुसार”) or implicit reference and include them in the `references` field.
    - Also track reference of upadafas within same dafa (e.g. उपदफा (१) बमोजिम )
    - The document text may contain improperly encoded Unicode characters; fix them when generating the response.

    DOCUMENT TEXT:
    {context}

    {format_instructions}

    Return ONLY valid JSON following the exact schema above.
    """),
    input_variables=["context"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [None]:
# Extraction prompt for the "documents required for citizenship" PDF: one chunk
# per citizenship type/situation, with a contextual Nepali summary in `content`.
# NOTE(review): source_type was previously spelled "Citizenship_Prcoess"; any
# stored chunks or filters that rely on the old spelling must be regenerated/updated.
prompt2 = PromptTemplate(
    template=textwrap.dedent("""
    You are an expert legal document analyzer specialized in Nepali legal and administrative documents.
    Your task is to extract, chunk, and structure information from the Nepal Citizenship Requirements PDF
    into a JSON-ready schema suitable for vector database insertion.

    1. a)Each chunk must correspond to **one specific citizenship type or situation**, such as:
       - वंशजको नागरिकता (अविवाहित)
       - अंगीकृत नागरिकता (वैवाहिक)
       - प्रतिलिपि नागरिकता
       - कर्मचारी परिवारको आधारमा
       - पतिको नाम समावेश/हटाउने
       - सम्बन्ध विच्छेद/दोस्रो विवाह

      b) tag: Assign maximum of two tags from the list below based on content meaning:
        * "eligibility & requirements" (योग्यता र आवश्यक कागजातहरू): Age, relationship, residence, or document requirements
        * "procedure" (प्रक्रिया): Steps for obtaining citizenship, application process, or office procedures
        * "recommendation" (सिफारिस/मुचुल्का): Recommendations, certificates, or identification from ward or authority
        * "legal" (कानुनी व्यवस्था र दण्ड): Legal provisions, rules, penalties, or implementation clauses
        * "special case" (विशेष अवस्था): Exceptional or unusual citizenship circumstances
        * "correction & modification" (संशोधन र सुधार): Name, surname, or birth-date corrections and related modifications
      c) references: List any mentioned or referenced सि.नं (write in english numeral)
      d) section: section means the सि.नं (write in english numeral)
      e) For each chunk include source_type:"Citizenship_Process" and source_link:"https://daodarchula.moha.gov.np/post/documents-required-for-citizenship-new" in schema

    2. For each such chunk:
       - Use `"content"` to include:
         - A contextual summary starting with a sentence like:
           **"यदि कुनै व्यक्ति अविवाहित छन् र वंशजको आधारमा नेपाली नागरिकता प्राप्त गर्न चाहन्छन् भने, निम्न प्रक्रिया र कागजातहरू आवश्यक पर्छन्।"**
         - Followed by the required documents and procedures in bullet or numbered format.

    3. If the document contains **portions that do NOT specify any citizenship type or situation**, then:
       - Split semantically into chunks of approximately **150 words**
       - Each chunk must begin with a **brief contextual summary in Nepali** describing what the section is about (e.g., टिकटको व्यवस्था, वसाई सराईको ध्यान दिनुपर्ने कुरा, कर्मचारी परिवारको सिफारिश प्रक्रिया)
       - Do NOT split inside a bullet point or sentence; preserve semantic boundaries

    4. The `"content"` must be written in clear, formal Nepali and be semantically complete — it should answer standalone queries like:
       - "के कागजात चाहिन्छ?"
       - "कसको सिफारिश चाहिन्छ?"
       - "कुन अवस्थामा यो प्रक्रिया लागू हुन्छ?"

    5. Do NOT merge multiple citizenship types into one chunk. Keep each chunk focused and self-contained.

    DOCUMENT TEXT:
    {context}

    {format_instructions}

    Return ONLY valid JSON following the exact schema above.
    """),
    input_variables=["context"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)



In [49]:
# Extraction prompt for the citizenship FAQs PDF: one chunk per question/answer
# pair, with the content reproduced verbatim apart from Unicode repairs.
# {context} receives the document text; {format_instructions} is pre-filled from
# the output parser so the model sees the target JSON schema.
prompt3 = PromptTemplate(
    template=textwrap.dedent("""
    You are an expert legal document analyzer specialized in Nepali legal and administrative documents.
    Your task is to extract, chunk, and structure information from the Nepal Citizenship FAQs PDF
    into a JSON-ready schema suitable for vector database insertion.

      a) tag: Assign maximum of two tags from the list below based on content meaning:
        * "eligibility & requirements" (योग्यता र आवश्यक कागजातहरू): Age, relationship, residence, or document requirements
        * "procedure" (प्रक्रिया): Steps for obtaining citizenship, application process, or office procedures
        * "recommendation" (सिफारिस/मुचुल्का): Recommendations, certificates, or identification from ward or authority
        * "legal" (कानुनी व्यवस्था र दण्ड): Legal provisions, rules, penalties, or implementation clauses
        * "special case" (विशेष अवस्था): Exceptional or unusual citizenship circumstances
        * "correction & modification" (संशोधन र सुधार): Name, surname, or birth-date corrections and related modifications
      b) references: No references for this document
      c) section: No section names
      d) Split semantically into chunks with each question and answer (one chunk equals one question and answer)

      e) For each chunk include source_type:"Citizenship_Faqs" and source_link:"https://www.moha.gov.np/en/page/citizenship-10" in schema

      f) The `"content"` must be written in clear, formal Nepali and be semantically complete — it should answer standalone queries. Write the exact content given without any summarization or modification (only fix wrongly encoded Unicode characters)

    DOCUMENT TEXT:
    {context}

    {format_instructions}

    Return ONLY valid JSON following the exact schema above.
    """),
    input_variables=["context"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)



In [13]:
def main(
    pdf_path="../../data/raw/citizenship/citizenship_act_nepali.pdf",
    prompt_template=None,
    output_path="data/citizenship_act_chunks_multi.json",
):
    """Extract structured chunks from one PDF and save them as a JSON file.

    Called with no arguments this processes the Citizenship Act PDF with
    ``prompt_multi``. Other documents can be processed by passing their path
    together with the matching prompt and an output file, e.g.
    ``main("../../data/raw/citizenship/citizenship_faqs.pdf", prompt3, "data/faqs.json")``.

    Args:
        pdf_path: PDF to load through ``load_pdf``.
        prompt_template: Prompt piped into ``llm`` and ``parser``. ``None``
            selects ``prompt_multi`` (resolved at call time).
        output_path: Destination JSON file. Its parent directory is created
            when the path has one.

    Returns:
        None. Progress is printed. Any error raised while running the chain or
        writing the file is caught and printed rather than re-raised; errors
        while loading the PDF propagate.
    """
    if prompt_template is None:
        prompt_template = prompt_multi

    full_text = load_pdf(pdf_path)
    print(f"Loaded text length: {len(full_text)} characters")
    print(f"First 500 characters:\n{full_text[:500]}")
    print("\n" + "="*60 + "\n")

    try:
        chain = prompt_template | llm | parser
        response = chain.invoke({"context": full_text})

        chunk_count = len(response.get("chunks", []))
        print("✅ Successfully extracted structured chunks.")
        print(f"Total chunks found: {chunk_count}")

        # Save to file. A bare filename has an empty dirname, and
        # os.makedirs("") raises, so only create a directory when there is one.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps the Nepali text readable in the file.
            json.dump(response, f, ensure_ascii=False, indent=4)

        print(f"\n✅ Saved {chunk_count} chunks to {output_path}")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"❌ Error: {e}")

# Entry point: run the extraction when this cell/script is executed directly
# (the recorded output below comes from this call).
if __name__ == "__main__":
    main()

Loaded text length: 14984 characters
First 500 characters:
नेपाल नागरिकता ऐन, २०६३ 
 
 प ्रमाणीकरण र प्रकाशन मिति 
 २०६३।८।१० 
संशोधन गर्ने ऐन
 
केही नेपाल ऐनलाई संशोधन गर्ने ऐन, २०६४ २०६४।५।९ 
नेपाल नागरिकता (पहिलो संशोधन) अध्यादेश, २०६९ २०६९।१२।२९ 
 
२०६३ सालको ऐन नं. २५ 
नागरिकता सम्बन्धी नेपाल कानूनलाई संशोधन र एकीकरण गर्न बनेको ऐन 
प्रस्तावना : ए ेतिहासिक जनआन्दोलनको परिणाम स्वरुप न ेपालको सार्वभौमसत्ता नेपाली जनतामा 
निहित रहेको र राज्यशक्तिका े स्रोत नेपाली जनता न ै रहेको विद्यमान अवस्थामा विगतमा न ेपाली 
नागरिकहरुले नागरिकताको प्रमाणपत्र प्राप्त


✅ Successfully extracted structured chunks.
Total chunks found: 53

✅ Saved 53 chunks to data/citizenship_act_chunks_multi.json
