In [None]:
# Log a marker so the output shows where the notebook run starts.
print("Process begin")

In [None]:
%%capture
!pip install langchain_huggingface langchain_core langchain_community langchain_text_splitters langchain_experimental langchain_core

In [None]:
import os
import json
import torch
from typing import List
from pydantic import BaseModel

from langchain_huggingface import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline


In [None]:
# !huggingface-cli login


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(device)

In [None]:

# Checkpoint to load. Other candidates, currently disabled:
#   model_name = "Qwen/Qwen2.5-3B-Instruct"
#   model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Tokenizer and weights come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# "auto" is passed for both the device placement and the weight dtype so the
# library picks them instead of this notebook hard-coding them.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)


In [None]:
# Text-generation pipeline used for every LLM call in this notebook.
#
# Decoding is greedy (do_sample=False), so the same prompt always yields the
# same completion. `temperature` is intentionally not passed: it only applies
# to sampling, and transformers warns when it is set alongside do_sample=False.
#
# A pad token id is passed explicitly (falling back to EOS when the tokenizer
# defines none) so generate() does not print a fallback warning on every call.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # upper bound on generated tokens per call
    do_sample=False,
    return_full_text=False,  # return only the completion, not the prompt
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=(
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    ),
)

In [None]:
# Wrap the transformers pipeline so it can be called through `llm.invoke(...)`.
llm = HuggingFacePipeline(pipeline=pipe)


In [None]:
# Path of the source PDF that the Q&A pairs are generated from.
pdf_path = "/kaggle/input/legal-pdf/constitution_nowater.pdf"

In [None]:

# Load the PDF into `documents`.
#
# Every later cell needs `documents`, so a missing file or a loader error must
# stop the run here. Only printing the problem would leave `documents`
# undefined (or stale from an earlier run) and surface as an unrelated error
# further down.
if not os.path.exists(pdf_path):
    raise FileNotFoundError(
        f"The file '{pdf_path}' was not found. "
        "Please upload your PDF file or provide a correct path."
    )

loader = PyPDFLoader(pdf_path)
documents = loader.load()  # loader errors propagate with their full traceback

if not documents:
    raise ValueError(f"No pages could be read from {pdf_path}")

print(f"Successfully loaded {len(documents)} pages from {pdf_path}")


In [None]:
from langchain_experimental.text_splitter import SemanticChunker
# Import the embeddings class from langchain_huggingface (already installed and
# used by this notebook) rather than the deprecated `langchain.embeddings`
# path, which also depends on the separate `langchain` package.
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-embedding model used to compare neighbouring sentences.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Split on semantic breakpoints, using the 95th percentile as the threshold.
text_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95
)

semantic_docs = text_splitter.split_documents(documents)


In [None]:
# Number of chunks produced by the semantic splitter.
len(semantic_docs)

In [None]:
from langchain_core.documents import Document

def merge_small_chunks(docs, min_chars=1200):
    """
    Greedily concatenate consecutive chunks into larger documents.

    Chunks are appended to a running buffer (joined by a blank line) while the
    buffer length plus the next chunk's length stays below ``min_chars``. When
    the next chunk would reach the threshold, the buffer is emitted and a new
    buffer starts with that chunk. A chunk that is itself at or above
    ``min_chars`` is therefore emitted on its own, unsplit.

    Args:
        docs: iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict).
        min_chars: length threshold, in characters, below which chunks keep
            being merged together.

    Returns:
        A list of ``Document`` objects. Each carries a copy of the metadata of
        the first chunk that contributed to it. No empty documents are
        produced, and whitespace-only chunks are dropped.
    """
    merged_docs = []
    buffer = ""
    buffer_metadata = None  # metadata of the first chunk currently in `buffer`

    for doc in docs:
        text = doc.page_content.strip()
        if not text:
            # Nothing to merge; skipping also avoids stacking blank separators.
            continue

        if not buffer:
            # Start a buffer without flushing, so no empty document is emitted
            # when the very first chunk is already over the threshold.
            buffer, buffer_metadata = text, doc.metadata
        elif len(buffer) + len(text) < min_chars:
            buffer += "\n\n" + text
        else:
            # Flush with the buffer's own metadata, not that of `doc`, which
            # belongs to the next merged document.
            merged_docs.append(
                Document(page_content=buffer, metadata=dict(buffer_metadata or {}))
            )
            buffer, buffer_metadata = text, doc.metadata

    if buffer:
        merged_docs.append(
            Document(page_content=buffer, metadata=dict(buffer_metadata or {}))
        )

    return merged_docs


In [None]:
# Merge neighbouring semantic chunks using a 1200-character threshold.
merged_docs = merge_small_chunks(semantic_docs, min_chars=1200)

In [None]:
# Chunk count after merging.
len(merged_docs)

In [None]:

# Final pass: cap chunk size at 900 characters with a 120-character overlap.
# Separators are listed from the coarsest boundary (blank line) down to a
# single space.
final_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", "? ", "! ", " "],
    chunk_overlap=120,
    chunk_size=900,
)
docs = final_splitter.split_documents(merged_docs)


In [None]:
# Number of chunks after the final character-based split.
len(docs)

In [None]:
# Inspect one chunk (index 2) to sanity-check the split output.
print(docs[2].page_content)

In [None]:
# One generated question together with its answer; both fields are strings.
# Documented with comments rather than a class docstring on purpose: pydantic
# copies class docstrings into the generated schema, which would change the
# format instructions sent to the model.
class QAPair(BaseModel):
    question: str
    answer: str


# Top-level response schema: the list of pairs lives under the "qna_pairs" key.
# (Comments instead of a docstring, so the generated schema text is unchanged.)
class QAPairs(BaseModel):
    qna_pairs: List[QAPair]


In [None]:
# Parser bound to the QAPairs schema; used to produce the prompt's format instructions.
parser = PydanticOutputParser(pydantic_object=QAPairs)


In [None]:
# Text describing the expected JSON output; injected into the prompt template.
format_instructions = parser.get_format_instructions()

In [None]:
# final_prompt = prompt.invoke({"chunk": docs[0].page_content,
#                               "format_instructions": format_instructions})

In [None]:
# Prompt sent to the model for every chunk.
# `chunk` is supplied per call; `format_instructions` is bound once here.
# The instruction line is written in plain ASCII: the previous wording had a
# typo ("minimium"), a redundant clause and a mis-encoded dash, all of which
# were passed verbatim to the model.
prompt = PromptTemplate(
    template="""You are a data extraction system.

Generate at least 5 question-answer pairs.

CRITICAL RULES:
- Answers MUST copy wording directly from the text
- Do NOT paraphrase
- Do NOT summarize
- Use full sentences from the text
- Output ONLY valid JSON
- No markdown
- No explanations

TEXT:
{chunk}

JSON FORMAT:
{format_instructions}
""",
    input_variables=["chunk"],
    partial_variables={"format_instructions": format_instructions},
)


In [None]:
# Render the prompt for the first chunk as a single trial run.
final_prompt = prompt.invoke({"chunk": docs[0].page_content})

In [None]:
# Show the rendered prompt.
print(final_prompt)

In [None]:
# IMPORTANT: prompt.invoke() returns a PromptValue, not a str; convert it to
# plain text before handing it to the LLM.
prompt_text = final_prompt.to_string()

In [None]:
# Run the model once on the rendered prompt.
response = llm.invoke(prompt_text)

In [None]:
# Show the raw model output.
print(response)

In [None]:
import json
import re

def extract_json_from_text(text: str) -> dict:
    """
    Extract the first valid JSON object from model output.

    The text is scanned for "{" characters, and a JSON decode is attempted at
    each one, left to right, until one succeeds. Anything before the object
    (prose, markdown code fences) and anything after it (trailing commentary,
    stray braces) is ignored. Decoding in place, instead of matching from the
    first "{" to the last "}", is what keeps trailing braces from breaking
    the parse. The input is never rewritten, so fence-like sequences inside
    JSON string values are preserved.

    Args:
        text: raw model output.

    Returns:
        The first decodable JSON object, as a dict.

    Raises:
        ValueError: if the text contains no "{" at all.
        json.JSONDecodeError: (a ValueError subclass) if "{" characters exist
            but none starts a valid object; the error from the first attempt
            is raised, as it points at the outermost candidate.
    """
    decoder = json.JSONDecoder()
    first_error = None

    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and
            # tolerates arbitrary trailing text after it.
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError as err:
            if first_error is None:
                first_error = err
        else:
            return obj
        start = text.find("{", start + 1)

    if first_error is not None:
        raise first_error
    raise ValueError("No JSON object found")


In [None]:
# Parse the trial response to check the extractor on real model output.
res = extract_json_from_text(response)

In [None]:
# Show the parsed object.
print(res)

In [None]:
# Total number of chunks the generation loop will iterate over.
len(docs)

In [None]:
START_INDEX = 0  # index of the first chunk to process; raise it to resume an interrupted run

In [None]:
import json
import time

MAX_RETRIES = 3


def _qa_records(parsed_json):
    """
    Turn a parsed model response into chat-format JSONL records.

    Validates the whole response before anything is returned, so a malformed
    pair can never cause a partially written chunk.

    Args:
        parsed_json: object decoded from the model output.

    Returns:
        A list of ``{"messages": [user, assistant]}`` dicts, one per pair.

    Raises:
        ValueError: if "qna_pairs" is missing or not a list, or if any pair is
            not a dict with string "question" and "answer" values.
    """
    pairs = parsed_json.get("qna_pairs") if isinstance(parsed_json, dict) else None
    if not isinstance(pairs, list):
        raise ValueError("'qna_pairs' is missing or not a list")

    records = []
    for qa in pairs:
        question = qa.get("question") if isinstance(qa, dict) else None
        answer = qa.get("answer") if isinstance(qa, dict) else None
        if not (isinstance(question, str) and isinstance(answer, str)):
            raise ValueError(f"malformed pair: {qa!r}")
        records.append({
            "messages": [
                {"role": "user", "content": question.strip()},
                {"role": "assistant", "content": answer.strip()},
            ]
        })
    return records


# Generate Q&A pairs for every chunk from START_INDEX onwards and append them
# to QAPairs.jsonl.
# NOTE: the file is opened in append mode, so re-running this cell with the
# same START_INDEX duplicates records.
# NOTE(review): the pipeline decodes greedily (do_sample=False), so a retry
# with the same prompt reproduces the same output; retries only help if
# sampling is enabled.
with open("QAPairs.jsonl", "a", encoding="utf-8") as f:
    for i in range(START_INDEX, len(docs)):

        # 1. Build the prompt once per chunk; it does not change between attempts.
        prompt_text = prompt.invoke({"chunk": docs[i].page_content}).to_string()

        for attempt in range(1, MAX_RETRIES + 1):
            print(f"🔁 Chunk {i} | Attempt {attempt}/{MAX_RETRIES}")

            # 2. Call LLM
            raw_output = llm.invoke(prompt_text)

            # 3. Extract JSON (the extractor signals failure with ValueError)
            try:
                parsed_json = extract_json_from_text(raw_output)
            except ValueError as e:
                print(f"❌ JSON extraction failed at chunk {i}, attempt {attempt}: {e}")
                time.sleep(1)
                continue

            # 4. Validate the schema and build all records before writing any
            try:
                records = _qa_records(parsed_json)
            except ValueError as e:
                print(f"❌ Invalid schema at chunk {i}, attempt {attempt}: {e}")
                time.sleep(1)
                continue

            # 5. Write JSONL, then flush so finished chunks survive an interruption
            for record in records:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
            f.flush()

            print(f"✅ Successfully processed chunk {i}")
            break  # stop retrying for this chunk
        else:
            # Reached only when no attempt hit `break`.
            print(f"🚨 Skipping chunk {i} after {MAX_RETRIES} failed attempts")
