In [None]:
pip install faiss-cpu

In [None]:
pip install -U langchain-community

In [None]:
!pip install pytesseract

In [None]:
pip install pdf2image

In [None]:
!apt-get install -y poppler-utils

In [None]:
!apt-get install -y tesseract-ocr-ben

In [None]:
!ls /usr/share/tesseract-ocr/4.00/tessdata/

In [None]:
# Clean up: remove old/broken fitz traces if any
!rm -rf /usr/local/lib/python3.11/dist-packages/fitz*

# Force reinstall PyMuPDF
!pip install --force-reinstall PyMuPDF


In [None]:
pip install pdfplumber

In [None]:
pip  install langdetect

In [None]:
from langdetect import detect, DetectorFactory
import fitz  # PyMuPDF
import cv2
import pytesseract
import numpy as np
import pdfplumber
import re
import string
import json
import re
import os
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import faiss
from typing import List, Dict, Optional

In [None]:
pdf_file = '/content/HSC26-Bangla1st-Paper.pdf'

In [None]:

def fix_roman_numerals(lines):
    """
    Repair common Tesseract artifacts in a list of OCR'd Bangla lines.

    For every input line, in this order:
    - a danda directly after a leading option marker (ক/খ/গ/ঘ) is dropped;
    - runs of two or three dandas and pipe characters are rewritten as
      lower-case Roman numerals (a single pipe becomes 'i'; a single danda
      is left untouched because it is the normal sentence terminator);
    - a consonant separated from its vowel sign by one whitespace character
      is re-joined (ক া → কা);
    - spacing around commas and runs of whitespace are normalised.

    Returns a new list with one repaired string per input line.
    """
    # Longest artifacts first so that "।।।" is not consumed as "।।" + "।".
    numeral_artifacts = (
        ("।।।", "iii"),
        ("।।", "ii"),
        ("||", "ii"),
        ("|", "i"),
    )

    def _repair(raw_line):
        # Option markers at the start of a line: strip the separator danda.
        text = re.sub(r'^([কখগঘ])।', r'\1', raw_line)

        # Roman-numeral artifacts (single danda is deliberately not in the table).
        for artifact, numeral in numeral_artifacts:
            text = text.replace(artifact, numeral)

        # Re-attach a vowel sign that OCR split away from its consonant.
        text = re.sub(r'([ক-হ])\s([ািীুূৃেৈোৌ])', r'\1\2', text)

        # Comma spacing first, then collapse all remaining whitespace runs.
        text = re.sub(r'\s*,\s*', ', ', text)
        return re.sub(r'\s+', ' ', text).strip()

    return [_repair(raw_line) for raw_line in lines]


In [None]:
def extract_lines_by_line(text):
    """
    Split ``text`` into lines and return the non-empty ones.

    Each line is stripped of leading/trailing whitespace; lines that are
    empty after stripping are dropped. No language detection or other
    filtering is applied.
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    return [candidate for candidate in stripped_lines if candidate]


In [None]:
def extract_tables_with_pdfplumber(pdf_path, page_number):
    """
    Extract every table pdfplumber finds on one page of a PDF.

    Args:
        pdf_path: path of the PDF file to open.
        page_number: 1-based page number (converted to an index into
            ``pdf.pages`` by subtracting 1).

    Returns:
        A list of tables; each table is a list of rows; each row is a list
        of cell strings, with ``None`` cells replaced by "".

    The raw rows are also printed as they are read.
    """
    print("Extract tables from specific page using pdfplumber")
    with pdfplumber.open(pdf_path) as pdf:
        target_page = pdf.pages[page_number - 1]

        tables_data = []
        for table_no, raw_table in enumerate(target_page.extract_tables(), start=1):
            print(f"Table {table_no} on page {page_number}:")
            table_rows = []
            for raw_row in raw_table:
                print(raw_row)
                # Missing cells come back as None; store them as empty strings.
                table_rows.append(["" if cell is None else str(cell) for cell in raw_row])
            tables_data.append(table_rows)
            print("-" * 50)

        return tables_data

In [None]:


DetectorFactory.seed = 0
def contains_bangla(text):
    """Return True if ``text`` has at least one character in U+0980–U+09FF (the Bangla block)."""
    return re.search(r'[\u0980-\u09FF]', text) is not None

def is_mostly_punct(text, threshold=0.6):
    """
    Tell whether ``text`` is dominated by ASCII punctuation and whitespace.

    Returns True for empty/falsy input, otherwise True when the share of
    characters that are whitespace or in ``string.punctuation`` is strictly
    greater than ``threshold``.
    """
    if not text:
        return True

    noise_chars = 0
    for ch in text:
        if ch.isspace() or ch in string.punctuation:
            noise_chars += 1

    return noise_chars / len(text) > threshold







def main_ocr(page_pixmap, color='g', black=120, psm=6, save_path=None, page_num=None):
    """
    Run Bengali Tesseract OCR on one rendered page and return its cleaned lines.

    Args:
        page_pixmap: rendered page object exposing ``tobytes("png")``.
        color: 'b' binarises the grayscale image at ``black``; any other
            value feeds the plain grayscale image to Tesseract.
        black: threshold passed to ``cv2.threshold`` (only used when color == 'b').
        psm: Tesseract page-segmentation mode, inserted into the config string.
        save_path: directory for the pre-processed image; the image is only
            written when both ``save_path`` and ``page_num`` are given.
        page_num: 0-based page index; the saved filename uses ``page_num + 1``.

    Returns:
        The OCR text split into non-empty lines and passed through
        ``fix_roman_numerals``. The raw OCR text is also printed.
    """
    # Decode the pixmap's PNG bytes into an OpenCV image, then drop to grayscale.
    png_buffer = np.frombuffer(page_pixmap.tobytes("png"), np.uint8)
    decoded = cv2.imdecode(png_buffer, cv2.IMREAD_COLOR)
    gray_image = cv2.cvtColor(decoded, cv2.COLOR_BGR2GRAY)

    img_processed = gray_image
    if color == 'b':
        img_processed = cv2.threshold(gray_image, black, 255, cv2.THRESH_BINARY)[1]

    if not (save_path is None or page_num is None):
        os.makedirs(save_path, exist_ok=True)
        filename = os.path.join(save_path, f'page_{page_num+1}_processed.png')
        cv2.imwrite(filename, img_processed)
        print(f"Saved processed image: {filename}")

    text = pytesseract.image_to_string(img_processed, lang='ben', config=f'--oem 3 --psm {psm}')

    print("text........")
    print(text)

    return fix_roman_numerals(extract_lines_by_line(text))




def page_has_table(pdf_path, page_number):
    """
    Report whether pdfplumber detects at least one table on a page.

    ``page_number`` is used directly as an index into ``pdf.pages``, i.e. it
    is 0-based here (unlike ``extract_tables_with_pdfplumber``, which takes a
    1-based page number).
    """
    with pdfplumber.open(pdf_path) as pdf:
        detected_tables = pdf.pages[page_number].find_tables()
        return len(detected_tables) > 0






def extract_text_from_pdf(pdf_path, color='g', black=80, save_images=False):
    """
    OCR every page of a PDF and return the per-page results.

    Args:
        pdf_path: path of the PDF to process.
        color: forwarded to ``main_ocr`` ('b' = binarise, otherwise grayscale).
        black: default binarisation threshold; pages 2 and 41 always use 120.
        save_images: when True, pre-processed page images are written to
            the "processed_images" directory.

    Returns:
        A list with one dict per page: ``{"page": <1-based number>,
        "lines": [...]}``. Pages 2 and 41 additionally carry "table_ocr" and
        "table_data", both holding the tables pdfplumber extracted from the
        page's text layer (this is not a second OCR pass).

    Render settings per page:
        - page 41: PSM 6, DPI 300
        - page 2:  PSM 6, DPI 350
        - other pages: PSM 11 / DPI 300 when pdfplumber finds a table,
          otherwise PSM 6 / DPI 350
    """
    special_pages = (2, 41)
    all_pages = []
    save_path = "processed_images" if save_images else None

    # The context manager guarantees the document handle is released even
    # if OCR fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page_index = page_num + 1
            print(f"\nProcessing page {page_index}...")

            # Decide threshold (black value)
            if page_index in special_pages:
                black_val = 120
                print("Using black=120 for better contrast")
            else:
                black_val = black

            # Decide PSM/DPI. Log messages are built from the values actually
            # used so they cannot drift from the code (page 41 renders at
            # 300 DPI; the old message claimed 350).
            if page_index == 41:
                dpi, psm = 300, 6
                print(f"Page 41 forced — PSM {psm}, DPI {dpi}")
            elif page_index == 2:
                dpi, psm = 350, 6
                print(f"Page 2 forced — PSM {psm}, DPI {dpi}")
            elif page_has_table(pdf_path, page_num):
                dpi, psm = 300, 11
                print("Table detected — using PSM 11 and DPI 300")
            else:
                dpi, psm = 350, 6
                print("No table detected — using PSM 6 and DPI 350")

            # OCR
            pix = doc.load_page(page_num).get_pixmap(dpi=dpi)
            lines = main_ocr(pix, color=color, black=black_val, psm=psm,
                             save_path=save_path, page_num=page_num)

            page_data = {
                "page": page_index,
                "lines": lines
            }

            if page_index in special_pages:
                print(f"\n=== Extracting additional OCR from table areas on page {page_index} ===")
                # One extraction feeds both keys; the second key gets its own
                # copy so that mutating one never affects the other.
                tables = extract_tables_with_pdfplumber(pdf_path, page_index)
                page_data["table_ocr"] = tables
                page_data["table_data"] = [[list(row) for row in table] for table in tables]

            all_pages.append(page_data)

    return all_pages







In [None]:

output_dir = "raw_data_per_page"

def _table_entries_to_lines(entries):
    """Flatten table data into tab-separated text lines.

    Accepts a list of tables (each a list of rows, each row a list of
    cells), a flat list of rows, or a list of scalars. Falsy cells are
    written as empty strings.
    """
    def _join(cells):
        return "\t".join(str(cell) if cell else '' for cell in cells)

    text_lines = []
    for entry in entries:
        if isinstance(entry, list) and any(isinstance(item, list) for item in entry):
            # A table: emit one line per row.
            for row in entry:
                text_lines.append(_join(row) if isinstance(row, list) else str(row))
        elif isinstance(entry, list):
            # Already a single row of cells.
            text_lines.append(_join(entry))
        else:
            text_lines.append(str(entry))
    return text_lines


def save_results(data, output_dir):
    """Save page data to one JSON file plus individual TXT files per page.

    Args:
        data: list of page dicts with "page" and "lines"; pages 2 and 41 may
            also carry "table_ocr" and "table_data".
        output_dir: directory to write into (created if missing).

    Files written:
        all_pages.json                 the complete ``data`` structure
        page_<n>.txt                   the page's lines, newline-joined
        page_<n>_table_ocr.txt         pages 2/41 only: one tab-separated
                                       line per table row
        page_<n>_table_data.json       pages 2/41 only: raw table structure
    """
    os.makedirs(output_dir, exist_ok=True)

    # Save complete data to JSON
    json_path = os.path.join(output_dir, "all_pages.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Save each page as separate TXT file
    for page in data:
        txt_path = os.path.join(output_dir, f"page_{page['page']}.txt")
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(page['lines']))

        # For pages 2 and 41, save additional table data if available
        if page['page'] in [2, 41]:
            if 'table_ocr' in page:
                table_ocr_path = os.path.join(output_dir, f"page_{page['page']}_table_ocr.txt")
                with open(table_ocr_path, 'w', encoding='utf-8') as f:
                    # table_ocr is a list of tables, so rows sit one level
                    # deeper than the top-level list; flatten before writing.
                    f.write("\n".join(_table_entries_to_lines(page['table_ocr'])))

                print(f"Saved table OCR for page {page['page']}: {table_ocr_path}")

            if 'table_data' in page:
                table_json_path = os.path.join(output_dir, f"page_{page['page']}_table_data.json")
                with open(table_json_path, 'w', encoding='utf-8') as f:
                    json.dump(page['table_data'], f, ensure_ascii=False, indent=2)
                print(f"Saved table data for page {page['page']}: {table_json_path}")

    print(f"Saved results to {output_dir}")
    print(f"- raw JSON: {json_path}")
    print(f"- raw TXT files: {len(data)} pages")

# Run the OCR pipeline over the whole PDF, echo every page's lines, and
# persist a per-page copy of the results.
pdf_path = "/content/HSC26-Bangla1st-Paper.pdf"
results = extract_text_from_pdf(pdf_path, color='b', black=80, save_images=True)

raw_data = []
for page in results:
    print(f"\nPage {page['page']} - Extracted lines:")
    for line in page["lines"]:
        print(line)

    # Always keep the page number and its lines.
    page_data = {key: page[key] for key in ("page", "lines")}

    # Pages 2 and 41 may carry extra table payloads; copy whichever exist.
    if page["page"] in [2, 41]:
        for table_key in ("table_ocr", "table_data"):
            if table_key in page:
                page_data[table_key] = page[table_key]

    raw_data.append(page_data)

save_results(raw_data, output_dir)

In [None]:


# Relative path, matching where save_results() wrote the file and where the
# cleaning step reads it from.
json_path = os.path.join("raw_data_per_page", "all_pages.json")


def _table_ocr_to_lines(entries):
    """Flatten table_ocr into tab-separated text lines.

    table_ocr is a list of tables (each a list of rows of cells); a flat
    list of rows or of scalars is also accepted. Falsy cells become "".
    """
    def _join(cells):
        return "\t".join(str(cell) if cell else '' for cell in cells)

    text_lines = []
    for entry in entries:
        if isinstance(entry, list) and any(isinstance(item, list) for item in entry):
            # A table: one output line per row.
            for row in entry:
                text_lines.append(_join(row) if isinstance(row, list) else str(row))
        elif isinstance(entry, list):
            # Already a single row of cells.
            text_lines.append(_join(entry))
        else:
            text_lines.append(str(entry))
    return text_lines


# Load existing JSON
with open(json_path, 'r', encoding='utf-8') as f:
    pages = json.load(f)

# Process each page
for page in pages:
    # Remove table_data key if present
    page.pop('table_data', None)

    # Merge table_ocr into lines if it exists, then drop the key. Because the
    # key is removed, re-running this cell on the rewritten file is a no-op.
    if isinstance(page.get('table_ocr'), list):
        page['lines'].extend(_table_ocr_to_lines(page.pop('table_ocr')))

# Save back the modified JSON (overwrite)
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(pages, f, ensure_ascii=False, indent=2)

print(f"Updated JSON saved to {json_path}")


In [None]:


input_json = "raw_data_per_page/all_pages.json"
output_cleaned_json = "clean_data/all_pages_cleaned.json"
output_chunks_json = "clean_data/faiss_ready_chunks.json"

# Hyphen-minus plus the Unicode hyphen/dash family (U+2010–U+2015).
DASH_CHARS = r"[-‐‑‒–—―]"

def clean_line(line):
    """
    Strip OCR/table noise from one line of text.

    Removes table-border characters, every dash variant, square brackets,
    '€' and '*', digit runs of four or more (ASCII or Bangla digits), the
    "আনলাইন ব্যাচ …" batch tag, "SL Ans" headers and "Page N" markers.
    Runs of dots collapse to one dot, dandas become '.', whitespace is
    normalised and surrounding quotes are trimmed.

    Returns the cleaned string, or None when nothing is left. One- to
    three-digit numbers are kept.
    """
    # Remove table borders, pipes, dashes
    line = re.sub(r'[|│┃╏═─—_]+', '', line)
    line = re.sub(r'[-]{2,}', '', line)
    line = re.sub(r'\[', '', line)
    line = re.sub(DASH_CHARS, '', line)
    line = re.sub(r'\.{2,}', '.', line)

    # Remove unwanted symbols
    line = re.sub(r'[€\]\*]', '', line)

    # Remove digit sequences that are 4 or more digits long
    line = re.sub(r'[০-৯0-9]{4,}', '', line)

    line = line.replace('।', '.')

    # Remove batch tags and metadata
    line = re.sub(r'আনলাইন\s*ব্যাচ\s*[\w০-৯ঃ:]*', '', line)
    line = re.sub(r'\bSL\s*Ans\b', '', line, flags=re.IGNORECASE)
    line = re.sub(r'\bPage\s*\d+\b', '', line, flags=re.IGNORECASE)

    # Normalize spaces only after all removals, so that deleting a token
    # from the middle of a line cannot leave a double space behind.
    line = re.sub(r'\s+', ' ', line).strip()

    # Remove leading/trailing quotes
    line = line.strip(' "\'”“‘’')

    # Skip empty lines (but keep single and double digits)
    if not line:
        return None

    return line







# -------- Step 2: Load and deduplicate --------
with open(input_json, "r", encoding="utf-8") as f:
    data = json.load(f)

cleaned_pages = {}

for entry in data:
    page = entry.get("page")
    # Page 1 is skipped entirely.
    if page == 1:
        continue
    lines = entry.get("lines", [])

    # --- Special case for page 41: keep only lines from "SL" onward ---
    if page == 41:
        for idx, line in enumerate(lines):
            if "SL" in line or "Sl" in line or "sl" in line:
                # Keep only lines from here onward
                lines = lines[idx:]
                break

    # Only the first entry seen for a page number is kept.
    if page not in cleaned_pages:
        cleaned_lines = []
        for line in lines:
            cleaned = clean_line(line)
            if cleaned:
                cleaned_lines.append(cleaned)
        cleaned_pages[page] = cleaned_lines


# Save cleaned per-page JSON. The output directory is not created anywhere
# else, so make sure it exists before opening the file for writing.
os.makedirs(os.path.dirname(output_cleaned_json) or ".", exist_ok=True)
final_cleaned = [{"page": p, "lines": cleaned_pages[p]} for p in sorted(cleaned_pages.keys())]
with open(output_cleaned_json, "w", encoding="utf-8") as f:
    json.dump(final_cleaned, f, ensure_ascii=False, indent=2)

print(f"Cleaned per-page text saved to: {output_cleaned_json} (Pages: {len(final_cleaned)})")

# -------- Step 3: Chunk for E5 embeddings --------
def chunk_for_e5(text, max_chars=1200, overlap=200):
    """
    Break text into overlapping character windows for an E5 embedding model.

    Args:
        text: the text to split.
        max_chars: maximum characters per chunk (before the prefix is added).
        overlap: characters shared between consecutive chunks; must satisfy
            0 <= overlap < max_chars.

    Returns:
        A list of strings, each prefixed with "passage: ". Windows that are
        empty after stripping are skipped. Chunking stops as soon as a
        window reaches the end of the text, so no trailing chunk is emitted
        that is already fully contained in the previous one.

    Raises:
        ValueError: if max_chars is not positive or overlap is out of range
            (an overlap >= max_chars would otherwise never advance).
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if not 0 <= overlap < max_chars:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chars")

    step = max_chars - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_chars, len(text))
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(f"passage: {chunk}")
        if end == len(text):
            # The whole remainder is covered; a further window would only
            # repeat the overlap region.
            break
        start += step
    return chunks

# Build one record per chunk, with ids of the form page<N>_chunk<i> (i is 1-based).
all_chunks = []
for page_data in final_cleaned:
    page_num = page_data["page"]
    full_text = " ".join(page_data["lines"])

    chunks = chunk_for_e5(full_text, max_chars=1200, overlap=200)
    for i, chunk in enumerate(chunks, start=1):
        all_chunks.append({
            "id": f"page{page_num}_chunk{i}",
            "page": page_num,
            "text": chunk
        })

# Save FAISS-ready chunks JSON (make sure the target directory exists first).
os.makedirs(os.path.dirname(output_chunks_json) or ".", exist_ok=True)
with open(output_chunks_json, "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print(f"FAISS-ready chunks saved to: {output_chunks_json} (Total chunks: {len(all_chunks)})")


In [None]:


# Sanity check: reload the chunk file written by the cleaning step and report
# how many chunk records it contains.
with open("clean_data/faiss_ready_chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

print(f"Total chunks: {len(chunks)}")


In [None]:
# Load the sentence-embedding model used for both the chunks and the queries.

embedder = SentenceTransformer("intfloat/multilingual-e5-base")

In [None]:


# Load the chunk records and embed their text.
# Each "text" already carries the "passage: " prefix added by chunk_for_e5.
with open("clean_data/faiss_ready_chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

texts = [c["text"] for c in chunks]
# normalize_embeddings=True is requested so the vectors can be compared with
# an inner-product index later on.
embeddings = embedder.encode(texts, normalize_embeddings=True)

In [None]:
# Report the embedding matrix layout: rows are chunks, columns are vector dimensions.
print(f"Embeddings shape: {embeddings.shape}")
print(f"Number of chunks (embeddings): {embeddings.shape[0]}")
print(f"Embedding dimension: {embeddings.shape[1]}")


In [None]:


# Convert to a float32 NumPy array before handing the vectors to the FAISS index.
embeddings_np = np.array(embeddings, dtype='float32')


In [None]:
# Display (n_chunks, dim) as the cell output.
embeddings_np.shape

In [None]:


# Build a flat (exhaustive) inner-product index over all chunk embeddings.
# Inner product matches cosine similarity here only because the embeddings
# were produced with normalize_embeddings=True.
dim = embeddings_np.shape[1]  # should be 768 for multilingual-e5-base
index = faiss.IndexFlatIP(dim)  # Inner product (cosine similarity)
index.add(embeddings_np)        # Add all your vectors
print(f"FAISS index built with {index.ntotal} vectors")


In [None]:
# Load the seq2seq generator and its tokenizer.
# NOTE(review): neither object is used by the cells visible below, which only
# build and print prompts — confirm whether a generation step is still planned.
gen_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl")
gen_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")

In [None]:
def build_prompt_from_query(query, embedder, index, chunks, top_k=1):
    """
    Retrieve the chunks most similar to ``query`` and build a Bangla QA prompt.

    Args:
        query: the user's question.
        embedder: object with ``encode(list_of_str, normalize_embeddings=True)``.
        index: vector index with ``search(query_vectors, k)`` returning
            ``(scores, indices)``.
        chunks: list of dicts with a "text" key, in the same order as the
            vectors stored in ``index``.
        top_k: number of chunks to retrieve.

    Returns:
        A tuple ``(prompt, retrieved_docs)``: the prompt string and the list
        of retrieved chunk texts (callers must unpack both).

    The retrieved chunks are also printed sentence by sentence for debugging.
    """
    # The chunks were embedded with a "passage: " prefix, so the query needs
    # the matching "query: " prefix that E5 models expect. The prompt itself
    # still shows the question without the prefix.
    e5_query = query if query.startswith("query: ") else f"query: {query}"
    query_emb = embedder.encode([e5_query], normalize_embeddings=True)
    query_emb = np.array(query_emb, dtype='float32')

    # Search top_k relevant chunks
    _, indices = index.search(query_emb, top_k)
    indices = indices[0]

    # FAISS pads missing results with -1; a bare `i < len(chunks)` check would
    # let -1 through and silently pick the last chunk.
    valid_indices = [int(i) for i in indices if 0 <= i < len(chunks)]

    # Retrieve the text chunks
    retrieved_docs = [chunks[idx]['text'] for idx in valid_indices]

    # Print retrieved chunks for debugging
    print("Retrieved chunks for prompt:")
    for i, text in enumerate(retrieved_docs, 1):
        print(f"Chunk {i}:")
        split_text = re.split(r'[।?\.]', text)
        for sentence in split_text:
            sentence = sentence.strip()
            if sentence:
                print(sentence)
        print('-' * 50)

    # Build the final prompt
    context = "\n".join(retrieved_docs)
    prompt = (
        f"প্রশ্ন: {query}\n"
        f"নীচের তথ্য থেকে প্রশ্নের সঠিক উত্তর খুঁজে বের করুন।\n"
        f"উত্তরটি এমন অংশ থেকে নিন যেটাতে 'SL Ans', 'উত্তর', বা সমমানের অন্য কোনো চিহ্ন রয়েছে।\n"
        f"যদি স্পষ্ট উত্তর না পাওয়া যায়, তাহলে 'উত্তর পাওয়া যায়নি' লিখুন।\n"
        f"উত্তর বাংলায় লিখুন:\n\n"
        f"{context}\n"
    )

    # Return both: the prompt and the retrieved chunks
    return prompt, retrieved_docs


In [None]:
query = "কে পণ করিয়াছে বিবাহ করিবে না।"
# build_prompt_from_query returns (prompt, retrieved_docs); unpack so that
# `prompt` is the prompt string rather than the whole tuple.
prompt, retrieved_docs = build_prompt_from_query(query, embedder, index, chunks, top_k=1)
print("Prompt for model:\n", prompt)


In [None]:
query = "অনুপমের বাবা কী করে জীবিকা নির্বাহ করতেন?"
# build_prompt_from_query returns (prompt, retrieved_docs); unpack so that
# `prompt` is the prompt string rather than the whole tuple.
prompt, retrieved_docs = build_prompt_from_query(query, embedder, index, chunks, top_k=1)
print("Prompt for model:\n", prompt)


In [None]:
query = "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?"
# build_prompt_from_query returns (prompt, retrieved_docs); unpack so that
# `prompt` is the prompt string rather than the whole tuple.
prompt, retrieved_docs = build_prompt_from_query(query, embedder, index, chunks, top_k=1)
print("Prompt for model:\n", prompt)


In [None]:
query = "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"
# build_prompt_from_query returns (prompt, retrieved_docs); unpack so that
# `prompt` is the prompt string rather than the whole tuple.
prompt, retrieved_docs = build_prompt_from_query(query, embedder, index, chunks, top_k=1)
print("Prompt for model:\n", prompt)
