In [None]:
# ============ CELL 1: Setup ============
!apt-get update
!apt-get install -y tesseract-ocr tesseract-ocr-hin tesseract-ocr-urd poppler-utils
!pip install --upgrade pip
!pip install pytesseract pdf2image PyPDF2 pillow tqdm

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create output folder
import os
output_folder = '/content/drive/MyDrive/ncert_processed'
os.makedirs(output_folder, exist_ok=True)
print("Output folder ready at:", output_folder)


Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading packag

In [None]:
# ============ CELL 2: OCR Microservice ============
import json
import os
import re
import unicodedata
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, asdict
from typing import List, Dict

import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader

@dataclass
class ExtractedText:
    """Text extracted from a single PDF page, together with its provenance.

    Instances are created by OPEAOCRService and serialised with
    dataclasses.asdict when results are saved as JSON.
    """
    # Page text after it has passed through OPEAOCRService._clean_text.
    text: str
    # Path of the PDF the page was read from.
    source_file: str
    # grade / subject / language are copied from the caller-supplied metadata dict.
    grade: int
    subject: str
    language: str
    # 1-based page number within the source PDF.
    page_num: int
    # 'native' (embedded PDF text) or 'ocr' (Tesseract).
    extraction_method: str
    # Stays at the default 1.0 unless OCR confidence scoring was requested.
    confidence: float = 1.0

class OPEAOCRService:
    """Extract page-level text from PDFs.

    The embedded text layer of the PDF is tried first; when it yields too
    little text the pages are rasterised and read with Tesseract instead.
    Every page is returned as an ExtractedText record.
    """

    # Punctuation that survives cleaning: the ASCII marks, plus the sentence
    # punctuation of the other supported scripts so Hindi/Urdu sentence
    # boundaries are not lost.
    _KEPT_PUNCTUATION = frozenset(
        '.,?!:;-()'
        '\u0964\u0965'              # Devanagari danda and double danda
        '\u060C\u061B\u061F\u06D4'  # Arabic comma, semicolon, question mark; Urdu full stop
    )

    def __init__(self, supported_languages: List[str] = None):
        """Store the Tesseract language codes and the engine configuration.

        Args:
            supported_languages: Tesseract language codes; defaults to
                ['eng', 'hin', 'urd'] when omitted or empty.
        """
        self.supported_languages = supported_languages or ['eng','hin','urd']
        self.tesseract_config = '--oem 3 --psm 6'

    def extract_from_pdf(self, pdf_path: str, metadata: Dict, dpi: int=200, use_confidence: bool=False) -> List["ExtractedText"]:
        """Extract text from every page of a PDF.

        Args:
            pdf_path: Path of the PDF file.
            metadata: Dict with the keys 'grade', 'subject' and 'language';
                the values are copied onto every returned record.
            dpi: Rasterisation resolution used if OCR is needed.
            use_confidence: When True, OCR pages carry an averaged Tesseract
                confidence instead of the default 1.0.

        Returns:
            One ExtractedText per page (native extraction omits pages whose
            text layer is empty).
        """
        # Try native PDF extraction
        try:
            data = self._extract_native_pdf(pdf_path, metadata)
            # Heuristic: only the first page that produced text is inspected;
            # more than 100 cleaned characters counts as a usable text layer.
            if data and len(data[0].text) > 100:
                print(f"✓ Native extraction successful: {pdf_path}")
                return data
        except Exception as e:
            # Best effort: any failure in the native path falls through to OCR.
            print(f"Native extraction failed: {e}")

        print(f"Using OCR for: {pdf_path}")
        return self._extract_with_ocr(pdf_path, metadata, dpi, use_confidence)

    def _extract_native_pdf(self, pdf_path: str, metadata: Dict):
        """Read the embedded text layer; pages without text are skipped."""
        reader = PdfReader(pdf_path)
        extracted = []
        for page_num, page in enumerate(reader.pages, 1):
            text = page.extract_text() or ''
            if text.strip():
                extracted.append(ExtractedText(
                    text=self._clean_text(text),
                    source_file=pdf_path,
                    grade=metadata['grade'],
                    subject=metadata['subject'],
                    language=metadata['language'],
                    page_num=page_num,
                    extraction_method='native'
                ))
        return extracted

    def _ocr_page(self, page_tuple, lang_code, use_confidence):
        """OCR one rasterised page.

        Args:
            page_tuple: (page_num, image, source_file, metadata).
            lang_code: Tesseract language code for the page.
            use_confidence: When True, run a second Tesseract pass to obtain
                word confidences and store their mean scaled to 0-1.
        """
        page_num, image, source_file, metadata = page_tuple
        text = pytesseract.image_to_string(image, lang=lang_code, config=self.tesseract_config)
        avg_confidence = 1.0

        if use_confidence:
            # Use the same engine configuration as the text pass so the score
            # describes the segmentation that produced `text`.
            data = pytesseract.image_to_data(
                image,
                lang=lang_code,
                config=self.tesseract_config,
                output_type=pytesseract.Output.DICT,
            )
            avg_confidence = self._mean_confidence(data.get('conf', []))

        print(f"✅ Processed page {page_num} of {metadata['subject']} ({metadata['language']})")
        return ExtractedText(
            text=self._clean_text(text),
            source_file=source_file,
            grade=metadata['grade'],
            subject=metadata['subject'],
            language=metadata['language'],
            page_num=page_num,
            extraction_method='ocr',
            confidence=avg_confidence
        )

    @staticmethod
    def _mean_confidence(raw_confidences) -> float:
        """Average Tesseract confidences (0-100) and scale the result to 0-1.

        Tesseract reports -1 for boxes that contain no recognised text. The
        values may arrive as ints, floats or strings, so each one is parsed
        numerically instead of being compared with the string '-1'.
        Returns 0.0 when no usable value is present.
        """
        scores = []
        for raw in raw_confidences:
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue
            if value >= 0:
                scores.append(value)
        return sum(scores) / len(scores) / 100.0 if scores else 0.0

    def _extract_with_ocr(self, pdf_path: str, metadata: Dict, dpi: int=200, use_confidence: bool=False):
        """Rasterise every page of the PDF and OCR the pages on 4 worker threads."""
        images = convert_from_path(pdf_path, dpi=dpi)
        lang_code = self._get_tesseract_lang(metadata['language'])
        page_tuples = [(i+1, img, pdf_path, metadata) for i, img in enumerate(images)]

        with ThreadPoolExecutor(max_workers=4) as executor:
            # executor.map yields results in input order, so the returned list
            # is in page order even though pages finish out of order.
            return list(executor.map(
                lambda args: self._ocr_page(args, lang_code, use_confidence),
                page_tuples,
            ))

    def _get_tesseract_lang(self, language: str) -> str:
        """Map a language name (case-insensitive) to its Tesseract code.

        Names other than english/hindi/urdu fall back to 'eng'.
        """
        lang_map = {'english':'eng', 'hindi':'hin', 'urdu':'urd'}
        return lang_map.get(language.lower(), 'eng')

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace and drop symbols while keeping words intact.

        Kept: letters, digits, underscore, spaces, the punctuation in
        _KEPT_PUNCTUATION, and Unicode combining marks. The marks must be kept
        explicitly: Devanagari vowel signs and viramas (and Urdu diacritics)
        are not matched by the regex class \\w, so filtering on \\w alone
        removes them and corrupts every Hindi/Urdu word.
        """
        collapsed = re.sub(r'\s+', ' ', text)
        kept = [ch for ch in collapsed if self._is_kept_char(ch)]
        return ''.join(kept).strip()

    @classmethod
    def _is_kept_char(cls, ch: str) -> bool:
        """Return True if the cleaner should keep this single character."""
        return (
            ch == ' '
            or ch == '_'
            or ch.isalnum()
            or ch in cls._KEPT_PUNCTUATION
            or unicodedata.category(ch).startswith('M')
        )

    def save_extracted_data(self, extracted: List["ExtractedText"], output_path: str):
        """Write the records to output_path as a UTF-8 JSON array."""
        data_dict = [asdict(e) for e in extracted]
        with open(output_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-Latin text readable in the file.
            json.dump(data_dict, f, ensure_ascii=False, indent=2)
        print(f"✓ Saved {len(extracted)} pages to {output_path}")


In [None]:
# ============ CELL 3: Process PDFs ============
from tqdm import tqdm

ocr_service = OPEAOCRService()

# Each job pairs a PDF path with the metadata stamped onto every page record.
pdf_files = [
    {
        'path': '/content/science.pdf',
        'metadata': {'grade': 6, 'subject': 'science', 'language': 'english'},
    },
    {
        'path': '/content/Vigyan.pdf',
        'metadata': {'grade': 6, 'subject': 'science', 'language': 'hindi'},
    },
]

for pdf_info in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_path, metadata = pdf_info['path'], pdf_info['metadata']

    # Native text layer first, Tesseract as the fallback.
    extracted = ocr_service.extract_from_pdf(
        pdf_path,
        metadata,
        dpi=200,
        use_confidence=False,
    )

    # One JSON file per grade/subject/language combination.
    output_filename = f"{metadata['grade']}_{metadata['subject']}_{metadata['language']}_extracted.json"
    output_path = os.path.join(output_folder, output_filename)
    ocr_service.save_extracted_data(extracted, output_path)

print("\n✅ OCR extraction completed!")


Processing PDFs:   0%|          | 0/2 [00:00<?, ?it/s]

Using OCR for: /content/science.pdf
✅ Processed page 4 of science (english)
✅ Processed page 1 of science (english)
✅ Processed page 6 of science (english)
✅ Processed page 5 of science (english)
✅ Processed page 2 of science (english)
✅ Processed page 7 of science (english)
✅ Processed page 3 of science (english)
✅ Processed page 9 of science (english)
✅ Processed page 11 of science (english)
✅ Processed page 8 of science (english)
✅ Processed page 10 of science (english)
✅ Processed page 12 of science (english)
✅ Processed page 15 of science (english)
✅ Processed page 13 of science (english)
✅ Processed page 14 of science (english)
✅ Processed page 17 of science (english)
✅ Processed page 16 of science (english)
✅ Processed page 18 of science (english)
✅ Processed page 21 of science (english)
✅ Processed page 19 of science (english)
✅ Processed page 22 of science (english)
✅ Processed page 20 of science (english)
✅ Processed page 23 of science (english)
✅ Processed page 24 of science

Processing PDFs:  50%|█████     | 1/2 [15:26<15:26, 926.48s/it]

✅ Processed page 132 of science (english)
✓ Saved 136 pages to /content/drive/MyDrive/ncert_processed/6_science_english_extracted.json
Using OCR for: /content/Vigyan.pdf
✅ Processed page 4 of science (hindi)
✅ Processed page 1 of science (hindi)
✅ Processed page 6 of science (hindi)
✅ Processed page 5 of science (hindi)
✅ Processed page 2 of science (hindi)
✅ Processed page 7 of science (hindi)
✅ Processed page 3 of science (hindi)
✅ Processed page 11 of science (hindi)
✅ Processed page 12 of science (hindi)
✅ Processed page 9 of science (hindi)
✅ Processed page 10 of science (hindi)
✅ Processed page 8 of science (hindi)
✅ Processed page 13 of science (hindi)
✅ Processed page 15 of science (hindi)
✅ Processed page 14 of science (hindi)
✅ Processed page 17 of science (hindi)
✅ Processed page 16 of science (hindi)
✅ Processed page 18 of science (hindi)
✅ Processed page 21 of science (hindi)
✅ Processed page 22 of science (hindi)
✅ Processed page 19 of science (hindi)
✅ Processed page 20 

Processing PDFs: 100%|██████████| 2/2 [35:06<00:00, 1053.10s/it]

✅ Processed page 134 of science (hindi)
✓ Saved 136 pages to /content/drive/MyDrive/ncert_processed/6_science_hindi_extracted.json

✅ OCR extraction completed!





In [None]:
# ============ CELL 4: Quality Check ============
# Preview one JSON: `output_path` is the file written last by the processing
# loop, so that cell has to run before this one.
import json
with open(output_path, 'r', encoding='utf-8') as f:
    sample_data = json.load(f)

print(f"Total pages extracted: {len(sample_data)}")
if len(sample_data) >= 5:
    # Read every field from the same record. The preview used index 10 while
    # the other fields used index 4, which mixed two pages and raised
    # IndexError for files holding 5-10 records.
    sample_page = sample_data[4]
    # Report the stored page number: native extraction skips empty pages, so
    # the fifth record is not necessarily page 5.
    print(f"\nSample page (page {sample_page['page_num']}):")
    print(f"Text preview: {sample_page['text'][:500]}...")
    print(f"Extraction method: {sample_page['extraction_method']}")
    print(f"Confidence: {sample_page['confidence']:.2f}")


Total pages extracted: 136

Sample page (page 5):
Text preview: वषय-सच आमख य अधयय  भजन क घटक  अधयय 2 वसतओ क समह बनन व अधयय 3 पदरथ क पथककरण 20 अधयय 4 पध क जनए 34 अधयय 5 शरर म गत 45 अधयय 6 सजव - वशषतए एव आवस 58 अधयय 7 गत एव दरय क मपन पव अधयय 8 परकश-छयए एव परवरतन 86 रवव079560 2023-24...
Extraction method: ocr
Confidence: 1.00
