# Telugu OCR Dataset Builder
Builds a high-quality OCR training dataset from page images and their ground-truth text.

## Setup
1. Upload `source_images/` and `ground_truth/` to your Google Drive project folder (the folder that `DRIVE_ROOT` in the config cell points to)
2. Run this notebook

In [None]:
# Mount Google Drive
# The mount point below is the prefix of DRIVE_ROOT in the config cell, so the
# source images, ground truth, and output dataset all live on Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies
# System packages: the Tesseract engine plus its Telugu language data
# (the OCR call in this notebook uses lang='tel').
!apt-get install -y tesseract-ocr tesseract-ocr-tel
# Python packages imported by the cells below.
!pip install pytesseract rapidfuzz opencv-python tqdm

In [None]:
# Config - UPDATE THESE PATHS
DRIVE_ROOT = "/content/drive/MyDrive/telugu-ocr"  # Your Drive folder
SOURCE_IMAGES = f"{DRIVE_ROOT}/source_images/images"  # searched recursively for *.jpg page images
GROUND_TRUTH = f"{DRIVE_ROOT}/ground_truth"  # searched recursively for *.json ground-truth files
OUTPUT_DIR = f"{DRIVE_ROOT}/dataset"  # NOTE: deleted and recreated by the pipeline cell on every run

# A detected paragraph is kept only if OCR found at least this many text lines in it.
MIN_LINES = 4
# Minimum fuzzy-match score between the OCR text and the ground truth for a
# paragraph to be kept (rapidfuzz's fuzz.ratio scores range from 0 to 100).
MIN_MATCH_SCORE = 95

In [None]:
import pytesseract
import cv2
import numpy as np
import json
import shutil
import re
import os
from pathlib import Path
from collections import defaultdict
from rapidfuzz import fuzz
from tqdm.notebook import tqdm

# pathlib versions of the configured string paths; the functions and cells
# below use these rather than the raw strings.
SOURCE_IMAGES_DIR = Path(SOURCE_IMAGES)
GROUND_TRUTH_DIR = Path(GROUND_TRUTH)
OUTPUT = Path(OUTPUT_DIR)

def split_into_sentences(text):
    """Split ground-truth text into sentence-like chunks.

    The text is split on runs of full stops, newlines, and the danda /
    double danda sentence terminators (U+0964 / U+0965). Each chunk is
    stripped of surrounding whitespace, and chunks of five characters or
    fewer are discarded.

    Args:
        text: Ground-truth text to split.

    Returns:
        The remaining chunks as a list of strings, in document order.
    """
    # The danda is written as a \u escape on purpose: the literal character
    # had been corrupted by a bad encoding round-trip, which left unrelated
    # Latin-1 symbols in the character class and meant the text was never
    # split on the danda at all.
    chunks = re.split(r'[.\n\u0964\u0965]+', text)
    stripped = (chunk.strip() for chunk in chunks)
    return [chunk for chunk in stripped if len(chunk) > 5]

def find_best_sentence_range(tesseract_text, sentences):
    """Find the run of consecutive sentences that best matches the OCR text.

    Every window of 1 to 15 consecutive sentences is joined with single
    spaces and scored against the whitespace-normalised OCR text with
    ``fuzz.ratio``. Windows shorter than 70% of the OCR text length are not
    scored. On equal scores the first window encountered wins (smaller
    windows first, then earlier start positions).

    Args:
        tesseract_text: Raw OCR text for one paragraph.
        sentences: Ground-truth sentences, in document order.

    Returns:
        A ``(matched_text, score)`` tuple, or ``(None, 0)`` when the OCR text
        is blank, there are no sentences, or no window scored above zero.
    """
    if not tesseract_text.strip() or not sentences:
        return None, 0

    normalized = " ".join(tesseract_text.split())
    min_candidate_len = len(normalized) * 0.7
    total = len(sentences)
    window_limit = min(16, total + 1)

    top_text = None
    top_score = 0
    for window in range(1, window_limit):
        last_start = total - window
        for offset in range(last_start + 1):
            candidate = " ".join(sentences[offset:offset + window])
            if len(candidate) >= min_candidate_len:
                current = fuzz.ratio(normalized, candidate)
                if current > top_score:
                    top_text = candidate
                    top_score = current
    return top_text, top_score

def build_story_index():
    """Map each (pdf_stem, page_number) to the ground-truth text covering it.

    Every ``*.json`` file under ``GROUND_TRUTH_DIR`` is read (recursively).
    Three layouts are accepted, tried in this order:

    * ``{"stories": [story, ...]}``
    * ``{"story": story}``
    * a top-level object that itself has a ``"content"`` key

    A story's ``"content"`` is appended to every page from its
    ``"pdf_page_start"`` (default 1) to its ``"pdf_page_end"`` (default: the
    start page), inclusive. When several stories share a page their contents
    are concatenated, each preceded by a newline.

    Files that cannot be read or parsed, and files whose top level is not a
    JSON object, are skipped and counted; the count is printed at the end.

    Returns:
        dict mapping ``(json_file_stem, page_number)`` to the accumulated
        ground-truth text for that page.
    """
    print("Building story index...")
    page_to_story = {}
    skipped = 0
    for json_file in tqdm(list(GROUND_TRUTH_DIR.rglob("*.json"))):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # A bare ``except`` here would also swallow KeyboardInterrupt.
            skipped += 1
            continue
        if not isinstance(data, dict):
            skipped += 1
            continue

        pdf_stem = json_file.stem
        stories = data.get("stories") or []  # also tolerates "stories": null
        if not stories and "story" in data:
            stories = [data["story"]]
        if not stories and "content" in data:
            stories = [data]

        for story in stories:
            if not isinstance(story, dict):
                continue
            content = story.get("content", "")
            if not content:
                continue
            try:
                start_page = int(story.get("pdf_page_start", 1))
                end_page = int(story.get("pdf_page_end", start_page))
            except (TypeError, ValueError):
                # Page bounds that are null or non-numeric cannot be indexed.
                continue
            for page in range(start_page, end_page + 1):
                key = (pdf_stem, page)
                page_to_story[key] = page_to_story.get(key, "") + "\n" + content

    if skipped:
        print(f"Skipped {skipped} unreadable or malformed JSON files")
    print(f"Indexed {len(page_to_story)} page mappings")
    return page_to_story

def parse_page_number(filename):
    """Extract a page number from an image file name.

    The extension is dropped first. A number following ``page`` (any case,
    optionally separated by ``_`` or ``-``) takes priority; otherwise a run
    of digits at the very end of the stem is used.

    Args:
        filename: File name or path of the page image.

    Returns:
        The page number as an int, or None when neither pattern matches.
    """
    name = Path(filename).stem
    patterns = (
        (r'page[_-]?(\d+)', re.IGNORECASE),
        (r'(\d+)$', 0),
    )
    for pattern, flags in patterns:
        found = re.search(pattern, name, flags)
        if found is not None:
            return int(found.group(1))
    return None

def extract_and_align(img_path, gt_text, output_dir, unique_id):
    """Crop OCR-detected paragraphs from a page and pair them with ground truth.

    Tesseract (Telugu model) is run on the page image and its words are
    grouped into paragraphs by ``(block_num, par_num)``. For each paragraph
    with at least ``MIN_LINES`` text lines, the OCR text is matched against
    the ground-truth sentences; if the best match scores at least
    ``MIN_MATCH_SCORE`` the paragraph is cropped (with 10 px padding, clamped
    to the image) and written to ``output_dir`` as a JPEG.

    Args:
        img_path: Path of the page image.
        gt_text: Ground-truth text expected to cover this page.
        output_dir: Directory (a ``Path``) receiving the paragraph crops.
        unique_id: Prefix used to build each paragraph id / file name.

    Returns:
        A list of dicts with keys ``id``, ``image``, ``text``,
        ``match_score`` and ``line_count``, one per saved paragraph. The
        list is empty when the image cannot be read, OCR fails, or nothing
        matches well enough.
    """
    results = []

    # Split first: with no ground-truth sentences nothing can be aligned, so
    # the slow OCR pass is skipped entirely.
    sentences = split_into_sentences(gt_text)
    if not sentences:
        return results

    img = cv2.imread(str(img_path))
    if img is None:
        return results

    try:
        data = pytesseract.image_to_data(img, lang='tel', output_type=pytesseract.Output.DICT)
    except Exception as exc:
        # Best effort per page: report the failure instead of hiding it, and
        # let KeyboardInterrupt through so the run can still be stopped.
        print(f"Tesseract failed on {img_path}: {exc}")
        return results

    paragraphs = defaultdict(list)
    for i, raw_text in enumerate(data['text']):
        text = raw_text.strip()
        if not text:
            continue
        try:
            # Confidence may be an int, a float, or a float-formatted string
            # such as '95.3' depending on the Tesseract/pytesseract versions.
            conf = int(float(data['conf'][i]))
        except (TypeError, ValueError):
            continue
        if conf <= 0:
            continue
        key = (data['block_num'][i], data['par_num'][i])
        paragraphs[key].append({
            'text': text, 'x': data['left'][i], 'y': data['top'][i],
            'w': data['width'][i], 'h': data['height'][i], 'line_num': data['line_num'][i]
        })

    img_h, img_w = img.shape[:2]
    para_idx = 0
    for words in paragraphs.values():
        # Bounding box of all words, padded by 10 px and clamped to the image.
        x_min = max(0, min(w['x'] for w in words) - 10)
        y_min = max(0, min(w['y'] for w in words) - 10)
        x_max = min(img_w, max(w['x'] + w['w'] for w in words) + 10)
        y_max = min(img_h, max(w['y'] + w['h'] for w in words) + 10)
        if x_max - x_min < 50 or y_max - y_min < 20:
            continue

        # Rebuild the paragraph text line by line, words ordered left to right.
        lines_dict = defaultdict(list)
        for w in sorted(words, key=lambda w: (w['line_num'], w['x'])):
            lines_dict[w['line_num']].append(w['text'])
        if len(lines_dict) < MIN_LINES:
            continue

        tess_text = '\n'.join(' '.join(ws) for ws in lines_dict.values())
        matched_gt, score = find_best_sentence_range(tess_text, sentences)
        if score < MIN_MATCH_SCORE or not matched_gt:
            continue

        para_id = f"{unique_id}_para_{para_idx:02d}"
        image_name = f"{para_id}.jpg"
        para_crop = img[y_min:y_max, x_min:x_max]
        if not cv2.imwrite(str(output_dir / image_name), para_crop):
            # Do not emit metadata that points at an image that was never written.
            print(f"Failed to write {output_dir / image_name}")
            continue
        results.append({"id": para_id, "image": image_name, "text": matched_gt, "match_score": score, "line_count": len(lines_dict)})
        para_idx += 1
    return results

In [None]:
# Run Pipeline
# Walks every page image, looks up its ground truth, extracts well-matched
# paragraph crops, and writes one JSON line of metadata per crop.
print("🚀 Telugu OCR Dataset Builder")

# Setup output
# NOTE: any previous dataset in OUTPUT is deleted before the new run.
if OUTPUT.exists(): shutil.rmtree(OUTPUT)
OUTPUT.mkdir(parents=True)
(OUTPUT / "images").mkdir()

# Build index
page_to_story = build_story_index()

# Get all images
# Sorted so that processing order (and therefore metadata.jsonl) does not
# depend on filesystem enumeration order.
all_images = sorted(SOURCE_IMAGES_DIR.rglob("*.jpg"))
print(f"Total images: {len(all_images)}")

# Process
all_results = []
stats = {"processed": 0, "with_gt": 0, "extracted": 0}

for img_path in tqdm(all_images):
    stats["processed"] += 1
    # The image's parent folder name (spaces replaced by underscores) is
    # looked up against the ground-truth JSON file stems.
    pdf_folder = img_path.parent.name
    pdf_stem = pdf_folder.replace(" ", "_")
    page_num = parse_page_number(img_path.name)
    if page_num is None: continue
    key = (pdf_stem, page_num)
    gt_text = page_to_story.get(key)
    if not gt_text: continue
    stats["with_gt"] += 1
    # Presumably the grandparent folder is named after the publication year;
    # it is only used to make the ids unique.
    year = img_path.parent.parent.name
    unique_id = f"{year}_{pdf_stem}_p{page_num:03d}"
    results = extract_and_align(img_path, gt_text, OUTPUT / "images", unique_id)
    if results:
        stats["extracted"] += len(results)
        all_results.extend(results)

# Save
# ensure_ascii=False keeps the Telugu text human-readable in the JSONL file.
with open(OUTPUT / "metadata.jsonl", 'w', encoding='utf-8') as f:
    for item in all_results:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("\n✅ Complete!")
print(f"   Processed: {stats['processed']}")
print(f"   With GT: {stats['with_gt']}")
print(f"   High-confidence: {stats['extracted']}")
print(f"   Output: {OUTPUT}")

In [None]:
# Check results
!wc -l {OUTPUT}/metadata.jsonl
!ls {OUTPUT}/images | head -20