In [None]:
# ============ CELL 1: Setup ============
# Notebook-only cell: the `!` lines are IPython shell escapes, not Python.
# Install the libraries into the current runtime.
!pip install spacy sentence-transformers
# Download the small English spaCy pipeline that OPEAChunkingService loads.
!python -m spacy download en_core_web_sm

# Mount Google Drive at /content/drive; the input/output paths used in the
# later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m128.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Mounted at /content/drive


In [None]:

# ============ CELL 2: OPEA Chunking Microservice ============
import json
import spacy
from typing import List, Dict
from dataclasses import dataclass, asdict
import hashlib

@dataclass
class TextChunk:
    """One chunk of page text plus the metadata needed to trace it back.

    Instances are built by ``OPEAChunkingService._create_chunk`` and are
    serialised with ``dataclasses.asdict`` when chunks are saved as JSON.
    """

    # --- identity -----------------------------------------------------
    chunk_id: str      # 12-hex-char digest of source file, page and index
    text: str          # sentences of the chunk joined by single spaces

    # --- copied verbatim from the page record -------------------------
    grade: int
    subject: str
    language: str

    # --- position within the document ---------------------------------
    chapter: str       # matched "Chapter N ..." text, or "Unknown"
    page_num: int
    section: str       # matched "N.N Title" text, or "General"
    chunk_index: int   # 0-based position of the chunk within its page

    # --- size and provenance ------------------------------------------
    token_count: int   # number of whitespace-separated words in ``text``
    metadata: Dict     # source_file, extraction_method, confidence

class OPEAChunkingService:
    """OPEA microservice that splits extracted page text into chunks.

    Each page is segmented into sentences with spaCy, and consecutive
    sentences are packed into chunks of at most ``chunk_size`` tokens,
    where a "token" is a whitespace-separated word (``str.split``), not a
    model token. A single sentence longer than ``chunk_size`` is kept whole.

    NOTE(review): the English pipeline ``en_core_web_sm`` is used for every
    input regardless of its ``language`` field; sentence boundaries for
    non-English text may be unreliable - confirm this is acceptable.
    """

    def __init__(self, chunk_size: int = 400, overlap: int = 50):
        """Configure the chunker and load the spaCy pipeline.

        Args:
            chunk_size: Soft upper bound on whitespace tokens per chunk.
            overlap: Approximate overlap between consecutive chunks. It is
                converted to a sentence count as ``overlap // 50``, so
                values below 50 mean "no overlap".
        """
        self.chunk_size = chunk_size  # tokens
        self.overlap = overlap
        self.nlp = spacy.load("en_core_web_sm")

    def chunk_extracted_data(self, extracted_file_path: str) -> List["TextChunk"]:
        """Chunk every page record stored in an extracted-data JSON file.

        Args:
            extracted_file_path: Path to a UTF-8 JSON file holding a list of
                page dicts (see ``_chunk_page`` / ``_create_chunk`` for the
                keys that are read).

        Returns:
            All chunks of all pages, in page order.
        """
        with open(extracted_file_path, 'r', encoding='utf-8') as f:
            extracted_data = json.load(f)

        all_chunks = []
        for page_data in extracted_data:
            all_chunks.extend(self._chunk_page(page_data))
        return all_chunks

    def _chunk_page(self, page_data: Dict) -> List["TextChunk"]:
        """Chunk a single page record.

        Args:
            page_data: Page dict; ``page_data['text']`` is the text to split.

        Returns:
            The page's chunks with ``chunk_index`` counting from 0. Empty
            when the page has no text or no sentence longer than 10
            characters.
        """
        text = page_data['text']
        if not text:
            # Nothing to segment (empty string or null in the JSON).
            return []

        doc = self.nlp(text)

        # Fragments of 10 characters or fewer are discarded.
        stripped = (sent.text.strip() for sent in doc.sents)
        sentences = [s for s in stripped if len(s) > 10]

        return [
            self._create_chunk(' '.join(group), page_data, chunk_index)
            for chunk_index, group in enumerate(self._group_sentences(sentences))
        ]

    def _group_sentences(self, sentences: List[str]) -> List[List[str]]:
        """Pack sentences into groups of at most ``chunk_size`` tokens.

        A group is closed when adding the next sentence would exceed
        ``chunk_size``; the new group then starts with the last
        ``overlap // 50`` sentences of the closed one, followed by that
        sentence.

        Args:
            sentences: Sentences in reading order.

        Returns:
            One list of sentences per chunk, in order.
        """
        # Approximate the token overlap as a number of trailing sentences.
        overlap_count = self.overlap // 50

        groups = []
        current = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = len(sentence.split())

            if current and current_tokens + sentence_tokens > self.chunk_size:
                groups.append(current)

                # A bare ``current[-overlap_count:]`` with a count of 0 is
                # ``current[0:]`` - the whole group - which would make every
                # later chunk repeat all earlier text. Guard it explicitly.
                carried = current[-overlap_count:] if overlap_count > 0 else []
                current = carried + [sentence]
                current_tokens = sum(len(s.split()) for s in current)
            else:
                current.append(sentence)
                current_tokens += sentence_tokens

        # Flush whatever is left after the last sentence.
        if current:
            groups.append(current)

        return groups

    def _create_chunk(self, text: str, page_data: Dict, chunk_index: int) -> "TextChunk":
        """Build a ``TextChunk`` for ``text`` using the page's metadata.

        Args:
            text: Chunk text.
            page_data: Page dict; must provide ``source_file``, ``page_num``,
                ``grade``, ``subject``, ``language`` and
                ``extraction_method``. ``confidence`` is optional and
                defaults to 1.0.
            chunk_index: Position of the chunk within its page.

        Returns:
            The populated ``TextChunk``.

        Raises:
            KeyError: If a required key is missing from ``page_data``.
        """
        # The ID is derived from (source file, page, index), so re-running
        # on the same input yields the same IDs. MD5 is used only as a
        # fingerprint and is truncated to 12 hex characters.
        chunk_id = hashlib.md5(
            f"{page_data['source_file']}_{page_data['page_num']}_{chunk_index}".encode()
        ).hexdigest()[:12]

        # Extract chapter/section (simple heuristic)
        chapter, section = self._extract_structure(text)

        return TextChunk(
            chunk_id=chunk_id,
            text=text,
            grade=page_data['grade'],
            subject=page_data['subject'],
            language=page_data['language'],
            chapter=chapter,
            page_num=page_data['page_num'],
            section=section,
            chunk_index=chunk_index,
            token_count=len(text.split()),
            metadata={
                'source_file': page_data['source_file'],
                'extraction_method': page_data['extraction_method'],
                'confidence': page_data.get('confidence', 1.0)
            }
        )

    def _extract_structure(self, text: str) -> tuple:
        """Guess the chapter and section headings mentioned in ``text``.

        Args:
            text: Chunk text to scan.

        Returns:
            ``(chapter, section)``. ``chapter`` is the first match of a
            pattern like "Chapter 1: Numbers" (case-insensitive) or
            ``"Unknown"``; ``section`` is the first match of a pattern like
            "1.2 Title" (title starting with an uppercase ASCII letter) or
            ``"General"``. Each match runs to the end of its line.
        """
        import re

        chapter_match = re.search(r'Chapter\s+(\d+)[:.\s]+([^\n]+)', text, re.IGNORECASE)
        chapter = chapter_match.group(0) if chapter_match else "Unknown"

        # Section detection (simplified)
        section_match = re.search(r'(\d+\.\d+)\s+([A-Z][^\n]+)', text)
        section = section_match.group(0) if section_match else "General"

        return chapter, section

    def save_chunks(self, chunks: List["TextChunk"], output_path: str):
        """Write ``chunks`` to ``output_path`` as a UTF-8 JSON array.

        Non-ASCII characters are written as-is (``ensure_ascii=False``).
        A one-line confirmation is printed afterwards.

        Args:
            chunks: Chunks to serialise.
            output_path: Destination file; overwritten if it exists.
        """
        chunks_dict = [asdict(chunk) for chunk in chunks]

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks_dict, f, ensure_ascii=False, indent=2)

        print(f"✓ Saved {len(chunks)} chunks to {output_path}")

# ============ CELL 3: Process Extracted Files ============
# Chunk each extracted-data file and write the result next to it, with the
# "_extracted.json" suffix replaced by "_chunks.json".
chunking_service = OPEAChunkingService(chunk_size=400, overlap=50)

extracted_files = [
    '/content/drive/MyDrive/ncert_processed/6_science_english_extracted.json',
    '/content/drive/MyDrive/ncert_processed/6_science_hindi_extracted.json',

    # Add more files
]

for extracted_file in extracted_files:
    print(f"Processing {extracted_file}...")

    chunks = chunking_service.chunk_extracted_data(extracted_file)

    # Save chunks
    output_filename = extracted_file.replace('_extracted.json', '_chunks.json')
    if output_filename == extracted_file:
        # The input does not follow the "*_extracted.json" naming, so the
        # replace was a no-op. Derive a distinct name instead of letting
        # save_chunks overwrite the input file.
        stem = extracted_file[:-len('.json')] if extracted_file.endswith('.json') else extracted_file
        output_filename = f"{stem}_chunks.json"
    chunking_service.save_chunks(chunks, output_filename)

print("\n✅ Chunking completed!")

# ============ CELL 4: Statistics ============
# Summarise the chunk file written last by the processing loop
# (`output_filename` holds only the final iteration's path).
# The file was written as UTF-8, so read it back with the same encoding
# rather than the platform default.
with open(output_filename, 'r', encoding='utf-8') as f:
    chunks_data = json.load(f)

print(f"Total chunks: {len(chunks_data)}")
if chunks_data:
    # Guarded: an empty file would otherwise divide by zero.
    print(f"Average chunk size: {sum(c['token_count'] for c in chunks_data) / len(chunks_data):.1f} tokens")
    print(f"\nSample chunk:")
    # Show chunk 10 when it exists, otherwise the last chunk available.
    # ensure_ascii=False keeps non-ASCII text (e.g. Devanagari) readable
    # instead of printing \uXXXX escapes.
    print(json.dumps(chunks_data[min(10, len(chunks_data) - 1)], indent=2, ensure_ascii=False))

Processing /content/drive/MyDrive/ncert_processed/6_science_english_extracted.json...
✓ Saved 162 chunks to /content/drive/MyDrive/ncert_processed/6_science_english_chunks.json
Processing /content/drive/MyDrive/ncert_processed/6_science_hindi_extracted.json...
✓ Saved 181 chunks to /content/drive/MyDrive/ncert_processed/6_science_hindi_chunks.json

✅ Chunking completed!
Total chunks: 181
Average chunk size: 259.7 tokens

Sample chunk:
{
  "chunk_id": "f4d3f4226a02",
  "text": "\u0935\u0926\u092f\u0930\u0925\u092f \u0915 \u0932\u090f \u0938\u0926\u0936 \u0907\u0938 \u092a\u0926\u092f\u092a\u0938\u0924\u0915 \u0915 \u0905\u0927\u092f\u092f\u0928 \u0915 \u092f\u0924\u0930 \u092e \u092a\u0939\u0932 26 5\u096a) \u0914\u0930 \u092c\u091d \u0915 \u091f\u092e \u0938\u0926\u0935 \u0906\u092a\u0915 \u0938\u0925 \u0930\u0939\u0917 \u0909\u0928\u0939 4 \u092a\u0930\u0936\u0928 \u092a\u091b\u0928 \u092c\u0939\u0924 \u0905\u0927\u0915 \u092a\u0938\u0926 \u0939 \u092c\u0939\u0924 \u092a\u0930\u0915\u