In [None]:
# ============ CELL 1: Setup ============
!pip install sentence-transformers chromadb torch

from google.colab import drive
drive.mount('/content/drive')

import torch
print(f"GPU available: {torch.cuda.is_available()}")


Collecting chromadb
  Downloading chromadb-1.2.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?

In [None]:

# ============ CELL 2: OPEA Embedding Microservice ============
import json
from sentence_transformers import SentenceTransformer
from typing import List, Dict
import numpy as np
from tqdm import tqdm

class OPEAEmbeddingService:
    """OPEA microservice for text embeddings.

    Wraps a ``SentenceTransformer`` model and offers two operations:
    attaching an embedding vector to each chunk dict, and dumping the
    resulting chunks to a JSON file.
    """

    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
        """Load the embedding model and record its embedding dimension.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
        """
        print(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        print(f"✓ Model loaded (dimension: {self.embedding_dim})")

    def generate_embeddings(self, chunks: List[Dict], batch_size: int = 32,
                            prefix: str = "") -> List[Dict]:
        """Generate embeddings for chunks and store them on each chunk.

        Args:
            chunks: Chunk dicts; each must have a ``'text'`` entry. The dicts
                are modified in place: an ``'embedding'`` key holding a plain
                list is added to every chunk.
            batch_size: Number of texts sent to the model per ``encode`` call.
                Must be at least 1.
            prefix: Optional string prepended to every text before encoding
                only (the stored ``'text'`` is left untouched). Some models
                expect one, e.g. the E5 model card asks for ``"passage: "``
                on documents and ``"query: "`` on queries; whichever prefix is
                used here must be matched at query time. The default ``""``
                encodes the texts exactly as given.

        Returns:
            The same ``chunks`` list that was passed in.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
            KeyError: If a chunk has no ``'text'`` entry.
        """
        # A negative step would make range() empty and silently return chunks
        # without any 'embedding' key, so reject bad sizes up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        texts = [chunk['text'] for chunk in chunks]
        if prefix:
            texts = [prefix + text for text in texts]

        # Generate embeddings in batches
        all_embeddings = []
        for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
            batch_embeddings = self.model.encode(
                texts[start:start + batch_size],
                show_progress_bar=False,
                convert_to_numpy=True
            )
            all_embeddings.extend(batch_embeddings)

        # Add embeddings to chunks (converted to lists so they are JSON-serializable)
        for chunk, embedding in zip(chunks, all_embeddings):
            chunk['embedding'] = embedding.tolist()

        return chunks

    def save_embedded_chunks(self, chunks: List[Dict], output_path: str):
        """Write chunks (including their embeddings) to a UTF-8 JSON file.

        Args:
            chunks: Chunk dicts to serialize.
            output_path: Destination file path; an existing file is overwritten.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, ensure_ascii=False, indent=2)

        print(f"✓ Saved {len(chunks)} embedded chunks to {output_path}")

# ============ CELL 3: Generate Embeddings ============
# Embeds every chunk file listed below and writes the result next to the input
# as "<name>_embedded.json".
embedding_service = OPEAEmbeddingService("intfloat/multilingual-e5-large")

chunk_files = [
    '/content/drive/MyDrive/ncert_processed/6_science_english_chunks.json',
    '/content/drive/MyDrive/ncert_processed/6_science_hindi_chunks.json',
    # Add more files
]

chunks_suffix = '_chunks.json'
embedded_suffix = '_embedded.json'

for chunk_file in chunk_files:
    print(f"\nProcessing {chunk_file}...")

    # Derive the output path before doing any work. A plain str.replace would
    # return the input path unchanged when the suffix is missing, and the save
    # step would then overwrite the source chunk file.
    if not chunk_file.endswith(chunks_suffix):
        raise ValueError(
            f"Expected a path ending in '{chunks_suffix}', got: {chunk_file}"
        )
    output_file = chunk_file[:-len(chunks_suffix)] + embedded_suffix

    # Load chunks; read as UTF-8 explicitly (the files are written as UTF-8 and
    # include non-ASCII text) instead of relying on the locale default.
    with open(chunk_file, 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    # Generate embeddings
    embedded_chunks = embedding_service.generate_embeddings(chunks, batch_size=32)

    # Save
    embedding_service.save_embedded_chunks(embedded_chunks, output_file)

print("\n✅ Embedding generation completed!")

Loading embedding model: intfloat/multilingual-e5-large


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

✓ Model loaded (dimension: 1024)

Processing /content/drive/MyDrive/ncert_processed/6_science_english_chunks.json...


Generating embeddings: 100%|██████████| 6/6 [00:14<00:00,  2.37s/it]


✓ Saved 162 embedded chunks to /content/drive/MyDrive/ncert_processed/6_science_english_embedded.json

Processing /content/drive/MyDrive/ncert_processed/6_science_hindi_chunks.json...


Generating embeddings: 100%|██████████| 6/6 [00:15<00:00,  2.62s/it]


✓ Saved 181 embedded chunks to /content/drive/MyDrive/ncert_processed/6_science_hindi_embedded.json

✅ Embedding generation completed!
