In [2]:
import fitz
import re
from pinecone_text.sparse import BM25Encoder
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, PodSpec, ServerlessSpec
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
from pinecone import Pinecone

# The API key is read from the environment only. A literal key must never be
# embedded in the notebook: notebooks get shared and committed, and any key
# that was ever saved here should be treated as compromised and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY environment variable is not set; "
        "export it before running this cell."
    )
PINECONE_INDEX_NAME = "rag-policy-serverless-e5"

# Destructive cleanup: drop the index if it already exists so that a later
# ingestion cell starts from an empty index.
pc = Pinecone(api_key=PINECONE_API_KEY)
if PINECONE_INDEX_NAME in [index.name for index in pc.list_indexes()]:
    print(f"Deleting existing index: {PINECONE_INDEX_NAME}")
    pc.delete_index(PINECONE_INDEX_NAME)
    print("Index deleted.")

In [10]:
import fitz
import re
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
import os
import time

# --- 1. CONFIGURATION ---
# Secrets are read from the environment only. A literal key must never be
# embedded in the notebook; any key that was ever saved here should be treated
# as compromised and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY environment variable is not set; "
        "export it before running this cell."
    )
# NOTE(review): the index name still says "e5" although the model below is Jina v4.
PINECONE_INDEX_NAME = "rag-policy-serverless-e5"
# The source PDF can be overridden with POLICY_PDF_PATH; the default is the
# original author's local path and will not exist on other machines.
PDF_PATH = os.environ.get(
    "POLICY_PDF_PATH",
    r"E:\hackathons\Bajaj Hackrx\2-stroma-rag\Arogya Sanjeevani Policy - CIN - U10200WB1906GOI001713 1.pdf",
)

# Local Hugging Face embedding model.
EMBEDDING_MODEL = "jinaai/jina-embeddings-v4"
# Must equal the length of the vectors produced by EMBEDDING_MODEL, because the
# Pinecone index is created with this dimension.
# NOTE(review): 2048 is presumably the Jina v4 dense size — confirm against the model card.
DIMENSION = 2048

# --- 2. ADVANCED CHUNKER ---
def clause_chunker(document_text, max_merged_chars=200):
    """Split document text into clause-level chunks at numbered/lettered headers.

    The text is split at every newline that is immediately followed by a
    clause marker such as ``1.``, ``1.2.``, ``(a)`` or ``(iv)``. Consecutive
    clauses are then merged into one chunk for as long as the merged text
    stays shorter than ``max_merged_chars``, so very short clauses are not
    emitted on their own.

    Args:
        document_text: Full document text with newlines preserved.
        max_merged_chars: Exclusive upper bound on the length of a chunk that
            is built by merging. A single clause longer than this is kept
            whole as its own chunk.

    Returns:
        list[str]: Stripped, non-empty chunks in document order. Clauses that
        were merged into the same chunk are separated by a blank line.
    """
    separator = "\n\n"
    pattern = r'\n(?=\d+\.\d+\.|\d+\.|\([a-z]\)|\([ivx]+\))'
    final_chunks = []
    buffer = ""
    for clause in re.split(pattern, document_text):
        if not clause.strip():
            continue
        if not buffer:
            buffer = clause
        elif len(buffer) + len(separator) + len(clause) < max_merged_chars:
            # Always insert the separator when merging; the split consumed the
            # newline, so plain concatenation would glue the previous clause's
            # last word onto the next clause's marker.
            buffer += separator + clause
        else:
            final_chunks.append(buffer.strip())
            buffer = clause
    if buffer:
        final_chunks.append(buffer.strip())
    return final_chunks

# --- 3. INGESTION SCRIPT ---
if __name__ == "__main__":
    # End-to-end ingestion: load the local embedding model, extract and chunk
    # the PDF text, recreate the Pinecone serverless index, then embed and
    # upsert every chunk in batches.

    # --- Initialize Local Embedding Model ---
    print(f"Initializing embedding model: {EMBEDDING_MODEL}...")
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        # NOTE: trust_remote_code lets the model repository run its own Python
        # code locally; only enable it for repositories you trust.
        model_kwargs={'trust_remote_code': True},
        # The original author notes that the Jina model requires a task here.
        encode_kwargs={'task': 'retrieval'}
    )

    print("Embedding model loaded.")

    # --- Load and Process PDF ---
    print("Loading and processing PDF...")
    # The context manager closes the document even if text extraction fails.
    with fitz.open(PDF_PATH) as doc:
        full_text = "\n".join(page.get_text() for page in doc)

    print("Chunking document by clauses...")
    chunks = clause_chunker(full_text)
    print(f"Created {len(chunks)} clause-based chunks.")

    # Stop before the destructive step below: with no chunks (e.g. a PDF with
    # no extractable text) the existing index would be deleted and replaced
    # by an empty one.
    if not chunks:
        raise RuntimeError(
            f"No text chunks were extracted from {PDF_PATH}; "
            f"refusing to recreate index {PINECONE_INDEX_NAME}."
        )

    # --- Create or Recreate Pinecone Serverless Index ---
    pc = Pinecone(api_key=PINECONE_API_KEY)
    if PINECONE_INDEX_NAME in [index.name for index in pc.list_indexes()]:
        print(f"Deleting existing index: {PINECONE_INDEX_NAME}")
        pc.delete_index(PINECONE_INDEX_NAME)

    print(f"Creating new SERVERLESS index: {PINECONE_INDEX_NAME}")
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=DIMENSION,  # Must match the embedding model's vector length
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Poll until the service reports the new index as ready.
    while not pc.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)
    index = pc.Index(PINECONE_INDEX_NAME)
    print("Serverless index is ready.")

    # --- Embed and Upsert Data ---
    print("Embedding and upserting data to Pinecone...")
    batch_size = 32
    for i in range(0, len(chunks), batch_size):
        i_end = min(i + batch_size, len(chunks))
        batch_chunks = chunks[i:i_end]

        # Dense embeddings from the local Hugging Face model, one per chunk.
        dense_embeds = embedding_model.embed_documents(batch_chunks)

        # IDs are the chunk's global position, so re-running overwrites the
        # same records instead of duplicating them.
        to_upsert = []
        for j, chunk in enumerate(batch_chunks):
            to_upsert.append({
                "id": f"chunk_{i+j}",
                "values": dense_embeds[j],
                "metadata": {"text": chunk}
            })

        index.upsert(vectors=to_upsert)

    print("Ingestion complete.")
    print(index.describe_index_stats())

Initializing embedding model: jinaai/jina-embeddings-v4...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.35it/s]
Fetching 2 files: 100%|██████████| 2/2 [00:00<?, ?it/s]


Embedding model loaded.
Loading and processing PDF...
Chunking document by clauses...
Created 155 clause-based chunks.
Deleting existing index: rag-policy-serverless-e5
Creating new SERVERLESS index: rag-policy-serverless-e5
Serverless index is ready.
Embedding and upserting data to Pinecone...


KeyboardInterrupt: 

Data ingestion using the Google Gemini embedding model

In [10]:
import fitz
import re
import google.generativeai as genai
from pinecone import Pinecone, ServerlessSpec
import os
import time

# --- 1. CONFIGURATION ---
# Secrets are read from the environment only. Literal keys must never be
# embedded in the notebook; any key that was ever saved here should be treated
# as compromised and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

PINECONE_INDEX_NAME = "rag-policy-serverless" # New index name for clarity
# The source PDF can be overridden with POLICY_PDF_PATH; the default is the
# original author's local path and will not exist on other machines.
PDF_PATH = os.environ.get(
    "POLICY_PDF_PATH",
    r"E:\Projects\hackrx_bajaj_finserv\ICIHLIP22012V012223.pdf",
)

# Gemini Embedding Model Configuration
EMBEDDING_MODEL = "models/text-embedding-004"
# Must equal the length of the vectors returned for EMBEDDING_MODEL, because
# the Pinecone index is created with this dimension.
DIMENSION = 768

# Configure the Gemini client
genai.configure(api_key=GOOGLE_API_KEY)

# --- 2. ADVANCED CHUNKER ---
def clause_chunker(document_text, max_merged_chars=200):
    """Split document text into clause-level chunks at numbered/lettered headers.

    The text is split at every newline that is immediately followed by a
    clause marker such as ``1.``, ``1.2.``, ``(a)`` or ``(iv)``. Consecutive
    clauses are then merged into one chunk for as long as the merged text
    stays shorter than ``max_merged_chars``, so very short clauses are not
    emitted on their own.

    Args:
        document_text: Full document text with newlines preserved.
        max_merged_chars: Exclusive upper bound on the length of a chunk that
            is built by merging. A single clause longer than this is kept
            whole as its own chunk.

    Returns:
        list[str]: Stripped, non-empty chunks in document order. Clauses that
        were merged into the same chunk are separated by a blank line.
    """
    joiner = "\n\n"
    pattern = r'\n(?=\d+\.\d+\.|\d+\.|\([a-z]\)|\([ivx]+\))'
    final_chunks = []
    pending = ""
    for clause in re.split(pattern, document_text):
        if not clause.strip():
            continue
        if not pending:
            pending = clause
        elif len(pending) + len(joiner) + len(clause) < max_merged_chars:
            # Always insert the joiner when merging; the split consumed the
            # newline, so plain concatenation would glue the previous clause's
            # last word onto the next clause's marker.
            pending += joiner + clause
        else:
            final_chunks.append(pending.strip())
            pending = clause
    if pending:
        final_chunks.append(pending.strip())
    return final_chunks

# --- 3. INGESTION SCRIPT ---
if __name__ == "__main__":
    # End-to-end ingestion with Gemini embeddings: extract and chunk the PDF
    # text, recreate the Pinecone serverless index, then embed and upsert
    # every chunk in batches.
    print("Loading and processing PDF...")
    # The context manager closes the document even if text extraction fails.
    with fitz.open(PDF_PATH) as doc:
        full_text = "\n".join(page.get_text() for page in doc)

    print("Chunking document by clauses...")
    chunks = clause_chunker(full_text)
    print(f"Created {len(chunks)} clause-based chunks.")

    # Stop before the destructive step below: with no chunks (e.g. a PDF with
    # no extractable text) the existing index would be deleted and replaced
    # by an empty one.
    if not chunks:
        raise RuntimeError(
            f"No text chunks were extracted from {PDF_PATH}; "
            f"refusing to recreate index {PINECONE_INDEX_NAME}."
        )

    # --- Create or Recreate Pinecone Serverless Index ---
    pc = Pinecone(api_key=PINECONE_API_KEY)
    if PINECONE_INDEX_NAME in [index.name for index in pc.list_indexes()]:
        print(f"Deleting existing index: {PINECONE_INDEX_NAME}")
        pc.delete_index(PINECONE_INDEX_NAME)

    print(f"Creating new SERVERLESS index: {PINECONE_INDEX_NAME}")
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=DIMENSION,
        metric="dotproduct", # Recommended for models with normalized embeddings
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Poll until the service reports the new index as ready.
    while not pc.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)
    index = pc.Index(PINECONE_INDEX_NAME)
    print("Serverless index is ready.")

    # --- Embed and Upsert Data ---
    print("Embedding and upserting data to Pinecone...")
    batch_size = 32
    for i in range(0, len(chunks), batch_size):
        i_end = min(i + batch_size, len(chunks))
        batch_chunks = chunks[i:i_end]

        # One request per batch; the code below indexes result['embedding']
        # by position, i.e. it expects one vector per input chunk, in order.
        result = genai.embed_content(
            model=EMBEDDING_MODEL,
            content=batch_chunks,
            task_type="RETRIEVAL_DOCUMENT"
        )
        dense_embeds = result['embedding']

        # IDs are the chunk's global position, so re-running overwrites the
        # same records instead of duplicating them.
        to_upsert = []
        for j, chunk in enumerate(batch_chunks):
            to_upsert.append({
                "id": f"chunk_{i+j}",
                "values": dense_embeds[j],
                "metadata": {"text": chunk}
            })

        index.upsert(vectors=to_upsert)

    print("Ingestion complete.")
    print(index.describe_index_stats())

Loading and processing PDF...
Chunking document by clauses...
Created 116 clause-based chunks.
Deleting existing index: rag-policy-serverless
Creating new SERVERLESS index: rag-policy-serverless
Serverless index is ready.
Embedding and upserting data to Pinecone...
Ingestion complete.
{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {'': {'vector_count': 116}},
 'total_vector_count': 116,
 'vector_type': 'dense'}
