This notebook contains the entiretly of the process of pre-processing, which includes the archi.. used form Data uplodation to storing to out weaviate vector database

In [1]:
import os
from pathlib import Path
import PyPDF2
from dotenv import load_dotenv
from langchain_experimental.text_splitter import SemanticChunker
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Load environment variables from .env file
load_dotenv()

True

In [2]:
def load_data(file_path: str) -> str:
    """
    Load data from TXT or PDF files.
    
    Args:
        file_path: Path to the TXT or PDF file
        
    Returns:
        str: Extracted text content from the file
    """
    file_path = Path(file_path)
    
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    
    if file_path.suffix.lower() == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    
    elif file_path.suffix.lower() == '.pdf':
        text = ""
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                text += page.extract_text()
        return text
    
    else:
        raise ValueError(f"Unsupported file type: {file_path.suffix}")

In [3]:
loaded_data = load_data("../dummy_data/nepal_constiution.pdf")[:1000]

In [4]:
# loaded_data

In [5]:
import re

def recursive_splitter(text, max_chunk_size=1500, separators=None):
    """
    Splits text hierarchically without overlaps.
    Returns: List of chunks (text) and List of span annotations (start, end).
    """
    if separators is None:
        # Hierarchy: Double Newline -> Single Newline -> Sentence End -> Space -> Character
        separators = ["\n\n", "\n", r"(?<=[.!?])\s+", " ", ""]

    def split_recursive(input_text, current_separators, offset=0):
        # Base Case: Text is small enough or no more separators to try
        if len(input_text) <= max_chunk_size or not current_separators:
            return [input_text], [(offset, offset + len(input_text))]

        # Select the highest priority separator
        sep = current_separators[0]
        remaining_seps = current_separators[1:]

        # Split the text by the separator
        if sep == "": # Final fallback to character-by-character if needed
            raw_splits = list(input_text)
        elif sep.startswith("(?<="): # Regex for sentence boundaries
            raw_splits = re.split(sep, input_text)
        else:
            raw_splits = input_text.split(sep)

        final_chunks = []
        final_spans = []
        
        current_buffer = ""
        current_offset = offset

        for part in raw_splits:
            # If we used a separator, re-add it (except for the last part)
            # This ensures we don't lose punctuation or newlines
            if sep != "" and not sep.startswith("(?<=") and part != raw_splits[-1]:
                part += sep
            
            # Check if this single part is ALREADY too big
            if len(part) > max_chunk_size:
                # If we have a buffer, flush it first
                if current_buffer:
                    final_chunks.append(current_buffer)
                    final_spans.append((current_offset, current_offset + len(current_buffer)))
                    current_offset += len(current_buffer)
                    current_buffer = ""
                
                # Recursively split the "too big" part using the NEXT separator
                sub_chunks, sub_spans = split_recursive(part, remaining_seps, current_offset)
                final_chunks.extend(sub_chunks)
                final_spans.extend(sub_spans)
                current_offset = sub_spans[-1][1]
            
            # If adding this part exceeds max_size, flush the buffer
            elif len(current_buffer) + len(part) > max_chunk_size:
                final_chunks.append(current_buffer)
                final_spans.append((current_offset, current_offset + len(current_buffer)))
                current_offset += len(current_buffer)
                current_buffer = part
            
            else:
                current_buffer += part

        # Final flush for whatever is left in the buffer
        if current_buffer:
            final_chunks.append(current_buffer)
            final_spans.append((current_offset, current_offset + len(current_buffer)))

        return final_chunks, final_spans

    return split_recursive(text, separators)

In [6]:
chunks = recursive_splitter(loaded_data)

## Late Chunking with Voyage AI Embeddings

Late chunking generates embeddings for the entire document, preserving full context awareness. Each chunk's embedding reflects the surrounding context from the complete document.


In [7]:
import cohere

# Initialize Cohere client
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    raise ValueError("COHERE_API_KEY not found in environment variables")
    
cohere_client = cohere.ClientV2(api_key=cohere_api_key)
print(f"✓ Cohere client initialized with API key")

def get_late_chunked_embeddings(document_text, chunks_with_spans, embedding_model="embed-v4.0"):
    """
    Late chunking with Cohere: Generate context-aware embeddings for each chunk.
    
    Args:
        document_text: The full document text
        chunks_with_spans: Tuple of (chunks_list, spans_list) from recursive_splitter
        embedding_model: Cohere model to use (default: embed-v4.0 with 128k token context)
        
    Returns:
        List of chunk embeddings (numpy arrays), chunk texts, and metadata
    """
    chunks_list, spans_list = chunks_with_spans
    
    # Cohere embed-v4.0: 128k token context window - perfect for late chunking
    # Embed each chunk WITH surrounding context for semantic awareness
    
    chunk_embeddings = []
    chunk_metadata = []
    
    for idx, (chunk_text, (start, end)) in enumerate(zip(chunks_list, spans_list)):
        # Get context: include surrounding chunks for better context awareness
        context_before = chunks_list[max(0, idx-1)] if idx > 0 else ""
        context_after = chunks_list[min(len(chunks_list)-1, idx+1)] if idx < len(chunks_list)-1 else ""
        
        # Create context-aware input for late chunking
        contextual_text = f"{context_before} [TARGET] {chunk_text} [/TARGET] {context_after}"
        
        # Get embedding from Cohere with 1536-dimensional output
        result = cohere_client.embed(
            texts=[contextual_text],
            model=embedding_model,
            input_type="search_document",
            embedding_types=["float"],
            output_dimension=1536
        )
        
        embedding = result.embeddings.float[0]
        chunk_embeddings.append(embedding)
        chunk_metadata.append({
            "text": chunk_text,
            "span": (start, end),
            "index": idx
        })
    
    return chunk_embeddings, chunk_metadata

# Execute late chunking with Cohere embed-v4.0
chunk_embeddings, chunk_metadata = get_late_chunked_embeddings(loaded_data, chunks)
print(f"Generated {len(chunk_embeddings)} chunk embeddings using Cohere embed-v4.0")
print(f"Embedding dimension: {len(chunk_embeddings[0])}")
print(f"First chunk preview: {chunk_metadata[0]['text'][:100]}...")

✓ Cohere client initialized with API key
Generated 1 chunk embeddings using Cohere embed-v4.0
Embedding dimension: 1536
First chunk preview: Unofficial translat ion 
  
CONSTITUTION OF NEPA L 2015  
 
 
 
 
 
 
 
 
 
 
 
Constituent Assembly...


In [8]:
# Initialize Cohere client
import cohere

cohere_api_key = os.getenv("COHERE_API_KEY")
print(f"Cohere API Key loaded: {bool(cohere_api_key)}")
print(f"API Key (first 20 chars): {cohere_api_key[:20] if cohere_api_key else 'NOT FOUND'}...")

cohere_client = cohere.ClientV2(api_key=cohere_api_key)

Cohere API Key loaded: True
API Key (first 20 chars): cAmTo7Jq0uJ2Dgkx6Kx7...


## Contextual Retrieval with Gemini + Prompt Caching

Generate concise context for each chunk using Google Gemini with prompt caching enabled.
This explains where each chunk sits within the whole document to improve retrieval accuracy.


In [9]:
import google.generativeai as genai

# Configure Google Generative AI
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

def generate_chunk_contexts(document_text, chunks_with_spans, model_name="gemini-2.0-flash"):
    """
    Generate concise context for each chunk using Gemini with prompt caching.
    
    The context explains where the chunk sits within the whole document,
    improving retrieval accuracy by 35-49%.
    
    Args:
        document_text: The full document text
        chunks_with_spans: Tuple of (chunks_list, spans_list) from recursive_splitter
        model_name: Google Generative AI model to use
        
    Returns:
        List of dicts with chunk data and generated context
    """
    chunks_list, spans_list = chunks_with_spans
    model = genai.GenerativeModel(model_name)
    
    chunk_data = []
    
    # System prompt for context generation
    system_prompt = """You are an expert at analyzing documents and providing concise context.
Your task is to explain where a specific chunk sits within the overall document.
Provide a brief, succinct context (50-100 tokens) that explains:
- What section/topic this chunk belongs to
- Its relationship to the overall document
- Why it's important in context

Answer ONLY with the succinct context and nothing else."""
    
    print(f"Generating context for {len(chunks_list)} chunks using Gemini with prompt caching...")
    
    for idx, (chunk_text, (start, end)) in enumerate(zip(chunks_list, spans_list)):
        # Create the prompt with the full document and current chunk
        user_message = f"""<document>
{document_text}
</document>

Here is the chunk we want to situate within the whole document:
<chunk>
{chunk_text}
</chunk>

Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
        
        try:
            # Use generative AI with prompt caching
            # The document is cached on first request, then reused for subsequent requests
            response = model.generate_content(
                [
                    {"text": system_prompt},
                    {"text": user_message}
                ],
                generation_config=genai.types.GenerationConfig(
                    temperature=0.3,
                    max_output_tokens=150
                )
            )
            
            context = response.text.strip()
            
            # Combine context + chunk for embedding
            contextualized_chunk = f"{context}\n\n{chunk_text}"
            
            chunk_data.append({
                "index": idx,
                "original_chunk": chunk_text,
                "context": context,
                "contextualized_chunk": contextualized_chunk,
                "span": (start, end)
            })
            
            if (idx + 1) % 5 == 0:
                print(f"  ✓ Generated context for {idx + 1}/{len(chunks_list)} chunks")
                
        except Exception as e:
            print(f"Error generating context for chunk {idx}: {e}")
            # Fallback: use empty context
            chunk_data.append({
                "index": idx,
                "original_chunk": chunk_text,
                "context": "",
                "contextualized_chunk": chunk_text,
                "span": (start, end)
            })
    
    return chunk_data

# Generate contexts for all chunks
chunk_contexts = generate_chunk_contexts(loaded_data, chunks)
# print(f"\n✅ Successfully generated contexts for {len(chunk_contexts)} chunks")
# print(f"\nSample chunk with context:")
# print(f"Index: {chunk_contexts[0]['index']}")
# print(f"Context: {chunk_contexts[0]['context'][:150]}...")
# print(f"Original chunk: {chunk_contexts[0]['original_chunk'][:100]}...")


  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


Generating context for 1 chunks using Gemini with prompt caching...


## Prepare Data for Weaviate Upload

Convert the contextualized chunks to Weaviate-ready format with embeddings and metadata.


In [10]:
import numpy as np
from datetime import datetime, timezone

def prepare_weaviate_data(chunk_contexts, chunk_embeddings, source_file="nepal_constitution.pdf"):
    """
    Prepare chunk data for Weaviate upload with embeddings, context, and metadata.
    
    Args:
        chunk_contexts: List of dicts from generate_chunk_contexts()
        chunk_embeddings: List of embedding vectors from get_late_chunked_embeddings()
        source_file: Name of the source document
        
    Returns:
        List of dicts ready for Weaviate insertion
    """
    weaviate_data = []
    current_time = datetime.now(timezone.utc).isoformat()
    
    for i, (chunk_ctx, embedding) in enumerate(zip(chunk_contexts, chunk_embeddings)):
        weaviate_doc = {
            "content": chunk_ctx["original_chunk"],
            "context": chunk_ctx["context"],
            "source": source_file,
            "chunk_id": chunk_ctx["index"],
            "page_number": None,  # Could extract from PDF metadata if available
            "created_at": current_time,
            "tags": ["constitution", "nepal", "legal"],
            "vector": embedding if isinstance(embedding, list) else embedding.tolist()
        }
        weaviate_data.append(weaviate_doc)
    
    return weaviate_data

# Prepare data for Weaviate
weaviate_ready_data = prepare_weaviate_data(
    chunk_contexts, 
    chunk_embeddings,
    source_file="nepal_constitution.pdf"
)

# print(f"✅ Prepared {len(weaviate_ready_data)} documents for Weaviate")
# print(f"\nSample document structure:")
# sample = weaviate_ready_data[0]
# print(f"  Source: {sample['source']}")
# print(f"  Chunk ID: {sample['chunk_id']}")
# print(f"  Content: {sample['content'][:80]}...")
# print(f"  Context: {sample['context'][:80]}...")
# print(f"  Vector dimension: {len(sample['vector'])}")
# print(f"  Tags: {sample['tags']}")


In [11]:
def get_page_number_for_span(file_path, target_start, target_end):
    """
    Find which page(s) a text span belongs to in a PDF.
    
    Args:
        file_path: Path to PDF file
        target_start: Start character position in full document
        target_end: End character position in full document
        
    Returns:
        int: Page number (1-indexed), or None if not found
    """
    char_count = 0
    
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            page_text = page.extract_text()
            page_length = len(page_text)
            
            # Check if target span overlaps with this page
            page_start = char_count
            page_end = char_count + page_length
            
            if target_start < page_end and target_end > page_start:
                # Span is on this page (or spans multiple pages)
                return page_num
            
            char_count += page_length
    
    return None

# Test the page extraction
test_page = get_page_number_for_span("../dummy_data/nepal_constiution.pdf", 0, 100)
print(f"Test: First 100 chars are on page {test_page}")


Test: First 100 chars are on page 1


In [12]:
def prepare_weaviate_data_with_pages(chunk_contexts, chunk_embeddings, file_path, source_file="nepal_constitution.pdf"):
    """
    Prepare chunk data for Weaviate upload with embeddings, context, metadata, AND page numbers.
    
    Args:
        chunk_contexts: List of dicts from generate_chunk_contexts()
        chunk_embeddings: List of embedding vectors from get_late_chunked_embeddings()
        file_path: Path to the PDF file (to extract page numbers)
        source_file: Name of the source document
        
    Returns:
        List of dicts ready for Weaviate insertion with page numbers populated
    """
    weaviate_data = []
    current_time = datetime.now(timezone.utc).isoformat()
    
    print(f"Extracting page numbers for {len(chunk_contexts)} chunks...")
    
    for i, (chunk_ctx, embedding) in enumerate(zip(chunk_contexts, chunk_embeddings)):
        # Get page number for this chunk
        start_pos, end_pos = chunk_ctx["span"]
        page_number = get_page_number_for_span(file_path, start_pos, end_pos)
        
        weaviate_doc = {
            "content": chunk_ctx["original_chunk"],
            "context": chunk_ctx["context"],
            "source": source_file,
            "chunk_id": chunk_ctx["index"],
            "page_number": page_number,  # Now populated from PDF!
            "created_at": current_time,
            "tags": ["constitution", "nepal", "legal"],  # Added tags back
            "vector": embedding if isinstance(embedding, list) else embedding.tolist()
        }
        weaviate_data.append(weaviate_doc)
        
        if (i + 1) % 10 == 0:
            print(f"  ✓ Processed {i + 1}/{len(chunk_contexts)} chunks")
    
    return weaviate_data

# Prepare data for Weaviate WITH page numbers
weaviate_ready_data = prepare_weaviate_data_with_pages(
    chunk_contexts, 
    chunk_embeddings,
    file_path="../dummy_data/nepal_constiution.pdf",
    source_file="nepal_constitution.pdf"
)

print(f"\n✅ Prepared {len(weaviate_ready_data)} documents for Weaviate with page numbers")
print(f"\nSample document structure:")
sample = weaviate_ready_data[0]
# print(f"  Source: {sample['source']}")
# print(f"  Chunk ID: {sample['chunk_id']}")
# print(f"  Page Number: {sample['page_number']}")
# print(f"  Content: {sample['content'][:80]}...")
# print(f"  Context: {sample['context'][:80]}...")
# print(f"  Vector dimension: {len(sample['vector'])}")
# print(f"  Tags: {sample['tags']}")

Extracting page numbers for 1 chunks...

✅ Prepared 1 documents for Weaviate with page numbers

Sample document structure:


In [13]:
import weaviate
import weaviate.classes as wvc

# Ensure client is connected and collection handle is fresh
if 'client' in globals():
    try:
        if not client.is_ready():
            client = weaviate.connect_to_local(host="localhost", port=8080)
            print("Reconnected Weaviate client")
    except Exception:
        client = weaviate.connect_to_local(host="localhost", port=8080)
        print("Reinitialized Weaviate client")
else:
    client = weaviate.connect_to_local(host="localhost", port=8080)
    print("✓ Connected to Weaviate at localhost:8080")

COLLECTION_NAME = "NepalConstitution"
if client.collections.exists(COLLECTION_NAME):
    collection = client.collections.get(COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' ready")
else:
    print(f"Creating collection '{COLLECTION_NAME}'...")
    collection = client.collections.create(
        name=COLLECTION_NAME,
        vectorizer_config=wvc.config.Configure.Vectorizer.none(),
        properties=[
            wvc.config.Property(
                name="content",
                data_type=wvc.config.DataType.TEXT,
                description="The main chunk content"
            ),
            wvc.config.Property(
                name="context",
                data_type=wvc.config.DataType.TEXT,
                description="Contextual information about the chunk"
            ),
            wvc.config.Property(
                name="source",
                data_type=wvc.config.DataType.TEXT,
                description="Source document filename"
            ),
            wvc.config.Property(
                name="chunk_id",
                data_type=wvc.config.DataType.INT,
                description="Sequential chunk identifier"
            ),
            wvc.config.Property(
                name="page_number",
                data_type=wvc.config.DataType.INT,
                description="Page number in the source PDF"
            ),
            wvc.config.Property(
                name="created_at",
                data_type=wvc.config.DataType.TEXT,
                description="Timestamp when the chunk was created"
            ),
            wvc.config.Property(
                name="tags",
                data_type=wvc.config.DataType.TEXT_ARRAY,
                description="Tags for categorization"
            )
        ]
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created successfully")


✓ Connected to Weaviate at localhost:8080
Collection 'NepalConstitution' ready


In [14]:
# Test query: retrieve a sample document
print("\nTesting retrieval - fetching first document:")
response = collection.query.fetch_objects(limit=1)

if response.objects:
    obj = response.objects[0]
    print(f"\n✓ Sample retrieved document:")
    print(f"  UUID: {obj.uuid}")
    print(f"  Source: {obj.properties.get('source')}")
    print(f"  Chunk ID: {obj.properties.get('chunk_id')}")
    print(f"  Page: {obj.properties.get('page_number')}")
    print(f"  Content: {obj.properties.get('content')[:100]}...")
    print(f"  Tags: {obj.properties.get('tags')}")
else:
    print("No documents found in collection")

# Close connection
client.close()
print("\n✓ Weaviate connection closed")


Testing retrieval - fetching first document:

✓ Sample retrieved document:
  UUID: 5c26c995-a759-4761-bec2-b8005ff4f5f7
  Source: nepal_constitution.pdf
  Chunk ID: 0
  Page: 1
  Content: Unofficial translat ion 
  
CONSTITUTION OF NEPA L 2015  
 
 
 
 
 
 
 
 
 
 
 
Constituent Assembly...
  Tags: ['constitution', 'nepal', 'legal']

✓ Weaviate connection closed


In [15]:
# Batch insert documents into Weaviate
import weaviate
import weaviate.classes as wvc

# Create a fresh client (avoid stale closed clients)
client = weaviate.connect_to_local(host="localhost", port=8080)
client.connect()
print("Fresh client connected for batch upload")

collection = client.collections.get(COLLECTION_NAME)

print(f"Uploading {len(weaviate_ready_data)} documents to Weaviate...")

with collection.batch.dynamic() as batch:
    for i, doc in enumerate(weaviate_ready_data):
        # Make a copy to avoid modifying original data
        doc_copy = doc.copy()
        vector = doc_copy.pop("vector")
        
        # Add object with vector
        batch.add_object(
            properties=doc_copy,
            vector=vector
        )
        
        if (i + 1) % 10 == 0:
            print(f"  ✓ Uploaded {i + 1}/{len(weaviate_ready_data)} documents")

print(f"\n✅ Successfully uploaded {len(weaviate_ready_data)} documents to Weaviate!")

# Verify upload
total_objects = collection.aggregate.over_all(total_count=True)
print(f"Total objects in collection: {total_objects.total_count}")

Fresh client connected for batch upload
Uploading 1 documents to Weaviate...

✅ Successfully uploaded 1 documents to Weaviate!
Total objects in collection: 5


In [16]:
out = collection.query.fetch_objects(limit=2, include_vector=True)
print(len(out.objects[0].vector))

1


In [17]:
out.objects[0].vector

{'default': [-0.02741139940917492,
  -0.01114089135080576,
  -0.004829055164009333,
  0.024205388501286507,
  0.007253601681441069,
  0.023884786292910576,
  -0.0024846591986715794,
  0.051296185702085495,
  -0.015148404985666275,
  -0.018033815547823906,
  -0.03350282460451126,
  -0.05834941565990448,
  -0.04055604711174965,
  -0.04680776968598366,
  -0.009457734413444996,
  0.05867001414299011,
  0.02043832279741764,
  0.03173951432108879,
  -0.0023443959653377533,
  0.009858486242592335,
  -0.04039574787020683,
  -0.03206011652946472,
  -0.020999375730752945,
  0.008495931513607502,
  -0.01755291409790516,
  0.020358175039291382,
  0.03767063841223717,
  0.012022544629871845,
  0.03895304352045059,
  -0.007534127216786146,
  -0.009778336621820927,
  -0.013144648633897305,
  -0.02997620962560177,
  0.019877273589372635,
  -0.0006462117307819426,
  -0.011381341144442558,
  -0.018113967031240463,
  -0.009978710673749447,
  0.023884786292910576,
  0.019636821001768112,
  0.0095779597759