In [2]:
import google.generativeai as genai
import os
import warnings
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.messages import HumanMessage
from datetime import datetime
from typing import List, Dict, Any, Optional
import uuid
from pinecone import Pinecone, ServerlessSpec
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity

# Suppress TensorFlow's native (C++) log output; "3" is the most restrictive level.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# NOTE(review): this is not a logging switch - it limits CUDA to device 0.
# Confirm it is intended; nothing else in this notebook selects a GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Silences every Python warning for the rest of the session, deprecation notices included.
warnings.filterwarnings("ignore")

# Load environment variables (the API keys read by the functions below via os.getenv)
load_dotenv()

def get_api_key(key_name="OPENROUTER_API_KEY"):
    """
    Look up an API key in the process environment.

    Args:
        key_name: Name of the environment variable that holds the key.

    Returns:
        The value stored in that environment variable.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    value = os.environ.get(key_name)
    if value:
        return value
    raise ValueError(f"Invalid API key: {key_name} not found in environment variables")

def initialize_llm(model_name="meta-llama/llama-3.1-8b-instruct",
                  temperature=0.4,
                  use_streaming=True):
    """
    Build a ChatOpenAI client that points at the OpenRouter endpoint.

    Args:
        model_name: Model identifier forwarded to ChatOpenAI.
        temperature: Sampling temperature forwarded to ChatOpenAI.
        use_streaming: Forwarded as ChatOpenAI's `streaming` flag.

    Returns:
        The configured ChatOpenAI instance. A StreamingStdOutCallbackHandler
        is always attached, regardless of `use_streaming`.

    Raises:
        ValueError: Propagated from get_api_key() when OPENROUTER_API_KEY is
            not available.
    """
    # Resolve the key first so a missing key fails before anything is built.
    client_settings = {"openai_api_key": get_api_key()}
    client_settings.update(
        model_name=model_name,
        temperature=temperature,
        streaming=use_streaming,
        callbacks=[StreamingStdOutCallbackHandler()],
        openai_api_base="https://openrouter.ai/api/v1",
    )
    return ChatOpenAI(**client_settings)

# Build the notebook-level LLM client with the default model and settings.
# This runs at cell execution time and raises ValueError if OPENROUTER_API_KEY is missing.
# The header-generation functions further down take an `llm` parameter.
llm = initialize_llm()

In [3]:
def initialize_google_embedding_model():
    """
    Configure the google.generativeai client and return the embedding model name.

    Reads GEMINI_API_KEY from the environment and passes it to
    genai.configure().

    Returns:
        The string "text-embedding-004" (only the name; no model object is
        created here).

    Raises:
        ValueError: If GEMINI_API_KEY is unset or empty.
    """
    gemini_key = os.environ.get("GEMINI_API_KEY")
    if gemini_key:
        genai.configure(api_key=gemini_key)
        # Name of the text-embedding-004 model
        return "text-embedding-004"
    raise ValueError("GEMINI_API_KEY not found in environment variables")

In [5]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Optional

def load_documents(data_dir: str, file_extension: str = ".txt") -> List:
    """
    Recursively load every file with the given extension under a directory.

    Args:
        data_dir: Root directory to scan (sub-directories are included).
        file_extension: Extension of the files to load, with or without the
            leading dot (".txt" and "txt" are equivalent).

    Returns:
        The list of documents produced by DirectoryLoader.load().
    """
    # Honour the caller's extension; previously the glob was hard-coded to *.txt.
    suffix = file_extension if file_extension.startswith(".") else f".{file_extension}"
    loader = DirectoryLoader(
        data_dir,
        glob=f"**/*{suffix}",
        loader_cls=TextLoader,
        show_progress=True,
    )
    documents = loader.load()
    # f-string so the count and directory are interpolated rather than printed literally.
    print(f"\nLoaded {len(documents)} documents from {data_dir}")
    return documents

In [6]:
def split_documents(documents: List, chunk_size: int = 2000, chunk_overlap: int = 200) -> List:
    """
    Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with len().
        chunk_overlap: Overlap between consecutive chunks, measured with len().

    Returns:
        The list of chunks returned by the splitter.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"\nSplit into {len(pieces)} chunks")
    return pieces

In [12]:
def generate_contextual_headers(chunk, llm, doc_metadata: Optional[dict] = None) -> str:
    """
    Ask the LLM for a short contextual header describing one chunk.

    Args:
        chunk: Object exposing `page_content` (str) and `metadata` (dict);
            only the first 500 characters of the content are sent to the LLM.
        llm: Chat model exposing `invoke(messages)` whose result has a
            `content` attribute.
        doc_metadata: Optional mapping; its "title" entry, when present, is
            used as the document title in the prompt.

    Returns:
        The LLM's response text with surrounding whitespace stripped.
    """
    # The default used to be `Optional[None]`, which evaluates to the (truthy)
    # NoneType class and crashed on `.get`; a real None default is handled here.
    doc_title = (doc_metadata or {}).get("title", "Unknown Document")
    source = chunk.metadata.get("source", "Unknown Source")

    # Create a prompt for the LLM to generate a header
    prompt = f"""
        You are an expert in document analysis and information synthesis. Your task is to create a concise, informative header for a text excerpt that will help readers quickly understand its significance and context.

        Document: "{doc_title}" 
        Source: {source}
        Text excerpt (first 500 characters): 
        {chunk.page_content[:500]}...

        Please create a brief header (1-2 sentences, maximum 20 words) that:
        1. Identifies the main topic or concept discussed in this excerpt
        2. Provides essential context about how this section relates to the broader document
        3. Includes any key terms, names, or technical concepts that are central to this passage
        4. Uses clear, precise language appropriate to the document's domain/field

        Your header should be immediately useful to someone scanning through document chunks, helping them quickly determine relevance and understand the excerpt's place in the overall document structure.

        CONTEXTUAL HEADER:
        """

    # Generate the header
    response = llm.invoke([HumanMessage(content=prompt)])
    return response.content.strip()

In [15]:
def create_chunks_with_headers(chunks, llm):
    """
    Return copies of the chunks with an LLM-generated context header prepended.

    Each returned chunk's `page_content` has the form
    "CONTEXT: <header>\\n\\nCONTENT: <original content>". The input chunks are
    left untouched, so the cell can be re-run without stacking headers and the
    original chunks remain usable as the non-contextual baseline.

    Args:
        chunks: Sequence of objects exposing `page_content` and `metadata`.
        llm: Chat model forwarded to generate_contextual_headers().

    Returns:
        A new list of shallow-copied chunks with enriched content (the
        metadata dict is shared with the original chunk).
    """
    import copy  # stdlib; imported locally so this cell does not rely on another cell

    chunks_with_headers = []
    total = len(chunks)
    for position, chunk in enumerate(chunks, start=1):
        # The helper defined in this notebook is `generate_contextual_headers` (plural).
        header = generate_contextual_headers(chunk, llm, doc_metadata=chunk.metadata)

        # Create a new document with the same metadata but enhanced content
        enriched_chunk = copy.copy(chunk)
        enriched_chunk.page_content = f"CONTEXT: {header}\n\nCONTENT: {chunk.page_content}"
        chunks_with_headers.append(enriched_chunk)

        # Print progress
        if position % 10 == 0:
            print(f"Processed {position}/{total} chunks")

    return chunks_with_headers

In [16]:
def get_google_embeddings(texts, embedding_model = "text-embedding-004"):
    """
    Embed each text with the Google Generative AI embedding API.

    One `genai.embed_content` request is issued per text, using
    task_type="retrieval_document".

    Args:
        texts: Iterable of strings to embed. A single string is treated as a
            one-element list rather than being iterated character by character.
        embedding_model: Model name. Bare names such as "text-embedding-004"
            are qualified to "models/text-embedding-004". A falsy value
            triggers initialize_google_embedding_model().

    Returns:
        A list with one embedding (the "embedding" entry of each API result)
        per input text, in input order.

    Note:
        With a non-empty `embedding_model` this function does not call
        genai.configure(); the client must already be configured.
    """
    if not embedding_model:
        embedding_model = initialize_google_embedding_model()

    # The google.generativeai client expects fully qualified model names
    # ("models/..." or "tunedModels/..."), so qualify bare names here.
    if isinstance(embedding_model, str) and not embedding_model.startswith(("models/", "tunedModels/")):
        embedding_model = f"models/{embedding_model}"

    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    for text in texts:
        # Get embedding for the text
        result = genai.embed_content(
            model=embedding_model,
            content=text,
            task_type="retrieval_document",  # Optimize for retrieval
            title="Document chunk"
        )
        embeddings.append(result["embedding"])

    return embeddings

In [17]:
def initialize_pinecone(index_name = "rag_comparison", dimension = 768, metric = "cosine"):
    """
    Connect to a Pinecone serverless index, creating it first if necessary.

    Args:
        index_name: Requested index name. It is lower-cased and underscores
            are replaced with hyphens, because Pinecone index names may only
            contain lowercase alphanumerics and "-" (the default
            "rag_comparison" therefore becomes "rag-comparison").
        dimension: Vector dimension used when the index has to be created.
        metric: Similarity metric used when the index has to be created.

    Returns:
        The index handle returned by `Pinecone.Index`.

    Raises:
        ValueError: If PINECONE_API_KEY is unset or empty.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise ValueError("PINECONE_API_KEY not found in environment variables")

    # Normalise to a name Pinecone accepts.
    index_name = index_name.strip().lower().replace("_", "-")

    # Initialize Pinecone Client
    pc = Pinecone(api_key=api_key)

    # Check if the index already exists, create it if it doesn't
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        print(f"Created new Pinecone index: {index_name}")

    # Connect to index
    index = pc.Index(index_name)
    print(f"Connected to Pinecone index: {index_name}")

    return index

In [19]:
def upsert_to_pinecone(chunks, index, namespace = "standard", embedding_model: Optional[str] = None, batch_size = 20):
    """
    Embed the given chunks and upsert them into a Pinecone index in batches.

    Args:
        chunks: Sequence of objects exposing `page_content` and `metadata`.
            (Previously this argument was ignored in favour of a global named
            `chunks_with_headers`.)
        index: Pinecone index handle exposing `upsert(vectors=..., namespace=...)`.
        namespace: Pinecone namespace to write to; also stored in each
            vector's metadata.
        embedding_model: Embedding model name. When omitted or falsy,
            initialize_google_embedding_model() supplies it.
        batch_size: Number of chunks embedded and upserted per request.

    Returns:
        None.

    Raises:
        ValueError: If the embedding call returns a different number of
            vectors than there are chunks in the batch.

    Note:
        Vector IDs are random UUIDs, so calling this twice on the same chunks
        stores them twice.
    """
    # The default used to be `Optional[None]` (the truthy NoneType class), so
    # this fallback never ran and the class object was sent as the model name.
    if not embedding_model:
        embedding_model = initialize_google_embedding_model()

    total_chunks = len(chunks)
    for start in range(0, total_chunks, batch_size):
        batch = chunks[start:start + batch_size]
        end_idx = start + len(batch)

        # Extract text content from batch and generate embeddings
        texts = [chunk.page_content for chunk in batch]
        embeddings = get_google_embeddings(texts, embedding_model)
        if len(embeddings) != len(batch):
            # zip() would silently drop the unmatched chunks otherwise.
            raise ValueError(
                f"Expected {len(batch)} embeddings, got {len(embeddings)}"
            )

        # Prepare vectors for Pinecone
        vectors = [
            {
                "id": str(uuid.uuid4()),  # unique ID for each vector
                "values": embedding,
                "metadata": {
                    "text": chunk.page_content,
                    "source": chunk.metadata.get("source", "Unknown"),
                    "chunk_id": start + offset,  # position within `chunks`
                    "namespace": namespace,
                },
            }
            for offset, (chunk, embedding) in enumerate(zip(batch, embeddings))
        ]

        # Upsert vectors to Pinecone
        index.upsert(vectors=vectors, namespace=namespace)

        print(f"Processed and upserted {end_idx}/{total_chunks} chunks")

    print(f"Successfully upserted {total_chunks} chunks to Pinecone (namespace: {namespace})")

In [None]:
def semantic_search(query, index, namespace = "standard", top_k = 5, embedding_model: Optional[str] = None):
    """
    Embed a query and return the closest matches from a Pinecone index.

    Args:
        query: Query text to embed (task_type="retrieval_query").
        index: Pinecone index handle exposing `query(...)`.
        namespace: Pinecone namespace to search.
        top_k: Number of matches to request.
        embedding_model: Embedding model name. When omitted or falsy,
            initialize_google_embedding_model() supplies it. Bare names are
            qualified with the "models/" prefix.

    Returns:
        Whatever `index.query` returns, requested with include_metadata=True.
    """
    # The default used to be `Optional[None]` (the truthy NoneType class), so
    # this fallback never ran and the class object was sent as the model name.
    if not embedding_model:
        embedding_model = initialize_google_embedding_model()

    # The google.generativeai client expects fully qualified model names.
    if isinstance(embedding_model, str) and not embedding_model.startswith(("models/", "tunedModels/")):
        embedding_model = f"models/{embedding_model}"

    # Generate embedding for the query
    query_embedding_result = genai.embed_content(
        model=embedding_model,
        content=query,
        task_type="retrieval_query"  # Optimize for query
    )
    query_embedding = query_embedding_result["embedding"]

    # Search Pinecone
    results = index.query(
        vector=query_embedding,
        namespace=namespace,
        top_k=top_k,
        include_metadata=True
    )

    return results
