In [4]:
import os
import google.generativeai as genai
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import Chroma
from langchain.document_loaders import TextLoader, DirectoryLoader
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI

In [5]:
# Load variables from a .env file (python-dotenv) so the os.getenv lookups
# in the configuration cell can find the API keys.
load_dotenv()

True

In [6]:
# Configure API keys.
# os.getenv returns None when a variable is unset; no validation happens here.
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')

# Directory that is scanned for the source .txt documents.
# The original value was a hard-coded absolute path on one machine; it is kept
# as the default, but can now be overridden by setting KNOWLEDGE_BASE in the
# environment (or in .env) so the notebook runs elsewhere without edits.
KNOWLEDGE_BASE = os.getenv(
    'KNOWLEDGE_BASE',
    "/home/olande/Desktop/Rag_Techniques/Contextual Chunk Headers/books",
)

# Configure constants: model identifiers used by the embedding and chat clients.
EMBEDDING_MODEL = 'text-embedding-004'

LLM_MODEL = 'meta-llama/llama-3.1-8b-instruct'

In [7]:
# Initialize Gemini for embeddings: hand the API key to the google.generativeai
# module (imported as genai) before any genai.embed_content call is made.
genai.configure(api_key=GEMINI_API_KEY)


In [8]:
# Chat model: ChatOpenAI client pointed at the OpenRouter endpoint instead of
# the default OpenAI base URL.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

llm_kwargs = dict(
    model=LLM_MODEL,
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base=OPENROUTER_BASE_URL,
    temperature=0.7,
)
llm = ChatOpenAI(**llm_kwargs)

In [9]:
# Collect every file matching **/*.txt under KNOWLEDGE_BASE, loading each one
# with TextLoader and showing a progress bar while doing so.
loader = DirectoryLoader(
    KNOWLEDGE_BASE,
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=True,
)
documents = loader.load()

100%|██████████| 2/2 [00:00<00:00, 834.19it/s]


In [20]:
from langchain.embeddings.base import Embeddings
from tqdm.notebook import tqdm

class GeminiEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to ``genai.embed_content``.

    Documents and queries are embedded with different ``task_type`` values
    ("RETRIEVAL_DOCUMENT" vs "RETRIEVAL_QUERY"); everything else about the
    request is shared.

    Args:
        api_key: Optional API key. When truthy it is passed to
            ``genai.configure``; when omitted, whatever configuration genai
            already has is left untouched.
        model: Embedding model identifier sent with every request. Defaults to
            the notebook-level ``EMBEDDING_MODEL`` constant, which previously
            was defined but ignored in favour of a duplicated string literal.
    """

    def __init__(self, api_key=None, model=EMBEDDING_MODEL):
        if api_key:
            genai.configure(api_key=api_key)
        self.model = model

    def _embed(self, text, task_type):
        """Embed a single string and return the 'embedding' entry of the response."""
        response = genai.embed_content(
            model=self.model,
            content=text,
            task_type=task_type,
        )
        return response['embedding']

    def embed_documents(self, texts):
        """Embed a list of documents using Gemini API.

        Issues one request per text, in order, with a progress bar.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one embedding per input text, in input order.
        """
        return [
            self._embed(text, "RETRIEVAL_DOCUMENT")
            for text in tqdm(texts, desc="Embedding documents")
        ]

    def embed_query(self, text):
        """Embed a query using Gemini API.

        Args:
            text: The query string.

        Returns:
            The embedding returned for the query.
        """
        return self._embed(text, "RETRIEVAL_QUERY")

# Embedding client used by the semantic splitter.
embeddings_model = GeminiEmbeddings(api_key=GEMINI_API_KEY)

# Semantic chunker driven by that client, using a percentile breakpoint
# threshold of 90.
text_splitter = SemanticChunker(
    embeddings=embeddings_model,
    breakpoint_threshold_type='percentile',
    breakpoint_threshold_amount=90,
)

# Chunk the loaded documents and report how many pieces came out.
print("Splitting documents...")
chunks = text_splitter.split_documents(documents)
print(f"Split documents into {len(chunks)} chunks")

Splitting documents...


Embedding documents:   0%|          | 0/830 [00:00<?, ?it/s]

Embedding documents:   0%|          | 0/38 [00:00<?, ?it/s]

Split documents into 89 chunks


In [26]:
# FAISS is used below but is not imported by any visible cell (the import cell
# only brings in Chroma), so this cell raised NameError on a fresh kernel.
from langchain_community.vectorstores import FAISS

# Embed every chunk and build a FAISS vector store from the results.
embeddings = GeminiEmbeddings(api_key=GEMINI_API_KEY)
vectorstore = FAISS.from_documents(chunks, embeddings)

# Retriever over the store, configured with k=5.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Save the vectorstore
vectorstore.save_local("faiss_index")

Embedding documents:   0%|          | 0/89 [00:00<?, ?it/s]

In [27]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

In [28]:
# Prompt text: instructs the model to answer from the supplied context only.
# {context} and {question} are filled in by the chain at invoke time.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)


In [29]:
def format_docs(docs):
    """Join retrieved documents into a single plain-text context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            (the retriever's results).

    Returns:
        The ``page_content`` of each document, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Previously the retriever was wired directly into {context}, so the prompt was
# filled with the string form of a list of Document objects rather than their
# text. Piping the retriever through format_docs gives the model clean context.
# The question is passed through unchanged.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [30]:
# Run the chain end to end on a sample question; the returned string is
# displayed as the cell output.
chain.invoke("Tell me about Thomas Jefferson")

'Thomas Jefferson was the author of the Declaration of Independence of the United States of America.'

# Enter RAGFusion