# importing libraries

In [1]:
pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [2]:
from llama_index.core.node_parser import SentenceSplitter
import numpy as np
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import faiss
from llama_index.llms.cohere import Cohere
import cohere
from cohere import Client

In [3]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

nest_asyncio.apply()

load_dotenv()

True

In [4]:
CO_API_KEY = os.environ['CO_API_KEY'] or getpass("Enter your Cohere API key: ")

# Document handling

### doc loader

In [5]:
class doc_loader:
    def __init__(self, file_path):
        self.file_path = file_path

    def load_text(self):
        with open(self.file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
        return text

### doc splitter

In [6]:
class doc_splitter:
    def __init__(self, chunk_size=512, chunk_overlap= 16):
        self.splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, paragraph_separator="/n/n/n/n")

    def split(self, text):
        chunks = self.splitter.split_text(text)
        return chunks

# Embedding Handling

In [7]:
class embed_model_class:
    def __init__(self, embed_model_name):
        self.embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

    def get_embedding(self, chunks):
        embedding = [self.embed_model.get_text_embedding(chunk) for chunk in chunks]
        return np.array(embedding)

# FAISS Indexing

In [8]:
class Faiss_indexing:
    def __init__(self, dimension):
        self.index = faiss.IndexFlatL2(dimension)

    def indexing(self, embeddings):
        embeddings = np.array(embeddings, dtype=np.float32)

        if embeddings.shape[1] != self.index.d:
            raise ValueError(f"Embedding dimension mismatch: Expected {self.index.d}, got {embeddings.shape[1]}")
        self.index.add(embeddings)

# Creating Query Engine

In [9]:
api_key = CO_API_KEY
cohere_client = Client(api_key)

class Query_engine:
    def __init__(self, llm_model="command-r-plus", apikey=CO_API_KEY):
        # Initialize the Cohere client with the provided API key
        self.llm = cohere.Client(apikey)
        self.model = llm_model

    def query(self, search_content, query):
        """
        Takes the searched similar content (text chunk) and the original query,
        and processes them using the LLM to generate a coherent response.
        """
        context = "\n".join(search_content)  # Join search content into a single string for context

        try:
            # Query the LLM model with the provided context and query
            response = self.llm.generate(
                model=self.model,  # Use the provided model
                prompt=f"Query: {query}\nContext: {context}",
                max_tokens=150,
                temperature=0.7
            )
            
            # Return the LLM response text
            return response.generations[0].text  # Adjust this to match the response structure of the `cohere` client

        except Exception as e:
            # If an error occurs, print it and return a friendly message
            print(f"Error querying LLM: {e}")
            return "Sorry, there was an error processing your request."


# Vector Store Abstraction

In [10]:
class FaissVectorStore:
    def __init__(self, file_path, embed_model, chunk_size=512, chunk_overlap=16):
        # Instead of creating new instances of doc_loader, doc_splitter, embed_model_class inside the class,
        # we pass the instances to the constructor.
        self.document_loader = doc_loader(file_path)
        self.document_splitter = doc_splitter(chunk_size, chunk_overlap)
        self.embedding_model = embed_model
        self.index = None

    def build_index(self):
        # Use the passed instances directly
        text = self.document_loader.load_text()
        self.chunks = self.document_splitter.split(text)

        # Generate embeddings for the chunks using the passed embed_model instance
        embeddings = self.embedding_model.get_embedding(self.chunks)

        # Create a FAISS index with the appropriate dimension (based on embedding size)
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        
        embeddings = np.array(embeddings, dtype=np.float32)
        self.index.add(embeddings)

    def search(self, query, k=5):
        # Convert the query into an embedding using the passed embed_model instance
        query_embedding = self.embedding_model.get_embedding([query])

        # Perform a similarity search using the FAISS index
        distances, indices = self.index.search(query_embedding, k)
        return distances, indices


    def get_text_by_index(self, idx):
        """Retrieve the text chunk based on the index."""
        # Ensure the index is valid and return the corresponding chunk
        if idx < len(self.chunks):
            return self.chunks[idx]
        else:
            raise IndexError("Index out of range")

In [11]:
# Initialize the components outside of FaissVectorStore
file_path = "data/pg10763.txt"
embed_model = embed_model_class("BAAI/bge-small-en-v1.5")

# Pass the created instances to the FaissVectorStore constructor
vector_store = FaissVectorStore(file_path=file_path, embed_model=embed_model)

# Build the index (this will load, split, embed, and index the document)
vector_store.build_index()

# Example query for searching
query = "What do the Sikh Stoics believe?"

# Perform similarity search for the query
distances, indices = vector_store.search(query, k=5)

# Print the results
print(f"Distances: {distances}")
print(f"Indices: {indices}")


index_to_retrieve = 147
chunk_text = vector_store.get_text_by_index(index_to_retrieve)

# Print the chunk text
print(f"\n\nText for index {index_to_retrieve}: {chunk_text}")

Distances: [[0.8233491  0.85043573 0.8664963  0.8756935  0.87710845]]
Indices: [[147 144  64 152 151]]


Text for index 147: _John Keats._




PRAISE THE GENEROUS GODS FOR GIVING


Some of us find joy in toil, some in art, some in the open air and the
sunshine. All of us find it in simply being alive. Life is the gift no
creature in his right mind would part with. As Milton asks,

  "For who would lose,
  Though full of pain, this intellectual being,
  These thoughts that wander through eternity,
  To perish rather, swallowed up and lost
  In the wide womb of uncreated night,
  Devoid of sense and motion?"


  Praise the generous gods for giving
    In a world of wrath and strife,
  With a little time for living,
    Unto all the joy of life.

  At whatever source we drink it,
    Art or love or faith or wine,
  In whatever terms we think it,
    It is common and divine.

  Praise the high gods, for in giving
    This to man, and this alone,
  They have made his chance of living
    Sh

# responese

In [12]:
response_class = Query_engine()
response = response_class.query(chunk_text, query)
response

'Sikhs believe in one God and that the purpose of life is to learn to love God.'

# Running full Agian

In [13]:
# Initialize the components outside of FaissVectorStore
file_path = "data/pg10763.txt"
embed_model = embed_model_class("BAAI/bge-small-en-v1.5")

# Pass the created instances to the FaissVectorStore constructor
vector_store = FaissVectorStore(file_path=file_path, embed_model=embed_model)

# Build the index (this will load, split, embed, and index the document)
vector_store.build_index()

# Example query for searching
query = "What poems by Rudyard Kipling are in this book?"

# Perform similarity search for the query
distances, indices = vector_store.search(query, k=5)

indices_flatterend = indices.flatten() if len(indices.shape) >1 else indices

context = []

for idx in  indices_flatterend:
    try:
        chunk_text = vector_store.get_text_by_index(idx)
        context.append(chunk_text)
    except IndexError:
        print(f"Error: Index {idx} out of range.")

response_class = Query_engine()
response = response_class.query(context, query)
response

'"If" and "When Earth\'s Last Picture Is Painted"'