# importing libraries

In [62]:
pip install faiss-cpu

In [3]:
from llama_index.core.node_parser import SentenceSplitter
import numpy as np
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import faiss

# Document handling

### doc loader

In [4]:
class doc_loader:
    def __init__(self, file_path):
        self.file_path = file_path

    def load_text(self):
        with open(self.file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
        return text

### doc splitter

In [5]:
class doc_splitter:
    def __init__(self, chunk_size=512, chunk_overlap= 16):
        self.splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, paragraph_separator="/n/n/n/n")

    def split(self, text):
        chunks = self.splitter.split_text(text)
        return chunks

# Embedding Handling

In [27]:
class embed_model_class:
    def __init__(self, embed_model_name):
        self.embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

    def get_embedding(self, chunks):
        embedding = [self.embed_model.get_text_embedding(chunk) for chunk in chunks]
        return np.array(embedding)

# FAISS Indexing

In [34]:
class Faiss_indexing:
    def __init__(self, dimension):
        self.index = faiss.IndexFlatL2(dimension)

    def indexing(self, embeddings):
        embeddings = np.array(embeddings, dtype=np.float32)

        if embeddings.shape[1] != self.index.d:
            raise ValueError(f"Embedding dimension mismatch: Expected {self.index.d}, got {embeddings.shape[1]}")
        self.index.add(embeddings)

# Vector Store Abstraction

In [65]:
class FaissVectorStore:
    def __init__(self, file_path, embed_model, chunk_size=512, chunk_overlap=16):
        # Instead of creating new instances of doc_loader, doc_splitter, embed_model_class inside the class,
        # we pass the instances to the constructor.
        self.document_loader = doc_loader(file_path)
        self.document_splitter = doc_splitter(chunk_size, chunk_overlap)
        self.embedding_model = embed_model
        self.index = None

    def build_index(self):
        # Use the passed instances directly
        text = self.document_loader.load_text()
        self.chunks = self.document_splitter.split(text)

        # Generate embeddings for the chunks using the passed embed_model instance
        embeddings = self.embedding_model.get_embedding(chunks)

        # Create a FAISS index with the appropriate dimension (based on embedding size)
        dimension = embeddings.shape[1]
        self.index = Faiss_indexing(dimension)
        
        # Add the embeddings to the FAISS index
        self.index.indexing(embeddings)

    def search(self, query, k=5):
        # Convert the query into an embedding using the passed embed_model instance
        query_embedding = self.embedding_model.get_embedding([query])

        # Perform a similarity search using the FAISS index
        distances, indices = self.index.index.search(query_embedding, k)
        return distances, indices


    def get_text_by_index(self, idx):
        """Retrieve the text chunk based on the index."""
        # Ensure the index is valid and return the corresponding chunk
        if idx < len(self.chunks):
            return self.chunks[idx]
        else:
            raise IndexError("Index out of range")

In [66]:
# Initialize the components outside of FaissVectorStore
file_path = "data/pg10763.txt"
embed_model = embed_model_class("BAAI/bge-small-en-v1.5")

# Pass the created instances to the FaissVectorStore constructor
vector_store = FaissVectorStore(file_path=file_path, embed_model=embed_model)

# Build the index (this will load, split, embed, and index the document)
vector_store.build_index()

# Example query for searching
query = "What do the Sikh Stoics believe?"

# Perform similarity search for the query
distances, indices = vector_store.search(query, k=5)

# Print the results
print(f"Distances: {distances}")
print(f"Indices: {indices}")


index_to_retrieve = 147
chunk_text = vector_store.get_text_by_index(index_to_retrieve)

# Print the chunk text
print(f"Text for index {index_to_retrieve}: {chunk_text}")

Distances: [[0.8233491  0.85043573 0.8664963  0.8756935  0.87710845]]
Indices: [[147 144  64 152 151]]
Text for index 147: _John Keats._




PRAISE THE GENEROUS GODS FOR GIVING


Some of us find joy in toil, some in art, some in the open air and the
sunshine. All of us find it in simply being alive. Life is the gift no
creature in his right mind would part with. As Milton asks,

  "For who would lose,
  Though full of pain, this intellectual being,
  These thoughts that wander through eternity,
  To perish rather, swallowed up and lost
  In the wide womb of uncreated night,
  Devoid of sense and motion?"


  Praise the generous gods for giving
    In a world of wrath and strife,
  With a little time for living,
    Unto all the joy of life.

  At whatever source we drink it,
    Art or love or faith or wine,
  In whatever terms we think it,
    It is common and divine.

  Praise the high gods, for in giving
    This to man, and this alone,
  They have made his chance of living
    Shin

# Practice

In [29]:
file_path = "data/pg10763.txt"
doc_reader = doc_loader(file_path)
text = doc_reader.load_text()  
print(text) 


﻿The Project Gutenberg eBook of It can be done
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: It can be done
        Poems of inspiration

Compiler: Joseph Morris
        St. Clair Adams

Release date: January 1, 2004 [eBook #10763]
                Most recently updated: December 20, 2020

Language: English

Credits: Produced by Juliet Sutherland, Anne Folland and PG Distributed Proofreaders


*** START OF THE PROJECT GUTENBERG EBOOK IT CAN BE DONE ***




Produced by Juliet Sutherland, Anne Folland and PG Distributed
Proofreaders





IT CAN BE DONE

POEMS OF INSPIRATION


COLLECTED 

In [30]:
doc_inst = doc_splitter()
chunks = doc_inst.split(text)


In [31]:
embed_model = embed_model_class("BAAI/bge-small-en-v1.5")

embeddings = embed_model.get_embedding(chunks=chunks)
embeddings

array([[-0.02588425,  0.01224917,  0.0348541 , ...,  0.04839606,
         0.12706842,  0.00730502],
       [-0.01626342,  0.01742641,  0.03092728, ..., -0.0106948 ,
         0.10914728,  0.00412368],
       [-0.03875948,  0.02392979,  0.04444739, ..., -0.0327999 ,
         0.02678536,  0.01160013],
       ...,
       [-0.06825387, -0.02277294,  0.01025582, ...,  0.00395815,
         0.04362784, -0.0071354 ],
       [-0.03219054,  0.01408754, -0.03777403, ..., -0.04092974,
         0.04049606, -0.02535254],
       [-0.05763495, -0.00284349, -0.02489776, ..., -0.04097775,
         0.08121408, -0.0188564 ]])

In [32]:
print(embeddings.shape)


(211, 384)


In [53]:
index = Faiss_indexing(384)
index.indexing(embeddings)


query_embedding = np.random.rand(1, 384).astype(np.float32)

# Now use the search function with the index
distances, indices = search(indexin.index, query_embedding)

# Print the results
print("Distances:", distances)
print("Indices:", indices)

Distances: [[127.045876 127.045876 127.239586 127.239586 127.265656]]
Indices: [[ 95 306 175 386   8]]
