In [17]:
import os
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from langchain_community.llms import Ollama
from semantic_chunkers import StatisticalChunker


In [18]:
# Sentence-embedding model; later cells pass it to the retrieval helpers,
# which call its `encode` method on the query text.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')




In [19]:
# `embedding_model` is already constructed by the preceding cell; building the
# same SentenceTransformer a second time here only repeated the (slow) load.
MODEL = "mistral"  # name of the Ollama model used to generate answers
model = Ollama(model=MODEL)


In [20]:
# Windows-style relative paths used by the notebook.
# Every backslash is written as `\\`: sequences such as `\o`, `\m` and `\{`
# are invalid escapes that only work by accident and trigger a SyntaxWarning
# on recent Python versions. The resulting string values are unchanged.
CHUNK_OP_DIRECTORY_TXT = ".\\output\\chunks"
FILE_PATH = ".\\output\\text\\Metamorphosis by Franz Kafka.txt"


# Source file name without directory or extension; used to name derived artifacts.
TEXT_FILE_NAME = os.path.splitext(os.path.basename(FILE_PATH))[0]
EMBEDDINGS_PATH = f".\\model_embeddings\\{TEXT_FILE_NAME}_embeddings.pkl"

FAISS_INDEX = f".\\model_embeddings\\{TEXT_FILE_NAME}_faiss.index"
FAISS_INDEX


'.\\model_embeddings\\Metamorphosis by Franz Kafka_faiss.index'

In [33]:

def format_output(context, question):
    """
    Ask the LLM to answer `question` using only the supplied `context`.

    Args:
        context: Retrieved passage text the model is allowed to draw on.
        question: The user's query.

    Returns:
        The response produced by the module-level `model` for the assembled
        prompt.
    """
    # The f-string performs the one and only substitution. The finished prompt
    # must not be run through str.format() again: retrieved text is arbitrary,
    # and any literal "{" or "}" in it would be parsed as a replacement field
    # and raise KeyError / IndexError / ValueError.
    prompt_text = f"""
    >>> POINTS TO REMEMBER BEFORE GENERATING THE OUTPUT
        CONSIDER YOU ARE A CHATBOT WITH NO KNOWLEDGE.
        YOU WILL GAIN KNOWLEDGE ONLY WITH THE INFORMATION/CONTEXT I GIVE YOU.
        DON'T TRY TO ANSWER OUTSIDE OF THE INFORMATION I GIVE YOU.
        GENERATE THE OUTPUTS IN A STRUCTURED MANNER.
        IF THE ANSWER TO THE QUESTION IS OUT OF THE CONTEXT, THEN RETURN THAT "THE CONTEXT IS OUT OF THE KNOWLWDGE. NO RELEVANT INFORMATION FOUND"

    >>> INFORMATION/CONTEXT : {context}
    >>> QUERY : {question}

    
    

    
    """

    # `invoke` is the supported entry point for LangChain LLM objects; calling
    # the object directly is deprecated.
    return model.invoke(prompt_text)

def search_faiss(query, index, model, k=5):
    """
    Return the positions of the `k` stored vectors nearest to `query`.

    Args:
        query: Text to look up.
        index: Object exposing `search(vectors, k)` that returns a
            `(distances, indices)` pair, e.g. a FAISS index.
        model: Encoder exposing `encode(list_of_texts)`, e.g. a
            SentenceTransformer.
        k: Number of neighbours to request.

    Returns:
        The first row of the indices array returned by `index.search`.
        FAISS pads this row with -1 when the index holds fewer than `k`
        vectors, so callers should discard out-of-range entries.
    """
    # FAISS expects a C-contiguous float32 matrix; coerce explicitly rather
    # than relying on whatever dtype/layout the encoder happens to return.
    query_embedding = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
    _, neighbours = index.search(query_embedding, k)
    return neighbours[0]

def retrieve_and_format_results(query, index, text_chunks, model, k=5):
    """
    Retrieve the chunks nearest to `query` and have the LLM answer from them.

    Args:
        query: The user's question.
        index: Vector index passed through to `search_faiss`.
        text_chunks: Sequence of chunk strings, positionally aligned with the
            vectors stored in `index`.
        model: Embedding model passed through to `search_faiss`.
        k: Number of neighbours to retrieve (defaults to 5, the previous
            implicit value).

    Returns:
        The output of `format_output` for the joined chunks, or the string
        "No relevant information found." when no usable chunk was retrieved.
    """
    indices = search_faiss(query, index, model, k=k)

    # Drop positions that do not map to a chunk (FAISS uses -1 as padding).
    valid_indices = [int(i) for i in indices if 0 <= i < len(text_chunks)]

    # Check emptiness AFTER filtering: an all-invalid result must not reach the
    # LLM with an empty context.
    if not valid_indices:
        return "No relevant information found."

    results = " ".join(text_chunks[i] for i in valid_indices)
    # Debug dump of the retrieved context. `\n` replaces the invalid escape
    # `\{`, which printed a stray backslash in front of the text.
    print(f"\n---------------------------------------------------------------\n{results}---------------------------------------------")
    formatted_results = format_output(results, query)
    return formatted_results




In [34]:
def load_embeddings(path):
    """
    Read back the object pickled at `path`.

    Args:
        path: Location of the pickle file.

    Returns:
        Whatever object the file deserializes to.

    Note:
        Unpickling can execute arbitrary code; only load files you produced
        yourself or otherwise trust.
    """
    with open(path, mode="rb") as pickled:
        payload = pickle.load(pickled)
    return payload

In [35]:
# Load the pickled pair stored at EMBEDDINGS_PATH and unpack it into the chunk
# texts and their embeddings.
text_chunks, embeddings = load_embeddings(EMBEDDINGS_PATH)

def build_faiss_index(embeddings):
    """
    Build a flat L2 FAISS index over `embeddings`.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim). Lists and
            non-float32 arrays are accepted and converted.

    Returns:
        A `faiss.IndexFlatL2` containing every row of `embeddings`.

    Raises:
        ValueError: If `embeddings` is not two-dimensional.
    """
    # FAISS expects a C-contiguous float32 matrix; a list or float64 array
    # loaded from the pickle would otherwise fail on `.shape` or inside FAISS.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dim); got {vectors.ndim}-D input"
        )

    dim = vectors.shape[1]  # Dimension of embeddings
    index = faiss.IndexFlatL2(dim)  # L2 distance index
    index.add(vectors)  # Add embeddings to index
    return index
# Build the index over the loaded embeddings; the query cell below searches it
# as `faiss_index`.
faiss_index = build_faiss_index(embeddings)



In [38]:

# Example query: retrieve the nearest chunks from the index and print the
# LLM's answer built from them.
query = "WHO AUTHOR THIS BOOK" 
formatted_results = retrieve_and_format_results(query, faiss_index, text_chunks, embedding_model)
print("RAG :(\n")
print(formatted_results)



---------------------------------------------------------------\['metamorphosi franz kafka project gutenberg ebook metamorphosi franz kafka translat david wylli ebook use anyon anywher cost almost restrict whatsoev may copi give away term project gutenberg licens includ ebook onlin copyright project gutenberg ebook detail ae pleas follow copyright guidelin file titl metamorphosi author franz kafka translat david wylli releas date august 16 2005 ebook 5200 first post may 13 2002 last updat may 20 2012 languag english charact set encod start project gutenberg ebook metamorphosi copyright c 2002 david wylli metamorphosi franz kafka translat david wylli one morn gregor samsa woke troubl dream found transform bed horribl vermm lay back ifh lift head littl could see brown belli slightli dome divid arch stiff section bed hardli abl cover seem readi slide offani moment mani leg piti thin compar size rest wave helplessli look 1134'] ['metamorphosi franz kafka approach us offer donat intern don