In [2]:
import pytesseract
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path, is_scanned=False):
    if is_scanned:
        text = pytesseract.image_to_string(pdf_path)
    else:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text


In [3]:
def chunk_text(text, max_length=512):
    words = text.split()
    for i in range(0, len(words), max_length):
        yield " ".join(words[i:i + max_length])


In [4]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2') 
def get_embeddings(text):
    return model.encode(text, convert_to_tensor=False)


  from tqdm.autonotebook import tqdm, trange





In [5]:
import faiss
import numpy as np

embedding_dim = 384 
index = faiss.IndexFlatL2(embedding_dim)

def add_to_index(text_chunks):
    embeddings = [get_embeddings(chunk) for chunk in text_chunks]
    index.add(np.array(embeddings))

def search(query, top_k=5):
    query_embedding = get_embeddings(query).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)
    return indices, distances 


In [6]:
class ChatMemory:
    def __init__(self):
        self.memory = []
    
    def add_interaction(self, user_query, system_response):
        self.memory.append({"query": user_query, "response": system_response})

    def get_context(self):
        return self.memory


In [7]:
def rerank_results(results, query):
    ranked = sorted(results, key=lambda x: semantic_similarity(query, x))
    return ranked

def semantic_similarity(query, text):
    query_embedding = get_embeddings(query)
    text_embedding = get_embeddings(text)
    return np.dot(query_embedding, text_embedding)  


In [8]:
def rag_pipeline(pdf_path, query, is_scanned=False):
    text = extract_text_from_pdf(pdf_path, is_scanned)

    chunks = list(chunk_text(text))

    add_to_index(chunks)

    results, distances = search(query)

    top_chunks = [chunks[i] for i in results[0]]
    reranked_results = rerank_results(top_chunks, query)
    
    return reranked_results


In [9]:
pdf_path = "Reboot_Leadership_and_the_Art_of.pdf" 
query = "What is the main topic discussed?"
result = rag_pipeline(pdf_path, query)
print(result)


['whose lives I touch upon who surely have dif ferent feelings, observations, and perceptions. Such dif ferent views are no less true or valid than my own. While the facts remembered by others may be different, our feelings are doubtlessly similar; maybe even more than true. The wise elders in my life encouraged me to honor such dif ferent feelings, such dif ferent truths, as I honor my own. I’ve tried to use my truth to advance an understanding of the world as I’ve come to see it. What’ s required of each of us, I believe, is to speak truth while releasing ourselves from the obligation to share the whole truth. This said, and for what it’ s worth, the bike was purple, and it had a great banana seat.Acknowledgments Many authors acknowledge the people in their lives who helped make their book a reality . When I read that a book would not have been possible were it not for the acknowledged folks, I’d always assumed such statements were merely polite hyperbole. Then I wrote my own book. N