# Simple RAG

1. Data Ingestion
2. Chunking
3. Embedding
4. Semantic search
5. Response generation

In [131]:
import pymupdf
import ollama
from ollama import chat
import numpy as np
import re

In [91]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, passed to ``pymupdf.open``.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with pymupdf.open(pdf_path) as mypdf:
        # Build the result with one join instead of growing a string per page.
        return "".join(page.get_text("text") for page in mypdf)

In [93]:
def chunk_text(text, n, overlap):
    """Split text into fixed-size character chunks that overlap.

    Args:
        text: The string to split.
        n: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < n``.

    Returns:
        list[str]: Chunks in document order. Consecutive chunks start
        ``n - overlap`` characters apart; the last chunk may be shorter
        than ``n``. Empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``n`` is not positive or ``overlap`` is outside
            ``[0, n)``.
    """
    if n <= 0:
        raise ValueError(f"n must be positive, got {n}")
    # overlap == n would make the step zero, overlap > n would make it
    # negative (silently producing no chunks), and a negative overlap would
    # skip characters between chunks. Reject all three explicitly.
    if not 0 <= overlap < n:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}"
        )

    step = n - overlap
    return [text[start:start + n] for start in range(0, len(text), step)]

In [138]:
# NOTE(review): absolute, machine-specific path; this cell only runs on a
# machine where this exact file exists. Consider a configurable data directory.
pdf_path = "/Users/mahikshitk/Downloads/Mahikshit_Kurapati_resume (6).pdf"

# Read the whole PDF into a single string.
extracted_text = extract_text_from_pdf(pdf_path)

# Split the text into chunks; chunk_text receives n=1000 and overlap=200.
text_chunks = chunk_text(extracted_text, 1000, 200)

# Report how many chunks were produced.
print("Number of text chunks:", len(text_chunks))

# Show the first chunk as a sample of the chunking result.
print("\nFirst text chunk:")
print(text_chunks[0])


Number of text chunks: 7

First text chunk:
Mahikshit Kurapati
+1 5717740349 | mahikshitkurapati@gmail.com | linkedin.com/in/mahikshitkurapati | mkurapati.vercel.app | github.com/Archonz-crazy
PROFESSIONAL SUMMARY
Solution-oriented Machine Learning Engineer with a background in Artificial Intelligence with 2+ years of experience building
enterprise-level RAG models.
Specialties include: designing machine learning platforms, data pipeline optimization, improving
operational efficiency, solving ML problems end-to-end, ML/AI lifecycle management, applied machine learning solutions.
EXPERIENCE
AI Engineer | NForce One | Virginia, USA
October 2024 - present
• Collaborated with teams from Accenture to integrate legacy systems like Siebel and Prism with AI agents using Python,
LangChain, GPT-4o, and Azure ML Studio. Enhanced customer management workflows and cut costs by 60%.
• Engineered data pipelines using pgAdmin, retrieving and analyzing the data, ensuring 4x faster generation of busines

In [139]:
def create_embeddings(text, model="nomic-embed-text:latest"):
    """Request embeddings for ``text`` from an Ollama embedding model.

    Args:
        text: The input forwarded to ``ollama.embed`` as ``input``.
        model: Name of the embedding model to use.

    Returns:
        The object returned by ``ollama.embed``, unchanged.
    """
    return ollama.embed(model=model, input=text)

# Embed all chunks with a single call; later cells read `response.embeddings`.
response = create_embeddings(text_chunks)

In [141]:
def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
        vec1: First vector (any array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product of the inputs divided by the product of their
        norms, or ``0.0`` when either input has zero norm.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-norm input would give 0/0 = NaN (plus a RuntimeWarning), and NaN
    # scores make any later sort by similarity unreliable. Treat it as
    # "no similarity" instead.
    if denominator == 0:
        return 0.0

    return np.dot(a, b) / denominator

In [142]:
def semantic_search(query, text_chunks, embeddings, k=5):
    """Return the chunks whose embeddings are most similar to the query.

    Args:
        query: Query text; embedded via ``create_embeddings``.
        text_chunks: Chunk texts, index-aligned with ``embeddings``.
        embeddings: One embedding vector per entry of ``text_chunks``.
        k: Maximum number of chunks to return.

    Returns:
        list: Up to ``k`` entries of ``text_chunks``, ordered from most to
        least similar to the query.
    """
    # The embed response holds a batch of vectors even for a single input
    # string, so take the first (only) row. Passing the whole batch made
    # every score a 1-element array instead of a scalar.
    query_vector = create_embeddings(query).embeddings[0]

    similarity_scores = [
        (index, cosine_similarity(query_vector, chunk_embedding))
        for index, chunk_embedding in enumerate(embeddings)
    ]

    # Highest similarity first; the sort is stable, so ties keep chunk order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    return [text_chunks[index] for index, _ in similarity_scores[:k]]

In [143]:
# Question to answer from the document.
query = "What is Mahikshit's first job?"

# Retrieve the 2 chunks ranked most similar to the query.
top_chunks = semantic_search(query, text_chunks, response.embeddings, k=2)

print("Query:", query)

# Print each retrieved chunk, numbered from 1 and followed by a separator line.
for i, chunk in enumerate(top_chunks):
    print(f"Context {i+1}:\n{chunk}\n=====================")


Query: What is Mahikshit's first job?
Context 1:
Mahikshit Kurapati
+1 5717740349 | mahikshitkurapati@gmail.com | linkedin.com/in/mahikshitkurapati | mkurapati.vercel.app | github.com/Archonz-crazy
PROFESSIONAL SUMMARY
Solution-oriented Machine Learning Engineer with a background in Artificial Intelligence with 2+ years of experience building
enterprise-level RAG models.
Specialties include: designing machine learning platforms, data pipeline optimization, improving
operational efficiency, solving ML problems end-to-end, ML/AI lifecycle management, applied machine learning solutions.
EXPERIENCE
AI Engineer | NForce One | Virginia, USA
October 2024 - present
• Collaborated with teams from Accenture to integrate legacy systems like Siebel and Prism with AI agents using Python,
LangChain, GPT-4o, and Azure ML Studio. Enhanced customer management workflows and cut costs by 60%.
• Engineered data pipelines using pgAdmin, retrieving and analyzing the data, ensuring 4x faster generation of bu

In [144]:
# System prompt that restricts the assistant to the retrieved context. The
# fallback sentence is wrapped in matching single quotes so the model sees a
# clearly delimited string to repeat.
system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"

def generate_response(system_prompt, user_message, model="deepseek-r1:8b"):
    """Send a system prompt and a user message to a chat model.

    Args:
        system_prompt: Content of the ``system`` message.
        user_message: Content of the ``user`` message.
        model: Name of the chat model to query.

    Returns:
        The object returned by ``chat``, unchanged.
    """
    conversation = [
        {'role': 'system', 'content': system_prompt},
        {"role": "user", "content": user_message},
    ]
    return chat(model=model, messages=conversation)

# Format every retrieved chunk as a numbered "Context N" section.
user_prompt = "\n".join([f"Context {i + 1}:\n{chunk}\n=====================================\n" for i, chunk in enumerate(top_chunks)])
print(user_prompt)
# Append the question after the context sections.
user_prompt = f"{user_prompt}\nQuestion: {query}"

# Ask the chat model to answer from the assembled context.
ai_response = generate_response(system_prompt, user_prompt)


Context 1:
Mahikshit Kurapati
+1 5717740349 | mahikshitkurapati@gmail.com | linkedin.com/in/mahikshitkurapati | mkurapati.vercel.app | github.com/Archonz-crazy
PROFESSIONAL SUMMARY
Solution-oriented Machine Learning Engineer with a background in Artificial Intelligence with 2+ years of experience building
enterprise-level RAG models.
Specialties include: designing machine learning platforms, data pipeline optimization, improving
operational efficiency, solving ML problems end-to-end, ML/AI lifecycle management, applied machine learning solutions.
EXPERIENCE
AI Engineer | NForce One | Virginia, USA
October 2024 - present
• Collaborated with teams from Accenture to integrate legacy systems like Siebel and Prism with AI agents using Python,
LangChain, GPT-4o, and Azure ML Studio. Enhanced customer management workflows and cut costs by 60%.
• Engineered data pipelines using pgAdmin, retrieving and analyzing the data, ensuring 4x faster generation of business
reports and Tableau dashboards 

In [145]:
# Remove any <think>...</think> span from the reply before printing it;
# re.DOTALL lets the pattern match across newlines.
cleaned_text = re.sub(r'<think>.*?</think>', '', ai_response.message.content, flags=re.DOTALL)
print(cleaned_text)



Based on the provided context, Mahikshit's first job isn't explicitly mentioned. The information available only details his current role as an AI Engineer at NForce One starting from October 2024.

**Answer:** Mahikshit's first job is not explicitly mentioned in the given context.


In [146]:
# Define the system prompt for the evaluation system
evaluate_system_prompt = "You are an intelligent evaluation system tasked with assessing the AI assistant's responses. If the AI assistant's response is very close to the true response, assign a score of 1. If the response is incorrect or unsatisfactory in relation to the true response, assign a score of 0. If the response is partially aligned with the true response, assign a score of 0.5."

# Create the evaluation prompt by combining the user query, AI response, true response, and evaluation system prompt.
# The judge is given `cleaned_text` (the reply with its <think>...</think> span removed) so it grades the
# final answer only, not the model's intermediate reasoning.
evaluation_prompt = f"User Query: {query}\nAI Response:\n{cleaned_text}\nTrue Response: it is at Research Center Imarat\n{evaluate_system_prompt}"

# Generate the evaluation response using the evaluation system prompt and evaluation prompt
evaluation_response = generate_response(evaluate_system_prompt, evaluation_prompt)

# Print the evaluation response with the judge's own <think>...</think> span removed
cleaned_evaluation_text = re.sub(r'<think>.*?</think>', '', evaluation_response.message.content, flags=re.DOTALL)
print(cleaned_evaluation_text)



The AI assistant's response was "Mahikshit's first job" and the correct answer is that Mahikshit's first job is not explicitly mentioned in the context provided.

**Score: 0.5**

Explanation:
- The response correctly identifies that the first job isn't detailed, but it doesn't provide the true information as no specific job is mentioned.
- It partially aligns with the expectation of providing an answer, though incomplete.
