# Semantic Chunking
Unlike fixed length chunking which we saw in simple RAG, Semantic chunking divides the content based on similarity.

In [2]:
import pymupdf
import ollama
import numpy as np

In [16]:
#Extracting text from pdf files

def extract_text_from_pdf(pdf_path):
    my_pdf = pymupdf.open(pdf_path)
    all_text = ''

    for page in my_pdf:
        all_text += page.get_text("text") + " "

    return all_text.strip()

pdf_path = '/Users/mahikshitk/Documents/cover letters/Mahikshit_Kurapati_cover_letter_world_bank.pdf'
extracted_text = extract_text_from_pdf(pdf_path)

print(extracted_text[500:])

chnologies serve the varied needs of developing nations. 
 
At Intripid, I spearheaded the development of an AI travel product that not only captivated over 10,000 
users but also demonstrated my ability to align innovative technology with market demands. By designing 
a multi-agent Retrieval Augmented Generation model, I significantly enhanced the efficiency of itinerary 
generation by 60%. This experience taught me the importance of understanding user needs and 
collaborating with diverse teams, skills that I believe are crucial for driving successful AI initiatives at the 
World Bank. My role involved engaging with venture capital firms and academic institutions, allowing me 
to navigate complex stakeholder interests effectively while securing $150K in investment capital to propel 
our vision forward. This blend of technical expertise and strategic communication positions me well to 
support the World Bank's mission of leveraging AI for transformative development in emerging markets

In [38]:
def get_embedding(text, model="nomic-embed-text:latest"):

    response = ollama.embed(model = model, input=text)
    return np.array(response.embeddings[0])

sentences = extracted_text.split(". ")
embeddings = [get_embedding(sentence) for sentence in sentences]


In [39]:
def cosine_similarity(vec_a, vec_b):

    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

similarities = [cosine_similarity(embeddings[i], embeddings[i+1]) for i in range(len(embeddings)-1)]

In [40]:
def compute_breakpoints(similarities, method="percentile", threshold=90):
    if method == "percentile":
        threshold_value = np.percentile(similarities, threshold)

    elif method == "standard_deviation":
        mean = np.mean(similarities)
        std_dev = np.std(similarities)
        threshold_value = mean - (std_dev * threshold)

    elif method == "interquartile":
        q1, q3 = np.percentile(similarities, [25, 75])
        threshold_value = q1 - 1.5 * (q3-q1)

    else:
        raise ValueError("Invalid method. Choose 'percentile', 'standard_deviation', or 'interquartile'.")
    
    return [i for i, sim in enumerate(similarities) if sim < threshold_value]

breakpoints = compute_breakpoints(similarities, method="percentile", threshold=90)
print("Breakpoints:", breakpoints)

Breakpoints: [0, 2, 3, 4, 5, 6, 7, 8, 9]


In [41]:
from cgitb import text


def split_into_chunks(sentences, breakpoints):
    chunks = []
    start = 0

    for bp in breakpoints:
        chunks.append(". ".join(sentences[start:bp + 1]) + ".")
        start = bp + 1

    chunks.append(". ".join(sentences[start:]))
    return chunks

text_chunks = split_into_chunks(sentences, breakpoints)

print(f"Number of semantic chunks: {len(text_chunks)}")

print(text_chunks[0])

Number of semantic chunks: 10
Mahikshit Kurapati 
+15717740349 | mahikshitkurapati@gmail.com | linkedin.com/in/mahikshitkurapati 
 
 
 
Dear team at World Bank, 
 
As a Machine Learning Engineer with a proven track record of transforming complex AI concepts into 
actionable solutions, I thrive in navigating diverse stakeholder landscapes.


In [42]:
def create_embeddings(text_chunks):

    return [get_embedding(chunk) for chunk in text_chunks]

chunk_embeddings = create_embeddings(text_chunks)

In [43]:
def semantic_search(query, text_chunks, chunk_embeddings, k=5):

    query_embedding = get_embedding(query)
    similarities = [cosine_similarity(query_embedding, emb) for emb in chunk_embeddings]
    top_indices = np.argsort(similarities)[-k:][::-1]


    return [(text_chunks[i], similarities[i]) for i in top_indices]

In [45]:
query = "Which all places did Mahikshit work at?"

top_chunks = semantic_search(query, text_chunks, chunk_embeddings, k=2)

print(f"Query: {query}")
for i, chunk in enumerate(top_chunks):
    print(f"Context {i+1}:\n{chunk}\n{'='*40}")

Query: Which all places did Mahikshit work at?
Context 1:
('\n \n \nRegards, \nMahikshit Kurapati', np.float64(0.6828681500401532))
Context 2:
('Mahikshit Kurapati \n+15717740349 | mahikshitkurapati@gmail.com | linkedin.com/in/mahikshitkurapati \n \n \n \nDear team at World Bank, \n \nAs a Machine Learning Engineer with a proven track record of transforming complex AI concepts into \nactionable solutions, I thrive in navigating diverse stakeholder landscapes.', np.float64(0.6551582148822156))
