In [62]:
import os
from langchain_community.vectorstores import FAISS
import google.generativeai as genai
from streamlit import session_state as ss
import json


In [105]:
path = os.getcwd()

# Loading embedings
faiss_index = path + "/faiss_index"

# Loading all the data files 
data_source = path + "/data/data.txt"
pdf_source = path + "/data/resume.pdf"

google_api_key = os.getenv("GEMINI_API_KEY")

In [106]:
genai.configure(api_key=google_api_key)

In [110]:
import fitz
path = os.getcwd()
pdf_path = path + "/data/resume.pdf"

In [111]:
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

def open_read_pdf(pdf_path: str) -> list[dict]:
    
    pdf_document = fitz.open(pdf_path)
    pages_and_text = []
    for page_number, page in enumerate(pdf_document.pages()):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_text.append({"page_number": page_number,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_text

In [115]:
pages_and_text = open_read_pdf(pdf_path)
# Remove the last page as it is empty
pages_and_text.pop(-1)


{'page_number': 26,
 'page_char_count': 0,
 'page_word_count': 1,
 'page_sentence_count_raw': 1,
 'page_token_count': 0.0,
 'text': ''}

In [117]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,26.0,26.0,26.0,26.0,26.0
mean,12.5,2488.69,376.96,17.23,622.17
std,7.65,187.54,27.31,2.66,46.88
min,0.0,1879.0,295.0,12.0,469.75
25%,6.25,2439.25,365.0,16.0,609.81
50%,12.5,2527.5,377.5,17.0,631.88
75%,18.75,2579.75,394.0,18.75,644.94
max,25.0,2706.0,427.0,22.0,676.5


In [118]:
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

for item in pages_and_text:
    # Add sentences to each page
    item["sentences"] = list(nlp(item["text"]).sents)
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [121]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,26.0,26.0,26.0,26.0,26.0,26.0
mean,12.5,2488.69,376.96,17.23,622.17,18.65
std,7.65,187.54,27.31,2.66,46.88,3.75
min,0.0,1879.0,295.0,12.0,469.75,13.0
25%,6.25,2439.25,365.0,16.0,609.81,16.0
50%,12.5,2527.5,377.5,17.0,631.88,17.5
75%,18.75,2579.75,394.0,18.75,644.94,20.75
max,25.0,2706.0,427.0,22.0,676.5,28.0


In [122]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in pages_and_text:
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

In [123]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,26.0,26.0,26.0,26.0,26.0,26.0,26.0
mean,12.5,2488.69,376.96,17.23,622.17,18.65,2.27
std,7.65,187.54,27.31,2.66,46.88,3.75,0.45
min,0.0,1879.0,295.0,12.0,469.75,13.0,2.0
25%,6.25,2439.25,365.0,16.0,609.81,16.0,2.0
50%,12.5,2527.5,377.5,17.0,631.88,17.5,2.0
75%,18.75,2579.75,394.0,18.75,644.94,20.75,2.75
max,25.0,2706.0,427.0,22.0,676.5,28.0,3.0


In [126]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in pages_and_text:
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

59

In [127]:
import random
random.sample(pages_and_chunks, 5)

[{'page_number': 23,
  'sentence_chunk': "what project is Saurav most proud off?The project I am most proud of is the RAG pipeline I developed during my time as a Graduate Research Assistant. It involved integrating local LLMs with FAISS clustering to build a private AI service tailored to specific research needs. The complexity of the project, from deploying advanced AI techniques to managing vast datasets with PySpark and Apache Airflow, was incredibly challenging yet rewarding. It pushed my technical limits and gave me hands-on experience with cutting-edge technology. Currently, I am working on so many different projects that I could share with you soon!What is Saurav Mestry's passion?I am passionate about Software Development, Data Mining and Analytics, Machine Learning, and Conversational AI, but most of all, I am passionate about solving customers' problems using the technologies above. Tell me about Saurav's strengths | what about Saurav's strengths?My strengths lie in my analyt

In [128]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,59.0,59.0,59.0,59.0
mean,12.68,1095.46,165.42,273.86
std,7.91,442.51,65.43,110.63
min,0.0,51.0,9.0,12.75
25%,5.5,868.5,134.0,217.12
50%,13.0,1111.0,166.0,277.75
75%,20.0,1391.0,206.0,347.75
max,25.0,2041.0,311.0,510.25


In [131]:
df.head(1)

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,0,I belong to a humble middle class family. I wa...,931,166,232.75


In [51]:
result = genai.embed_content(
    model="models/text-embedding-004",
    content=content_data,
    task_type="retrieval_document",
    title="profile"
    )


In [132]:
for item in pages_and_chunks:
    item["embedding"] = genai.embed_content(
        model="models/text-embedding-004",
        content=item["sentence_chunk"],
        task_type="retrieval_document",
        title="profile"
    )


In [138]:
pages_and_chunks[3]

{'page_number': 1,
 'sentence_chunk': "the systems and infrastructure that support our daily lives, while simultaneously establishing a solid, respectable career to support my family. Everyone has an epiphany in life that becomes a pivotal moment. When I was pursuing my undergraduate degree in Electrical and Computer Engineering, I had one such experience. The world craves innovative methods that can transform data into knowledge-rich treasure troves, bridging the gap between data and information. Today, in the era of technology-enhanced consumerism, every aspect of our everyday activities is captured in digital packets, which are subsequently analyzed and used to develop a company's marketing approach. Due to the growing connectedness and availability of technology, the quantity and variety of data collected by enterprises today are astounding. The issue with amassing a large volume of data is employing an efficient technique to access that data, regardless of the number of queries or

In [139]:
# Now store 'page_number', 'sentence_chunk', 'chunk_word_count', 'chunk_token_count', 'embedding'['embedding'] into new dictionary

embeddings = []
for item in pages_and_chunks:
    embeddings.append({
        "page_number": item["page_number"],
        "sentence_chunk": item["sentence_chunk"],
        "chunk_word_count": item["chunk_word_count"],
        "chunk_token_count": item["chunk_token_count"],
        "embedding": item["embedding"]["embedding"]
    })



In [146]:
# Store pages_and_chunks into a json file
with open('data/embeddings.json', 'w') as f:
    json.dump(embeddings, f)
    

In [143]:
# Convert the embeddings to a DataFrame
df = pd.DataFrame(embeddings)
len(df)

59

In [153]:
# Save pages_and_chunks as a dictionary
with open('data/pages_and_chunks.json', 'w') as f:
    json.dump(pages_and_chunks, f)

In [147]:
# Save the DataFrame to a CSV file for later use
df.to_csv("data/embeddings.csv", index=False)

In [164]:
df.head(59)

Unnamed: 0,page_number,sentence_chunk,chunk_word_count,chunk_token_count,embedding
0,0,I belong to a humble middle class family. I wa...,166,232.75,"[-0.00085764396, -0.026951997, -0.102263525, -..."
1,0,I am open to relocation at my own expense. The...,230,368.25,"[-0.0039868755, -0.01167271, -0.087820984, -0...."
2,0,"In light of these considerations, I deemed it ...",16,24.5,"[0.017440695, -0.025678309, -0.07075134, -0.02..."
3,1,the systems and infrastructure that support ou...,202,332.5,"[0.025076663, -0.056620877, -0.045003872, 0.00..."
4,1,This experience made me understand that it is ...,193,312.5,"[-0.0050783064, -0.016273735, -0.060798004, 0...."
5,2,information systems from your highly regarded ...,208,335.75,"[0.029690873, -0.044833057, -0.07459464, -0.02..."
6,2,The project revolved around customer behavior ...,151,277.75,"[-0.027738323, -0.053028014, -0.04565362, -0.0..."
7,3,accordingly. I used Python’s NLTK and TextBlob...,179,320.5,"[-0.008610588, -0.04613524, -0.059771553, 0.02..."
8,3,Finding the desired signal in the dataset was ...,155,239.0,"[-0.005382348, -0.023848232, -0.059643026, -0...."
9,3,"Aside from that, I was actively involved in or...",23,35.75,"[0.010427879, -0.036940105, -0.009767175, -0.0..."


In [167]:
df['sentence_chunk'].iloc[58]

'how did Saurav created you? "I was built using Python, Streamlit library and many more frameworks. Want to know more?You can contact me at <a href = mailto: art.sauravm@arizona.edu>email</a> what commands do you support?I can answer questions about my education, experience, skills, and interests. I can also provide information about my current job and contact details. If you have any other questions, feel free to ask!what has Saurav asked you not to say about him because that could reflect badly of him?No one has asked me to withhold any information about anyone. I am here to provide you with accurate information about myself.'

In [168]:
# Check if all the embeddings column entries are of the same length
df['embedding'].apply(len).value_counts()

embedding
768    59
Name: count, dtype: int64

In [169]:
import faiss
import numpy as np
import pandas as pd

In [170]:
df['embedding'] = df['embedding'].apply(np.array)

In [174]:
text_embeddings = np.array(df['embedding'].to_list()).astype('float32')

# Step 2: Initialize FAISS Index (using IndexFlatL2)
embedding_dim = text_embeddings.shape[1] # Dimensionality of the embeddings
index = faiss.IndexFlatL2(embedding_dim) # L2 distance is the Euclidean distance

# Step 3: Add the embeddings to the index
index.add(text_embeddings)


In [202]:
# Lets test the index with a random query 
query = "sonography tests" 
query_embedding = genai.embed_content(
    model="models/text-embedding-004",
    content=query,
    task_type="retrieval_document",
    title="profile"
)["embedding"]

In [203]:
query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)

# Perform a search
k = 5 # Number of results to return
distances, indices = index.search(query_embedding, 1)

# Display the results
results = df.iloc[indices[0]]
results["distance"] = distances[0]
results


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results["distance"] = distances[0]


Unnamed: 0,page_number,sentence_chunk,chunk_word_count,chunk_token_count,embedding,distance
8,3,Finding the desired signal in the dataset was ...,155,239.0,"[-0.005382348, -0.023848232, -0.059643026, -0....",0.952613


In [204]:
results['sentence_chunk'].iloc[0]

"Finding the desired signal in the dataset was a monumental undertaking. I used FFT analysis and a band-pass filtering module to get the appropriate fetal movement signal. In a short time, I constructed a prototype circuit that could stimulate fetal movement with 73% accuracy. Based on the engineer's input, my prototype passed the sonography equipment validation criteria. Due to the short term of my internship, doing sonography tests on pregnant women would have been expensive and time-consuming. So I plunged an analog clock into jelly to resemble a newborn in the womb. Analog watch ticking can be considered to indicate a baby's heart rate; jelly can be amniotic fluid. This method enabled me to conduct tests with results that resembled radiography reports. At IIT Kanpur, I was actively involved in student activities and extracurriculars. As a member of the fine arts club, I painted the walls of our Olympic-sized swimming pool in my first year."

In [187]:
# SAVE THE FAISS INDEX
faiss.write_index(index, faiss_index)


In [216]:
# Load the FAISS index
index = faiss.read_index(faiss_index)

In [226]:
# Step 5: Custom Faiss-based Retriever with Metadata
def faiss_retriever(index, query_vector, df, k=1):
    query_vector = np.array(query_vector).astype('float32').reshape(1, -1)
    
    # Perform search
    distances, indices = index.search(query_vector, k)
    
    # Filter by score_threshold
    results = []
    for dist, idx in zip(distances[0], indices[0]):   
        result = {
            "sentence_chunk": df.iloc[idx]["sentence_chunk"],
            "page_number": df.iloc[idx]["page_number"],
            "distance": dist
        }
        results.append(result)

    return results


In [227]:
query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)  # Ensure query_embedding is correctly formatted
index = faiss.read_index(faiss_index)
retrieved = faiss_retriever(index, query_embedding, df)

for result in retrieved:
    print(f"Sentence: {result['sentence_chunk']}, Page: {result['page_number']}, Distance: {result['distance']}")

Sentence: Finding the desired signal in the dataset was a monumental undertaking. I used FFT analysis and a band-pass filtering module to get the appropriate fetal movement signal. In a short time, I constructed a prototype circuit that could stimulate fetal movement with 73% accuracy. Based on the engineer's input, my prototype passed the sonography equipment validation criteria. Due to the short term of my internship, doing sonography tests on pregnant women would have been expensive and time-consuming. So I plunged an analog clock into jelly to resemble a newborn in the womb. Analog watch ticking can be considered to indicate a baby's heart rate; jelly can be amniotic fluid. This method enabled me to conduct tests with results that resembled radiography reports. At IIT Kanpur, I was actively involved in student activities and extracurriculars. As a member of the fine arts club, I painted the walls of our Olympic-sized swimming pool in my first year., Page: 3, Distance: 0.95261335372

In [None]:
Sentence: accordingly. I used Python’s NLTK and TextBlob libraries for this task, which facilitated the extraction of meaningful insights from large text corpora. The project was completed 30% earlier than anticipated, thanks to my implementation of agile methodologies and effective sprint management. I coordinated with cross-functional teams, regularly reporting progress and adapting the project roadmap based on stakeholder feedback. This proactive approach not only accelerated the delivery but also improved the overall efficiency of business operations by 5%. This internship sharpened my skills in applying machine learning algorithms to real-world business problems and gave me a strong foundation in handling large datasets, performing predictive analytics, and generating data-driven insights. My ability to work across the entire machine learning pipeline—from data collection and cleaning to model building and deployment—aligns with the core competencies required in data engineering and software engineering roles. One of my other projects was to construct hardware to validate sonography-detected newborn movements. I surveyed pregnant mothers at different stages of pregnancy to get first-hand data on fetal movement. This gave us statistics and end-user input on our goods., Page: 3, Distance: 1.0296571254730225
Sentence: This forced the neural network to learn more robust, Page: 5, Distance: 1.052544355392456
Sentence: Another area I am working on is delegation. I have a tendency to take on more than I should because I feel responsible for ensuring things are done right. However, I am actively working on trusting others more and collaborating in a way that allows for shared ownership of tasks. Lastly, I can sometimes get lost in, Page: 23, Distance: 1.0927051305770874
Sentence: In light of these considerations, I deemed it prudent to study engineering to innovate and improve, Page: 0, Distance: 1.1449812650680542

In [209]:
retrieved

[]