<a href="https://colab.research.google.com/github/DineshThumma9/miniature-octo-memory/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download the PDF with `requests`, or reuse the local copy if it already exists.

In [None]:
import os

import numpy as np
import requests

# Local file name of the textbook PDF; the download is skipped when it exists.
pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print("[INFO] file doesnt exist , downloading")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    filename = pdf_path

    # Without a timeout requests waits indefinitely, so a stalled connection
    # would hang the cell; fail after 30 seconds of silence instead.
    response = requests.get(url, timeout=30)

    if response.status_code == 200:
        # Binary mode: the payload is a PDF, not text.
        with open(filename,"wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file .Status Code : {response.status_code}")
else:
    print("File Exists")


# Format the PDF text


In [None]:
%pip install pymupdf

In [None]:

import  pymupdf
from tqdm.auto import tqdm

def text_formatter(text:str) -> str:
    """Flatten *text* onto a single line.

    Every newline is replaced by one space, then leading and trailing
    whitespace is stripped from the result.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()



In [None]:

def open_and_read_pdf(pdf_path:str, page_offset:int = 41)->list[dict]:
    """Read every page of a PDF and collect its text with basic statistics.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Value subtracted from the zero-based page index to
            produce ``page_number``. Defaults to 41, the offset this
            notebook applies to its textbook.

    Returns:
        One dict per page with the keys ``page_number``, ``page_char_cnt``,
        ``page_word_cnt``, ``page_sentence_count_raw``, ``page_token_cnt``
        (character count divided by 4) and ``text``.
    """
    pages_and_text = []
    # The context manager closes the document once all pages have been read.
    with pymupdf.open(pdf_path) as docs:
        # enumerate() hides the length from tqdm, so pass the total explicitly
        # to get a real progress bar instead of a bare counter.
        for page_no , page in tqdm(enumerate(docs), total=len(docs)):
            text = text_formatter(text=page.get_text())
            pages_and_text.append({
                "page_number" : page_no - page_offset,
                "page_char_cnt" : len(text),
                "page_word_cnt" : len(text.split(" ")),
                "page_sentence_count_raw" : len(text.split(". ")),
                "page_token_cnt" : len(text)/4,
                "text" : text
            })

    return pages_and_text


pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

In [None]:
import  random

random.sample(pages_and_texts,k=3)

In [None]:
import  pandas as  pd

df = pd.DataFrame(pages_and_texts)
df.head()


In [None]:
df.describe().round(2)

In [None]:
from spacy.lang.en import English

# English pipeline object built directly from the language class.
nlp = English()

# Register the "sentencizer" component so that doc.sents can be iterated below.
nlp.add_pipe("sentencizer")

# Smoke test: the sample text is expected to split into exactly three sentences.
doc = nlp("This is a sentence. This another sentence. I like elephants")
assert len(list(doc.sents)) == 3
list(doc.sents)


In [None]:

# Split each page's text into sentences and store them as plain strings,
# together with the number of sentences found on the page.
for item in tqdm(pages_and_texts):
    page_sentences = [str(sentence) for sentence in nlp(item["text"]).sents]
    item["sentences"] = page_sentences
    item["page_sentence_cnt_spacy"] = len(page_sentences)

In [None]:
random.sample(pages_and_texts,k=2)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

In [None]:

# Default number of sentences grouped into one chunk.
num_sentence_chunck_size = 15


def split_list(
        input_list : list[str],
        slice_size : int = num_sentence_chunck_size

) -> list[list[str]]:
    """Split ``input_list`` into consecutive sub-lists of ``slice_size`` items.

    The last sub-list holds the remainder and may be shorter. An empty
    input yields an empty list.

    Args:
        input_list: Items to group, in order.
        slice_size: Maximum number of items per sub-list; must be at least 1.

    Returns:
        The sub-lists, in the original order.

    Raises:
        ValueError: If ``slice_size`` is smaller than 1. A negative step
            would otherwise produce an empty result and silently drop
            every item.
    """
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")

    return [
        input_list[start:start + slice_size]
        for start in range(0, len(input_list), slice_size)
    ]



test_list = list(range(25))
split_list(test_list)

In [None]:


# Group every page's sentences into chunks of at most
# num_sentence_chunck_size sentences and record how many chunks resulted.
for item in tqdm(pages_and_texts):
    page_chunks = split_list(item["sentences"], num_sentence_chunck_size)
    item["sentence_chucks"] = page_chunks
    item["num_chunks"] = len(page_chunks)


In [None]:
random.sample(pages_and_texts,k=1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

In [None]:
import re

# One record per sentence chunk: its page number, the joined text, and size stats.
pages_and_chuncks = []

for item in tqdm(pages_and_texts):
    for sentence_chunck in item["sentence_chucks"]:

        chunck_dic={}
        chunck_dic["page_no"]  = item["page_number"]

        # Sentences are concatenated without a separator, so the end of one
        # sentence can be glued to the start of the next ("end.Next").
        joined_sentence_chunck = "".join(sentence_chunck).replace("  "," ").strip()
        # Put the missing space AFTER the full stop: "end.Next" -> "end. Next".
        joined_sentence_chunck = re.sub(r'\.([A-Z])' ,r'. \1' , joined_sentence_chunck)


        chunck_dic["sentence_chuck"] = joined_sentence_chunck

        chunck_dic["chunck_char_count"] = len(joined_sentence_chunck)
        chunck_dic["chunck_word_count"]  = len(joined_sentence_chunck.split(" "))
        # Token estimate: character count divided by 4.
        chunck_dic["chunck_token_count"] = len(joined_sentence_chunck)/4

        pages_and_chuncks.append(chunck_dic)

len(pages_and_chuncks)


In [None]:
random.sample(pages_and_chuncks,k=3)

In [None]:
df = pd.DataFrame(pages_and_chuncks)
df.describe().round(2)

In [None]:
df.head()

In [None]:
min_token_length=30
# Print five randomly sampled chunks whose estimated token count is at or
# below the threshold.
short_chunks = df[df["chunck_token_count"] <= min_token_length]
for _, row in short_chunks.sample(5).iterrows():
    print(f'Chunck token count:{row["chunck_token_count"]} | Text : {row["sentence_chuck"]}')

In [None]:
pages_and_chuncks_over_min_token_len = df[df["chunck_token_count"] > min_token_length].to_dict(orient ="records")
pages_and_chuncks_over_min_token_len[:2]

In [None]:
random.sample(pages_and_chuncks_over_min_token_len,k=4)

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(

    model_name_or_path="all-mpnet-base-v2",
    device="cpu"
)

# Two sample inputs for a quick look at what the model produces.
sentence = [
 "How does it works",
    "Capture meaning in numeric reprsesenation"
]

# Encoding a list yields one embedding per input, in the same order.
embeddinngs = embedding_model.encode(sentence)
embedding_dict = dict(
    zip(sentence ,embeddinngs)
)

# Use dedicated loop names: reusing `sentence` / `embeddinngs` here would
# overwrite the input list and the embedding matrix that later cells index.
for text, vector in embedding_dict.items():
    print(f"Sentence  : {text}")
    print(f"Embeddings:{vector}")

In [None]:
embeddinngs[1].shape

In [None]:
embeddinng = embedding_model.encode("Hi")
embeddinng1 = embedding_model.encode("Hello")
print(embeddinng)
print(embeddinng1)

In [None]:
# %%time
#
#
# embedding_model.to("cpu")
#
# for item in tqdm(pages_and_chuncks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chuck"])

In [None]:
%%time

import torch

# Report the GPU setup before choosing where the model runs.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.version.cuda)  # CUDA version this PyTorch build targets
print(torch.backends.cudnn.version())  # cuDNN version
if cuda_available:
    # Querying device 0 raises when no GPU is present, so only ask when one exists.
    print(torch.cuda.get_device_name(0))  # GPU name
# Fall back to the CPU instead of crashing on machines without CUDA.
embedding_model.to("cuda" if cuda_available else "cpu")

# Embed the chunks one at a time and store each vector on its record.
for item in tqdm(pages_and_chuncks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chuck"])

In [None]:
%%time

text_chuncks = [item["sentence_chuck"] for item in pages_and_chuncks_over_min_token_len]
text_chuncks[419]


In [None]:
len(text_chuncks)


In [None]:
%%time

text_chuncks_embeddings = embedding_model.encode(
    text_chuncks,
    batch_size = 32,
    convert_to_tensor = True
)

text_chuncks_embeddings

In [None]:
# Build a table from the filtered chunk records (one row per record) and
# persist it as CSV without the index column.
text_chuncks_and_embeddings_df = pd.DataFrame(
    pages_and_chuncks_over_min_token_len
)
# NOTE(review): CSV stores the "embedding" column as text; the loading cell
# parses it back with np.fromstring.
embeddings_df_save_path = "text_chuncks_and_embeddings_df.csv"
text_chuncks_and_embeddings_df.to_csv(embeddings_df_save_path,index = False)


In [None]:
text_chuncks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chuncks_and_embeddings_df_load.head()

In [None]:
print(type(text_chuncks_and_embeddings_df["embedding"][0]))

In [None]:

import numpy as np


# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reload the chunk table that was written to CSV earlier.
text_chuncks_and_embeddings_df = pd.read_csv(embeddings_df_save_path)


# Show what type the "embedding" column has straight after loading.
print(type(text_chuncks_and_embeddings_df["embedding"][0]))
# Ensure the column has no NaN or invalid values


# Missing entries become the literal "[]" so that .strip() below always gets a string.
# NOTE(review): "[]" presumably parses to an empty array - TODO confirm.
text_chuncks_and_embeddings_df["embedding"] = text_chuncks_and_embeddings_df["embedding"].fillna("[]")

# Strip the surrounding brackets and parse the space-separated numbers into a NumPy array.
text_chuncks_and_embeddings_df["embedding"] = text_chuncks_and_embeddings_df["embedding"].apply(lambda x : np.fromstring(x.strip("[]"),sep = " "))




# Rebind pages_and_chuncks to the reloaded rows, one dict per row.
pages_and_chuncks = text_chuncks_and_embeddings_df.to_dict(orient = "records")



text_chuncks_and_embeddings_df


In [None]:


# Check the shapes of all embeddings
embedding_shapes = [embedding.shape for embedding in text_chuncks_and_embeddings_df["embedding"]]

# Find the maximum shape
max_length = max(shape[0] for shape in embedding_shapes)

# Pad or truncate embeddings to the maximum length
def pad_or_truncate(embedding, max_length):
    """Return *embedding* resized to exactly ``max_length`` entries.

    Shorter inputs are right-padded with zeros; longer inputs are cut off
    after the first ``max_length`` entries.
    """
    missing = max_length - len(embedding)
    if missing > 0:
        return np.pad(embedding, (0, missing), mode='constant')
    return embedding[:max_length]

text_chuncks_and_embeddings_df["embedding"] = text_chuncks_and_embeddings_df["embedding"].apply(
    lambda x: pad_or_truncate(x, max_length)
)

# Stack embeddings
embeddings = torch.tensor(np.stack(text_chuncks_and_embeddings_df["embedding"].to_list(), axis=0))



embeddings

In [None]:
print(type(embeddings))
print(type(embeddings[6]))


In [None]:
embeddings.shape

In [None]:
from sentence_transformers import  util
from time import perf_counter as timer

# Model instance placed on `device`; it is also the default model of the
# retrieval helper defined further down.
embeddings_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2" , device = device)


query = "Good Food for Proteins"
print(f"Query:{query}")
# Encode with the model that was just placed on `device`, so the query vector
# comes from the same model instance the rest of the retrieval code uses.
query_embeddings = embeddings_model.encode(query,convert_to_tensor = True)
# Ensure both tensors have the same dtype and live on the same device
query_embeddings = query_embeddings.to(device=device, dtype=torch.float32)
embeddings = embeddings.to(device=device, dtype=torch.float32)

# Time only the similarity computation itself.
start_time = timer()
dot_scores = util.dot_score(a=query_embeddings , b = embeddings)[0]
end_time = timer()


print(f"[INFO] Time Taken to get scores on {len(embeddings)} embeddings : {end_time-start_time:.5f} seconds.")


# Five highest-scoring chunks as a (values, indices) pair.
top_results_dot_product = torch.topk(dot_scores,k=5)
top_results_dot_product



In [None]:
print(type(tuple(top_results_dot_product)))

top_results_dot_product =  tuple(top_results_dot_product)

print(top_results_dot_product)

print(top_results_dot_product[1])


In [None]:
indexes = list(top_results_dot_product[1])

for idx in  indexes:
    print(pages_and_chuncks[idx]["sentence_chuck"])
    print()

In [None]:
import textwrap

def print_wrapped(text , wrap_lenth = 80):
    """Print *text* re-flowed to lines of at most ``wrap_lenth`` characters."""
    print(textwrap.fill(text, width=wrap_lenth))




In [None]:

# Take scores and indices from the same top-k result so they stay paired
# (element 0 holds the values, element 1 the indices).
scores = list(top_results_dot_product[0])
indexes = list(top_results_dot_product[1])

for score, idx in zip(scores, indexes):
    chunk = pages_and_chuncks[idx]
    print_wrapped(f"Page Score   : {score}")
    print_wrapped(f"Content      : {chunk['sentence_chuck']}")
    print_wrapped(f"Page Number  : {chunk['page_no']}")
    print()


In [None]:
# Render one page of the PDF to an image, save it, and display it with matplotlib.
import pymupdf

pdf_path = "human-nutrition-text.pdf"
doc = pymupdf.open(pdf_path)
# NOTE(review): the page index is hard-coded to 7 and is not derived from the
# query results, although the plot title speaks of the most relevant page - confirm.
page = doc.load_page(7)


# Rasterize the page at 300 dpi.
img = page.get_pixmap(dpi=300)


img.save("output_filename.png")

# View the pixmap's sample buffer as a uint8 array shaped by the pixmap's own
# h, w and n attributes.
img_array = np.frombuffer(img.samples_mv,
                          dtype = np.uint8).reshape(img.h,img.w,img.n)
import matplotlib.pyplot as plt

plt.figure(figsize=(13,10))
plt.imshow(img_array)
# NOTE(review): the title ends with "Most Relevent page: " and no page number follows.
plt.title(f"Query: {query} | Most Relevent page: ")
plt.axis("off")
plt.show()

In [None]:
def dot_product(vec1,vec2):
    """Return the dot product of ``vec1`` and ``vec2`` via ``torch.dot``."""
    result = torch.dot(vec1, vec2)
    return result


def cosine_sim(vec1,vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    Computed as their dot product divided by the product of their
    Euclidean norms.
    """
    numerator = torch.dot(vec1, vec2)
    # Euclidean norm: square root of the sum of squared entries.
    length1 = torch.sqrt(torch.sum(vec1 ** 2))
    length2 = torch.sqrt(torch.sum(vec2 ** 2))
    denominator = length1 * length2
    return numerator / denominator

vec1 = torch.tensor([1,2,3] , dtype=torch.float32)
vec2 = torch.tensor([1,2,3] , dtype=torch.float32)
vec3 = torch.tensor([1,2,3] , dtype=torch.float32)
vec4 = torch.tensor([1,2,3] , dtype=torch.float32)

In [None]:
def retrive_relevent_resources(
        query:str,
        embeddings:torch.Tensor,
        model:SentenceTransformer = embeddings_model,
        n_resources_to_return : int = 5,
        print_time:bool = True
) -> tuple[torch.Tensor, torch.Tensor]:
    """Score ``query`` against ``embeddings`` and return the best matches.

    Args:
        query: Text to search for.
        embeddings: One embedding per row, compared to the query by dot product.
        model: Model used to embed the query.
        n_resources_to_return: Maximum number of results to return.
        print_time: Whether to print how long the scoring took.

    Returns:
        ``(scores, indices)`` of the highest-scoring rows of ``embeddings``,
        as returned by ``torch.topk``.
    """
    query_embeddings = model.encode(query,convert_to_tensor=True)
    # Align device and dtype with the stored embeddings so the dot product
    # cannot fail on a mismatch between the two tensors.
    query_embeddings = query_embeddings.to(device=embeddings.device, dtype=embeddings.dtype)

    start_time = timer()
    dot_score = util.dot_score(query_embeddings,embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings:{end_time-start_time:.5f} seconds")

    # Rank the scores computed for THIS query (not a notebook-level variable),
    # and never ask topk for more results than there are rows.
    scores1,indices = torch.topk(
        input=dot_score,
        k=min(n_resources_to_return, len(dot_score))
    )

    return scores1,indices


def print_top_results_and_scores(
        query:str,
        embeddings:torch.Tensor,
        pages_and_chuncks:list[dict]=pages_and_chuncks,
        n_resources_to_return:int = 5,
):
    """Retrieve the best chunks for ``query`` and print them with their scores.

    Args:
        query: Text to search for.
        embeddings: Chunk embeddings; row ``i`` is taken to belong to
            ``pages_and_chuncks[i]``.
        pages_and_chuncks: Chunk records providing the ``sentence_chuck``
            and ``page_no`` fields that are printed.
        n_resources_to_return: Maximum number of results to print.
    """
    # Run the retrieval for the given query rather than printing a result
    # that was computed earlier at notebook level.
    scores, indexes = retrive_relevent_resources(
        query=query,
        embeddings=embeddings,
        n_resources_to_return=n_resources_to_return,
    )
    for score ,idx in  zip(scores,indexes):
        print_wrapped(f"Page Score : {score}")
        print_wrapped(pages_and_chuncks[idx]["sentence_chuck"])
        print_wrapped(f"Page Number : {pages_and_chuncks[idx]['page_no']}")
        print()




In [None]:
!pip install transformers accelerate
!pip install bitsandbytes


In [None]:
from huggingface_hub import login
login()


In [None]:
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig

# 4-bit quantization settings. The compute-dtype keyword is
# `bnb_4bit_compute_dtype`.
# NOTE(review): this config is built but not passed to from_pretrained below,
# so the model is loaded unquantized - confirm whether that is intended.
quantize_config  = BitsAndBytesConfig(load_in_4bit=True,
                                      bnb_4bit_compute_dtype=torch.float16)


# Use FlashAttention 2 only when it is installed and the first GPU's major
# compute capability is at least 8; get_device_capability must be CALLED and
# returns a (major, minor) tuple.
if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8:
    attn_impl = "flash_attention_2"
else:
    attn_impl = "sdpa"



model_id = "google/gemma-3-4b-it"
# Fetch the model repository up front, then load tokenizer and model by id.
snapshot_download(repo_id=model_id, repo_type="model")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage = False,
    attn_implementation = attn_impl
)


llm_model.to("cuda")


llm_model

In [None]:


# Get the compute capability of the first CUDA device
device_capability = torch.cuda.get_device_capability(0)[0]
print(device_capability)

In [None]:
# Named `input_text` so the builtin `input()` is not shadowed for the rest
# of the notebook session.
input_text = "What are macro nutrients and what roles do they play in the human body"
print(f"Input text : {input_text}")

# Single-turn conversation holding only the user message.
dialogue_template = [
    {"role" : "user",
     "content" : input_text}
]


# Render the conversation with the tokenizer's chat template as a string
# (tokenize=False) and append the generation prompt.
prompt = tokenizer.apply_chat_template(
    conversation = dialogue_template,
    tokenize=False,
    add_generation_prompt = True

)


print(f"\n Prompt Template (formatted) :\n {prompt}")

In [None]:
tokenized = tokenizer(prompt, return_tensors="pt")
print("Vocab size:", tokenizer.vocab_size)
print("Input IDs:", tokenized['input_ids'])
print("Max token ID in input:", tokenized['input_ids'].max())


In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
torch.cuda.synchronize()


# Tokenize the chat-formatted dialogue straight to a tensor of token IDs.
input_ids = tokenizer.apply_chat_template(
    conversation=dialogue_template,
    return_tensors="pt",
    add_generation_prompt=True
)

# Check the IDs against the FULL vocabulary: len(tokenizer) includes added
# tokens, whereas tokenizer.vocab_size reports only the base vocabulary, so
# a valid added token could otherwise trip this check.
assert input_ids.max() < len(tokenizer), "Token IDs exceed vocab size"

# Move to GPU
input_ids = input_ids.to("cuda")

print(f"Input IDs shape: {input_ids.shape}")
print(f"Max token ID: {input_ids.max()}")
print(f"Min token ID: {input_ids.min()}")
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Full tensor: {input_ids}")
