<a href="https://colab.research.google.com/github/01PrathamS/AI_receptionist/blob/main/rag_notebook/simple_local_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

In [None]:
import os
import requests

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):

    print("[INFO] file doesn't exist, downloading...")

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    filename = pdf_path

    response = requests.get(url)

    if response.status_code == 200:
      with open(filename, "wb") as file:
        file.write(response.content)
    else:
      print(f"[INFO] Failed to download the file. Status Code: {response.status_code}")

else:
    print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.10 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.10 PyMuPDFb-1.24.10


In [None]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
  cleaned_text = text.replace("\n", "").strip()
  return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
  doc = fitz.open(pdf_path)
  pages_and_texts = []
  for page_number, page in tqdm(enumerate(doc)):
    text = page.get_text()
    text = text_formatter(text=text)
    pages_and_texts.append({"page_number": page_number-41,
                            "page_char_count": len(text),
                            "page_word_count": len(text.split(" ")),
                            "page_sentence_count_raw": len(text.split(". ")),
                            "page_token_count": len(text) / 4,
                            "text": text})

  return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [None]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 44,
  'page_char_count': 587,
  'page_word_count': 94,
  'page_sentence_count_raw': 4,
  'page_token_count': 146.75,
  'text': '(Source: UNICEF, 1986, How to weigh and measure children: assessing the nutrition status of young children) Biochemical methods of assessing nutritional status Biochemical or laboratory methods of assessment include measuring a nutrient or its metabolite in the blood, feces, urine or other tissues that have a relationship with the nutrient. An example of this method would be to take blood samples to measure levels of glucose in the body. This method is useful for determining if an individual has diabetes. Figure 1.4 Measuring Blood Glucose Levels 44  |  Types of Scientific Studies'},
 {'page_number': 88,
  'page_char_count': 1654,
  'page_word_count': 273,
  'page_sentence_count_raw': 13,
  'page_token_count': 413.5,
  'text': 'the iron-containing hemoglobin molecule in red blood cells serves as the oxygen carrier. Wastes Out In the metabolism

In [None]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,308,42,1,77.0,Human Nutrition: 2020 Edition UNIVERSITY OF HA...
3,-38,210,30,1,52.5,Human Nutrition: 2020 Edition by University of...
4,-37,766,114,2,191.5,Contents Preface University of Hawai‘i at Māno...


In [None]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1122.01,172.31,9.97,280.5
std,348.86,551.68,86.27,6.18,137.92
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,744.0,110.0,4.0,186.0
50%,562.5,1194.0,182.5,10.0,298.5
75%,864.25,1571.25,238.0,14.0,392.81
max,1166.0,2273.0,394.0,32.0,568.25


In [None]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This another sentence. I like elephants.")
assert len(list(doc.sents)) == 3

list(doc.sents)

[This is a sentence., This another sentence., I like elephants.]

In [None]:
pages_and_texts[600]

{'page_number': 559,
 'page_char_count': 844,
 'page_word_count': 117,
 'page_sentence_count_raw': 8,
 'page_token_count': 211.0,
 'text': 'Image by Allison Calabrese / CC BY 4.0 Korsakoff syndrome can cause similar symptoms as beriberi such as confusion, loss of coordination, vision changes, hallucinations, and may progress to coma and death. This condition is specific to alcoholics as diets high in alcohol can cause thiamin deficiency. Other individuals at risk include individuals who also consume diets typically low in micronutrients such as those with eating disorders, elderly, and individuals who have gone through gastric bypass surgery.5 Figure 9.10 The Role of Thiamin Figure 9.11 Beriberi, Thiamin Deficiency 5.\xa0Fact Sheets for Health Professionals: Thiamin. National Institute of Health, Office of Dietary Supplements. \xa0https://ods.od.nih.gov/factsheets/Thiamin-HealthProfessional/. Updated Feburary 11, 2016. Accessed October 22, 2017. Water-Soluble Vitamins  |  559'}

In [None]:
for item in tqdm(pages_and_texts):
  item['sentences'] = list(nlp(item["text"]).sents)
  item["sentences"] = [str(sentence) for sentence in item["sentences"]]
  item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [None]:
random.sample(pages_and_texts, k=1)

[{'page_number': 212,
  'page_char_count': 1291,
  'page_word_count': 192,
  'page_sentence_count_raw': 12,
  'page_token_count': 322.75,
  'text': 'The Beverage Panel recommends an even lower intake of calories from beverages than IOM—10 percent or less of total caloric intake. Table 3.10 Recommendations of the Beverage Panel Beverage Servings per day* Water ≥ 4 (women), ≥ 6 (men) Unsweetened coffee and tea ≤ 8 for tea, ≤ 4 for coffee Nonfat and low-fat milk; fortified soy drinks ≤ 2 Diet beverages with sugar substitutes ≤ 4 100 percent fruit juices, whole milk, sports drinks ≤ 1 Calorie-rich beverages without nutrients ≤ 1, less if trying to lose weight *One serving is eight ounces. Source: Beverage Panel Recommendations and Analysis. University of North Carolina, Chapel Hill. US Beverage Guidance Council. http://www.cpc.unc.edu/projects/nutrans/policy/beverage/us-beverage-panel. Accessed November 6, 2012. Sources of Drinking Water The Beverage Panel recommends that women drink at le

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1122.01,172.31,9.97,280.5,10.32
std,348.86,551.68,86.27,6.18,137.92,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,744.0,110.0,4.0,186.0,5.0
50%,562.5,1194.0,182.5,10.0,298.5,10.0
75%,864.25,1571.25,238.0,14.0,392.81,15.0
max,1166.0,2273.0,394.0,32.0,568.25,28.0


# smaller groups of text can be easier to inspect that large passages of text

In [None]:
num_sentence_chunk_size = 10

def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
  return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [None]:
## Loop through pages and texts and split sentences into chunks

for item in tqdm(pages_and_texts):
  item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                       slice_size=num_sentence_chunk_size)
  item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [None]:
random.sample(pages_and_texts, k=1)

[{'page_number': 776,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': '',
  'sentences': [],
  'page_sentence_count_spacy': 0,
  'sentence_chunks': [],
  'num_chunks': 0}]

In [None]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1085,
  'page_char_count': 1694,
  'page_word_count': 223,
  'page_sentence_count_raw': 18,
  'page_token_count': 423.5,
  'text': 'Colon Health A substantial health benefit of whole grain foods is that fiber actively supports digestion and optimizes colon health. (This can be more specifically attributed to the insoluble fiber content of whole grains.) There is good evidence supporting that insoluble fiber prevents the irritating problem of constipation and the development of diverticulosis and diverticulitis. Diverticulosis is a benign condition characterized by outpouches of the colon. Diverticulitis occurs when the outpouches in the lining of the colon become inflamed. Interestingly, diverticulitis did not make its medical debut until the early 1900s, and in 1971 was defined as a deficiency of whole-grain fiber. According to the National Digestive Diseases Information Clearinghouse, 10 percent of Americans over the age of forty have diverticulosis, and 50 percent o

In [None]:
"""
Splitting each chunk into its own item..

We'd liek to embed each chunk of sentences into its own numerical representation.
That'll give us a good level of granularity.
Meaning, we can dive specifically into the text sample that was used in our model.
"""

"\nSplitting each chunk into its own item..\n\nWe'd liek to embed each chunk of sentences into its own numerical representation.\nThat'll give us a good level of granularity.\nMeaning, we can dive specifically into the text sample that was used in our model.\n"

In [None]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka join the list of sentences into one paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" => ". A" (will work for any captial letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 chars

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [None]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 472,
  'sentence_chunk': 'Photo by Hope House Press on unsplash.com / CC0 https://unsplash.com/photos/PJzc7LOt2Ig Weight Management UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM “Obesogenic” is a word that has sprung up in the language of public health professionals in the last two decades. The Centers for Disease Control and Prevention (CDC) defines obesogenic as “an environment that promotes increased food intake, non-healthful foods, and physical inactivity.”1 1.\xa0Obesogenic Environments. Center for Disease Control and Prevention (CDC).https://www.cdc.gov/pcd/472 | Weight Management',
  'chunk_char_count': 619,
  'chunk_word_count': 82,
  'chunk_token_count': 154.75}]

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,732.99,110.88,183.25
std,347.79,447.53,71.36,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,313.5,42.5,78.38
50%,586.0,744.0,112.0,186.0
75%,890.0,1117.5,172.0,279.38
max,1166.0,1830.0,296.0,457.5


In [None]:
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25


In [None]:
# filter chunks of text for short chunks

# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 7.25 | Text: Human Nutrition: 2020 Edition
Chunk token count: 18.75 | Text: http://pressbooks.oer.hawaii.edu/humannutrition2/?p=519  Introduction | 991
Chunk token count: 16.5 | Text: Table 4.6 Sweeteners Carbohydrates and Personal Diet Choices | 281
Chunk token count: 24.25 | Text: These activities are available in the web-based textbook and not available in the Magnesium | 643
Chunk token count: 27.75 | Text: In exchange, for the reabsorption of sodium and water, potassium is excreted. Regulation of Water Balance | 169


In [None]:
# Filter our DataFrame for rows with under 30 tokens

pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [None]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 373,
  'sentence_chunk': 'available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mobile devices, however, for the best user experience it is strongly recommended that users complete these activities using a desktop or laptop computer and in Google Chrome.\xa0An interactive or media element has been excluded from this version of the text. You can view it online here: http://pressbooks.oer.hawaii.edu/humannutrition2/?p=246 \xa0An interactive or media element has been excluded from this version of the text. You can view it online here: http://pressbooks.oer.hawaii.edu/humannutrition2/?p=246 Defining Protein | 373',
  'chunk_char_count': 710,
  'chunk_word_count': 98,
  'chunk_token_count': 177.5}]

In [None]:
"""
Embedding our text chunks

while humans understand text, machines understand numbers

what we'd like to do:
1. Turn our text chunks into numbers, specifically embeddings.

A useful numerical representation.
The best part about embeddings is that are a learned representation.

"""

"\nEmbedding our text chunks\n\nwhile humans understand text, machines understand numbers\n\nwhat we'd like to do:\n1. Turn our text chunks into numbers, specifically embeddings.\n\nA useful numerical representation.\nThe best part about embeddings is that are a learned representation.\n\n"

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda" if torch.cuda.is_available() else "cpu")

sentences = ["The Sentence Transformer library provides an easy way to create embeddings.",
             "Sentences can be embedded one by one or in a list.",
             "I like horses!"]

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
  print(f"Sentence: {sentence}")
  print(f"Embedding: {embedding}")
  print("")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence: The Sentence Transformer library provides an easy way to create embeddings.
Embedding: [-3.44286002e-02  2.95328386e-02 -2.33643521e-02  5.57257533e-02
 -2.19098590e-02 -6.47061085e-03  1.02848485e-02 -6.57803416e-02
  2.29718033e-02 -2.61121243e-02  3.80420350e-02  5.61403073e-02
 -3.68746556e-02  1.52787957e-02  4.37020436e-02 -5.19723557e-02
  4.89479825e-02  3.58103495e-03 -1.29750762e-02  3.54387029e-03
  4.23262678e-02  3.52606587e-02  2.49402542e-02  2.99176741e-02
 -1.99382659e-02 -2.39752606e-02 -3.33362678e-03 -4.30450290e-02
  5.72014563e-02 -1.32517833e-02 -3.54477912e-02 -1.13936048e-02
  5.55561110e-02  3.61094112e-03  8.88526984e-07  1.14027057e-02
 -3.82230096e-02 -2.43547955e-03  1.51314326e-02 -1.32706380e-04
  5.00659980e-02 -5.50876781e-02  1.73444971e-02  5.00959195e-02
 -3.75959463e-02 -1.04463417e-02  5.08322306e-02  1.24861458e-02
  8.67377371e-02  4.64142971e-02 -2.10690070e-02 -3.90251726e-02
  1.99698308e-03 -1.42345503e-02 -1.86794922e-02  2.826695

In [None]:
embeddings.shape

(3, 768)

In [None]:
# %%time

# embedding_model.to("cpu")

# # Embed each chunk one by one
# for item in tqdm(pages_and_chunks_over_min_token_len):
#   item["embedding"]  = embedding_model.encode(item["sentence_chunk"])

In [None]:
%%time

embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
text_chunks[419]

CPU times: user 528 µs, sys: 0 ns, total: 528 µs
Wall time: 602 µs


'often. •Calm your “sweet tooth” by eating fruits, such as berries or an apple. •Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow-releasing carbohydrates, such as alginate, to give shape and texture to their fascinating food creations. Adding

In [None]:
len(text_chunks)

1679

In [None]:
%%time

# embedding_model.to("cuda")
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,
                                               convert_to_tensor=True)
text_chunk_embeddings

CPU times: user 25min 11s, sys: 5min 5s, total: 30min 17s
Wall time: 32min 59s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]])

In [None]:
## Save Embedding to file

pages_and_chunks_over_min_token_len[419]

{'page_number': 277,
 'sentence_chunk': 'often. •Calm your “sweet tooth” by eating fruits, such as berries or an apple. •Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow-releasing carbohydrates, such as alginate, to give shape and texture to 

In [None]:
# save embedding to file

text_chunk_embedding_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embedding_df_save_path = "text_chunks_and_embedding_df.csv"
text_chunk_embedding_df.to_csv(embedding_df_save_path, index=False)

In [None]:
text_chunks_and_embedding_df_load  = pd.read_csv(embedding_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5


In [None]:
import random

import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embedding_df.csv")

text_chunks_and_embedding_df['embedding'] = text_chunks_and_embedding_df['embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

embeddings = torch.tensor(np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0), dtype=torch.float32).to(device)

pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

text_chunks_and_embedding_df

In [None]:
embeddings.shape

In [None]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

In [None]:
# 1. Define the query
query = "good foods for protein"
print(f"Query: {query}")

# Embed the query
# it's import to embed you query with the same model you embedding you passages
query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cuda")

from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

In [None]:
larger_embeddings = torch.randn(100*embeddings.shape[0], 768).to(device)
print(f"Embedding shape: {larger_embeddings.shape}")

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()

print(f"[INFO] Time taken to get scores on {len(larger_embeddings)} embeddings: {end_time-start_time:.5f} seconds.")


In [None]:
import textwrap

def print_wrapped(text, wrap_length=80):
  wrapped_text = textwrap.fill(text, wrap_length)
  print(wrapped_text)

In [None]:
query = "good foods for protein"
print(f"Query: '{query}' \n")
print("Results:")

for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
  print(f"Score: {score:.4f}")
  print("Text:")
  print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
  print(f"Page number")

In [None]:
import fitz

pdf_path = "human-nutrition-text.pdf"
doc = fitz.open(pdf_path)
page = doc.load_page(411+41)

img = page.get_pixmap(dpi=300)

doc.close()

img_array = np.frombuffer(img.samples_mv, dtype=np.uint8).reshape((img.h, img.w, img.n))

# Display the image using Matplotlib
import matplotlib.pyplot as plt
plt.figure(figsize=(13, 10))
plt.imshow(img_array)
plt.title(f"Query: '{query}' | Most relevant page:")
plt.axis('off')
plt.show()

In [None]:
# Similarity measures: dot product and cosine similarity

import torch

def dot_product(vector1, vector2):
  return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
  dot_product = torch.dot(vector1, vector2)

  norm_vector1 = torch.norm(vector1)
  norm_vector2 = torch.norm(vector2)

  return dot_product / (norm_vector1 * norm_vector2)

vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector3 = torch.tensor([7, 8, 9], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

print(f"Dot product: {dot_product(vector1, vector2)}")
print(f"Cosine similarity: {cosine_similarity(vector1, vector2)}")

# Cosine similarity
print(f"Cosine similarity: {cosine_similarity(vector1, vector3)}")
print(f"Cosine similarity: {cosine_similarity(vector1, vector4)}")
print(f"Cosine similarity: {cosine_similarity(vector2, vector3)}")
print(f"Cosine similarity: {cosine_similarity(vector2, vector4)}")
print(f"Cosine similarity: {cosine_similarity(vector3, vector4)}"

In [None]:
## Functionizing our semantic search pipeline

def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):

  query_embedding = model.encode(query, convert_to_tensor=True)
  start_time = timer()
  dot_scores = util.dot_score(query_embedding, embeddings)[0]
  end_time = timer()

  if print_time:
    print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

  scores, indices = torch.topk(input=dot_scores, k=n_resources_to_return)

  return scores, indices



In [None]:
def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Finds relevant passages given a query and prints them out along with their scores.
    """
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    # Loop through zipped together scores and indices from torch.topk
    for score, idx in zip(scores, indices):
        print(f"Score: {score:.4f}")
        print("Text:")
        print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
        print(f"Page number: {pages_and_chunks[idx]['page_number']}")
        print("\n")

In [None]:
query="foods high in fiber"
# retrieve_relevant_resources(query=query, embeddings=embeddings)
print_top_results_and_scores(query=query, embeddings=embeddings)