### Reading PDF file


---



In [None]:
import os
import requests

# Get PDF document path
pdf_path = "human-nutrition-text.pdf"

# Download PDF
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading...")

    # Enter the URL of the PDF
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # The local filename to save the downloaded file
    filename = pdf_path

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been download and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {reponse.status_code}")

else:
    print(f"File {pdf_path} exists.")

[INFO] File doesn't exist, downloading...
[INFO] The file has been download and saved as human-nutrition-text.pdf


In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.2-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.1 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.2 PyMuPDFb-1.24.1


In [None]:
import fitz

def text_formatter(text:str)->str:
  cleaned_text=text.replace("\n"," ").strip()

  return cleaned_text

def open_read_pdf(pdf_path: str) -> list[dict]:
  doc=fitz.open(pdf_path)
  pages_and_text=[]
  for page_num, page in enumerate(doc):
    text=page.get_text()
    text=text_formatter(text=text)
    pages_and_text.append({"Page number" : page_num-41,
                          "Total Char": len(text),
                          "Total words": len(text.split(" ")),
                           "Total Tokens": len(text)/4,
                          "Content": text
                           })
  return pages_and_text

pages_and_text=open_read_pdf(pdf_path=pdf_path)
pages_and_text[:3]

[{'Page number': -41,
  'Total Char': 29,
  'Total words': 4,
  'Total Tokens': 7.25,
  'Content': 'Human Nutrition: 2020 Edition'},
 {'Page number': -40,
  'Total Char': 0,
  'Total words': 1,
  'Total Tokens': 0.0,
  'Content': ''},
 {'Page number': -39,
  'Total Char': 320,
  'Total words': 54,
  'Total Tokens': 80.0,
  'Content': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE'}]

In [None]:
import pandas as pd

df=pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,Page number,Total Char,Total words,Total Tokens,Content
0,-41,29,4,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,0.0,
2,-39,320,54,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,199.25,Contents Preface University of Hawai‘i at Mā...


In [None]:
df.iloc[42:,:]

Unnamed: 0,Page number,Total Char,Total words,Total Tokens,Content
42,1,93,20,23.25,PART I CHAPTER 1. BASIC CONCEPTS IN NUTRITIO...
43,2,0,1,0.00,
44,3,260,55,65.00,Image by Jim Hollyer / CC BY 4.0 Introducti...
45,4,1181,202,295.25,Learning Objectives By the end of this chapte...
46,5,1742,295,435.50,Macronutrients Nutrients that are needed ...
...,...,...,...,...,...
1203,1162,1676,252,419.00,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...
1204,1163,1617,254,404.25,Images / Pixabay License; “Pumpkin Cartoon Ora...
1205,1164,1715,261,428.75,Flashcard Images Note: Most images in the fla...
1206,1165,1733,268,433.25,ShareAlike 11. Organs reused “Pancreas Organ ...


In [None]:
df.describe().round(2)

Unnamed: 0,Page number,Total Char,Total words,Total Tokens
count,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,287.15
std,348.86,560.44,95.75,140.11
min,-41.0,0.0,1.0,0.0
25%,260.75,762.75,134.0,190.69
50%,562.5,1232.5,215.0,308.12
75%,864.25,1605.25,271.25,401.31
max,1166.0,2308.0,429.0,577.0


In [None]:
df.to_csv("Nutrition data.csv", index="False")

In [None]:
from spacy.lang.en import English

nlp=English()
nlp.add_pipe('sentencizer')

doc=nlp("This is one. This is two. third one.")
list(doc.sents)

[This is one., This is two., third one.]

In [None]:
from tqdm.auto import tqdm

In [None]:
for item in tqdm(pages_and_text):
  item["sentences"]=list(nlp(item["Content"]).sents)

  item["sentences"] = [str(sentence) for sentence in item["sentences"]]

   # Count the sentences
  item["page_sentence_count_spacy"] = len(item["sentences"])



  0%|          | 0/1208 [00:00<?, ?it/s]

In [None]:
pages_and_text[1139]

{'Page number': 1098,
 'Total Char': 1518,
 'Total words': 271,
 'Total Tokens': 379.5,
 'sentences': ['Threats to Health  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Chronic Diseases  Chronic diseases are ongoing, life-threatening, and life-altering  health challenges.',
  'They are the leading cause of death worldwide.',
  ' Chronic conditions are increasing in frequency.',
  'They cause  significant physical and emotional suffering and are an impediment  to economic growth and vitality.',
  'It is important, now more than  ever, to understand the different risk factors for chronic disease and  to learn how to prevent their development.',
  ' The Risk Factors of Chronic Disease  A risk factor is a signal that your chances for acquiring a chronic  disease may be increased.',
  'You might liken a risk factor to the flags  that lifeguards sometimes set up at beaches.',
  'When you see these  flags, you know immediately that swimm

### Splitting into chuncks


In [None]:
chunck_size=10

def split_list(input_list: list[str],slice_size: int=chunck_size) -> list[list[str]]:
  return [input_list[i:i+slice_size] for i in range(0,len(input_list),slice_size)]

In [None]:
for item in tqdm(pages_and_text):
  item["sentence_chunck"]=split_list(item['sentences'])
  item['num_chunck']=len(item["sentence_chunck"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [None]:
import random
random.sample(pages_and_text, k=1)

[{'Page number': 625,
  'Total Char': 1328,
  'Total words': 251,
  'Total Tokens': 332.0,
  'Content': 'Age Group  RDA (mg/day) UL (mg/day)  Infants (0–6 months)  200*  –  Infants (6–12 months)  260*  –  Children (1–3 years)  700  2,500  Children (4–8 years)  1,000  2,500  Children (9–13 years)  1,300  2,500  Adolescents (14–18 years)  1,300  2,500  Adults (19–50 years)  1,000  2,500  Adult females (50–71 years)  1,200  2,500  Adults, male & female (> 71 years) 1,200  2,500  * denotes Adequate Intake  Source: Ross AC, Manson JE, et al. The 2011 Report on Dietary  Reference Intakes for Calcium and Vitamin D from the Institute of  Medicine: What Clinicians Need to Know. J Clin Endocrinol Metab.  2011; 96(1), 53–8. http:/ /www.ncbi.nlm.nih.gov/pubmed/21118827.  Accessed October 10, 2017.  Dietary Sources of Calcium  In the typical American diet, calcium is obtained mostly from dairy  products, primarily cheese. A slice of cheddar or Swiss cheese  contains just over 200 milligrams of calc

In [None]:
df=pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,Page number,Total Char,Total words,Total Tokens,page_sentence_count_spacy,num_chunck
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,287.15,10.32,1.53
std,348.86,560.44,95.75,140.11,6.3,0.64
min,-41.0,0.0,1.0,0.0,0.0,0.0
25%,260.75,762.75,134.0,190.69,5.0,1.0
50%,562.5,1232.5,215.0,308.12,10.0,1.0
75%,864.25,1605.25,271.25,401.31,15.0,2.0
max,1166.0,2308.0,429.0,577.0,28.0,3.0


In [None]:
import re

pages_and_chunks=[]
for item in tqdm(pages_and_text):
  for sentence in item['sentence_chunck']:
    chunk_dict={}
    chunk_dict['Page num']=item["Page number"]

    joined_chunk="".join(sentence).replace("  "," ").strip()
    joined_chunk=re.sub(r'\.([A-Z])', r'. \1', joined_chunk)
    chunk_dict['char count']=len(joined_chunk)
    chunk_dict['word count']=len([word for word in joined_chunk.split(" ")])
    chunk_dict['token count']=chunk_dict['char count']/4
    chunk_dict['sentence_chunk']=joined_chunk

    pages_and_chunks.append(chunk_dict)
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [None]:
random.sample(pages_and_chunks ,k=1)

[{'Page num': 24,
  'char count': 294,
  'word count': 28,
  'token count': 73.5,
  'sentence_chunk': 'National Institute on Drug Abuse (2017, March 23). Health Consequences of Drug Misuse. \xa0https:/ /www.drugabuse.gov/related-topics/health- consequences-drug-misuse. 7.\xa0National Sleep Foundation. Sleep Disorders. https:/ /sleepfoundation.org/sleep-disorders-problems 24 | Lifestyles and Nutrition'}]

In [None]:
df=pd.DataFrame(pages_and_chunks)
df.sample(5)

Unnamed: 0,Page num,char count,word count,token count,sentence_chunk
533,328,210,25,52.5,Omega-3 fatty acids. University of Maryland Me...
1013,651,365,62,91.25,Wakame Salad Seaweed Food Cooking by maxpixel....
1466,940,521,63,130.25,Increasing your daily activity and shedding ex...
567,348,188,23,47.0,Childhood Obesity. US Department of Health and...
860,542,1207,193,301.75,Age Group RDA Males and Females mg/day UL Infa...


In [None]:
min_count_token=30
pages_and_chunks_over_min_token=df[df['token count']>min_count_token].to_dict(orient='records')
random.sample(pages_and_chunks_over_min_token,k=2)

[{'Page num': 1143,
  'char count': 141,
  'word count': 22,
  'token count': 35.25,
  'sentence_chunk': 'Forty-seven states have licensure requirements for RDs and nutritionists. A few remaining states do not have laws Careers in Nutrition | 1143'},
 {'Page num': 657,
  'char count': 534,
  'word count': 71,
  'token count': 133.5,
  'sentence_chunk': 'Vegans are at higher risk for iron deficiency, but careful meal planning does prevent its development. Iron deficiency is the most common of all micronutrient deficiencies. Table 11.1 Enhancers and Inhibitors of Iron Absorption Enhancer Inhibitor Meat Phosphate Fish Calcium Poultry Tea Seafood Coffee Stomach acid Colas Soy protein High doses of minerals (antacids) Bran/fiber Phytates Oxalates Polyphenols Figure 11.3 Iron Absorption, Functions, and Loss http:/ /www.cdc.gov/nutrition/everyone/basics/ vitamins/iron.html. Iron | 657'}]

### Embedding


In [None]:
pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model= SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

sentences=[" I like machine learning. ","I like natural language processing. ", "I like dogs. "]

embedding=embedding_model.encode(sentences)
embedding_dict=dict(zip(sentences,embedding))

for sentence,embeddings in embedding_dict.items():
  print(f"Sentence: {sentence}")
  print(f"Embedding: {embeddings}")


In [None]:
embedding[0].shape

(768,)

In [None]:
for item in tqdm(pages_and_chunks_over_min_token):
  item["Embedding"]=embedding_model.encode(item['sentence_chunk'])


  0%|          | 0/1680 [00:00<?, ?it/s]

In [None]:
pages_and_chunks_over_min_token[45]

In [None]:
text_chunk_and_embeddings=pd.DataFrame(pages_and_chunks_over_min_token)
text_chunk_and_embeddings.to_csv("Chunks with embedding.csv",index=False)

In [17]:
import random

import torch
import numpy as np
import pandas as pd

chunks_with_embedding_df=pd.read_csv("Chunks with embedding.csv")
chunks_with_embedding_df["Embedding"]=chunks_with_embedding_df["Embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

embeddings = torch.tensor(np.stack(chunks_with_embedding_df["Embedding"].tolist(), axis=0), dtype=torch.float32)

# Convert texts and embedding df to list of dicts
pages_and_chunks = chunks_with_embedding_df.to_dict(orient="records")

chunks_with_embedding_df

Unnamed: 0,Page num,char count,word count,token count,sentence_chunk,Embedding
0,-39,308,42,77.00,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,"[0.0674242601, 0.0902279988, -0.00509551819, -..."
1,-38,210,30,52.50,Human Nutrition: 2020 Edition by University of...,"[0.0552156493, 0.0592137985, -0.0166167784, -0..."
2,-37,766,114,191.50,Contents Preface University of Hawai‘i at Māno...,"[0.0279801469, 0.0339813307, -0.0206427258, 0...."
3,-36,941,142,235.25,Lifestyles and Nutrition University of Hawai‘i...,"[0.0682566613, 0.0381274484, -0.00846858509, -..."
4,-35,998,152,249.50,The Cardiovascular System University of Hawai‘...,"[0.0330264196, -0.00849771872, 0.00957152341, ..."
...,...,...,...,...,...,...
1675,1164,1305,176,326.25,Flashcard Images Note: Most images in the flas...,"[0.018562289, -0.0164279491, -0.0127046462, -0..."
1676,1164,375,51,93.75,Hazard Analysis Critical Control Points reused...,"[0.0334722102, -0.0570441186, 0.0151489163, -0..."
1677,1165,1286,173,321.50,ShareAlike 11. Organs reused “Pancreas Organ A...,"[0.0770514011, 0.00978544541, -0.0121817011, 0..."
1678,1165,410,59,102.50,Sucrose reused “Figure 03 02 05” by OpenStax B...,"[0.103045098, -0.0164702833, 0.00826842152, 0...."


In [18]:
embeddings.shape

torch.Size([1680, 768])

### RAG

In [19]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


In [20]:
from sentence_transformers import util,SentenceTransformer

embedding_model=SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
query="good foods for protein"
print(f'Query: {query}')

query_embedding=embedding_model.encode(query)

top_k=util.semantic_search(query_embedding,embeddings, top_k=5)
top_k

Query: good foods for protein


[[{'corpus_id': 611, 'score': 0.7728913426399231},
  {'corpus_id': 616, 'score': 0.7647284865379333},
  {'corpus_id': 615, 'score': 0.6743333339691162},
  {'corpus_id': 620, 'score': 0.674252986907959},
  {'corpus_id': 617, 'score': 0.6633554100990295}]]

In [22]:
pages_and_chunks[611]

{'Page num': 411,
 'char count': 430,
 'word count': 68,
 'token count': 107.5,
 'sentence_chunk': 'Dietary Sources of Protein The protein food group consists of foods made from meat, seafood, poultry, eggs, soy, dry beans, peas, and seeds. According to the Harvard School of Public Health, “animal protein and vegetable protein probably have the same effects on health. It’s the protein package that’s likely to make a difference.”1 1.\xa0Protein: The Bottom Line. Harvard School of Public Proteins, Diet, and Personal Choices | 411',
 'Embedding': array([ 3.57393287e-02,  4.69983965e-02,  2.42604432e-03, -1.34758586e-02,
         4.41605747e-02,  1.58364989e-03, -5.75249717e-02,  7.43903071e-02,
        -2.55551897e-02, -5.65149188e-02, -2.50401404e-02,  1.29939138e-03,
         5.23344688e-02,  2.63163075e-02,  1.98241640e-02, -4.84023057e-03,
         1.22621385e-02,  6.03821538e-02,  2.70720199e-02,  2.33721621e-02,
        -3.14021818e-02, -5.45088341e-03,  1.16759324e-02,  2.25814022e

In [23]:
import textwrap

def print_wrapped(text, wrap_length=100):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [24]:
query = "good foods for protein"
print(f"Query: '{query}'\n")
print("Results:")
for i in top_k:
  for item in i:
      print(f"Score: {item['score']:.4f}")
      print("Text:")
      print_wrapped(pages_and_chunks[item['corpus_id']]["sentence_chunk"])
      print(f"Page number: {pages_and_chunks[item['corpus_id']]['Page num']}")
      print("\n")

Query: 'good foods for protein'

Results:
Score: 0.7729
Text:
Dietary Sources of Protein The protein food group consists of foods made from meat, seafood,
poultry, eggs, soy, dry beans, peas, and seeds. According to the Harvard School of Public Health,
“animal protein and vegetable protein probably have the same effects on health. It’s the protein
package that’s likely to make a difference.”1 1. Protein: The Bottom Line. Harvard School of Public
Proteins, Diet, and Personal Choices | 411
Page number: 411


Score: 0.7647
Text:
Additionally, a person should consume 8 ounces of cooked seafood every week (typically as two
4-ounce servings) to assure they are getting the healthy omega-3 fatty acids that have been linked
to a lower risk for heart disease. Another tip is choosing to eat dry beans, peas, or soy products
as a main dish. Some of the menu choices include chili with kidney and pinto beans, hummus on pita
bread, and black bean enchiladas. You could also enjoy nuts in a variety of w

### Semantic search pipeline

In [25]:
def retrieve_relevant_chunks(query: str,
                             embeddings: torch.tensor,
                             model: SentenceTransformer=embedding_model,
                             n_resources_to_return: int=5
                             ):

  query_embedding=model.encode(query)

  dot_scores=util.dot_score(query_embedding, embeddings)[0]

  score, indices=torch.topk(dot_scores, k=n_resources_to_return)

  return score, indices

def print_relevant_info(query: str,
                        embeddings: torch.tensor,
                        pages_and_chunks: list[dict]=pages_and_chunks,
                        model : SentenceTransformer=embedding_model,
                        n_resources_to_return: int=5
                        ):
  scores,indices =retrieve_relevant_chunks(query=query,
                                            embeddings=embeddings,
                                            n_resources_to_return=n_resources_to_return)
  for score, idx in zip(scores, indices):
        print(f"Score: {score:}")
        print("Text:")
        print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
        print(f"Page number: {pages_and_chunks[idx]['Page num']}")
        print("\n")

In [28]:
query="foods high in fiber"
retrieve_relevant_chunks(query=query, embeddings=embeddings)
print_relevant_info(query=query, embeddings=embeddings)

Score: 0.6963629722595215
Text:
• Change it up a bit and experience the taste and satisfaction of other whole grains such as barley,
quinoa, and bulgur. • Eat snacks high in fiber, such as almonds, pistachios, raisins, and air-popped
popcorn. Add an artichoke and green peas to your dinner plate more 276 | Carbohydrates and Personal
Diet Choices
Page number: 276


Score: 0.6809899806976318
Text:
Dietary fiber is categorized as either water-soluble or insoluble. Some examples of soluble fibers
are inulin, pectin, and guar gum and they are found in peas, beans, oats, barley, and rye. Cellulose
and lignin are insoluble fibers and a few dietary sources of them are whole-grain foods, flax,
cauliflower, and avocados. Cellulose is the most abundant fiber in plants, making up the cell walls
and providing structure. Soluble fibers are more easily accessible to bacterial enzymes in the large
intestine so they can be broken down to a greater extent than insoluble fibers, but even some
breakdown of

### Loading LLM

In [2]:
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 15 GB


In [3]:
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


In [4]:
!pip install bitsandbytes



In [8]:
access_token = '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'
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token = access_token)
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token = access_token)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# 1. Create a quantization config
# Note: requires !pip install bitsandbytes accelerate
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: flash attention 2 = faster attention mechanism
# Flash Attention 2 requires a GPU with a compute capability score of 8.0+ (Ampere, Ada Lovelace, Hopper and above): https://developer.nvidia.com/cuda-gpus
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa" # scaled dot product attention
print(f"Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use
#model_id = "google/gemma-7b-it"
model_id = model_id

# 3. Instantiate tokenizer (tokenizer turns text into tokens)
access_token = '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'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id,token = access_token)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16,
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use as much memory as we can
                                                 attn_implementation=attn_implementation,
                                                 token = access_token
                                                 )

if not use_quantization_config:
    llm_model.to("cuda")

Using attention implementation: sdpa


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
print(f"Using attention implementation: {attn_implementation}")

Using attention implementation: sdpa


In [7]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

In [8]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

2506172416

In [9]:
def get_model_mem_size(model: torch.nn.Module):
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate model sizes
    model_mem_bytes = mem_params + mem_buffers
    model_mem_mb = model_mem_bytes / (1024**2)
    model_mem_gb = model_mem_bytes / (1024**3)

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 5012344832, 'model_mem_mb': 4780.14, 'model_mem_gb': 4.67}

### Generating text with the LLM

In [10]:
input_text = "What are the macronutrients and what are their functions in the body?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
What are the macronutrients and what are their functions in the body?

Prompt (formatted):
<bos><start_of_turn>user
What are the macronutrients and what are their functions in the body?<end_of_turn>
<start_of_turn>model



In [11]:
tokenizer

GemmaTokenizerFast(name_or_path='google/gemma-2b-it', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, single_w

In [12]:
%%time

# Tokenize the input text (turn it into numbers) and send it to the GPU
input_ids = tokenizer(prompt,
                      return_tensors="pt").to("cuda")

# Generate outputs from local LLM
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256)
print(f"Model output (tokens):\n{outputs[0]}\n")

Model output (tokens):
tensor([     2,      2,    106,   1645,    108,   1841,    708,    573, 186809,
        184592,    578,   1212,    708,   1024,   7257,    575,    573,   2971,
        235336,    107,    108,    106,   2516,    108,  21404, 235269,   1517,
        235303, 235256,    476,  25497,    576,    573, 186809, 184592,    578,
          1024,   7257,    575,    573,   2971, 235292,    109,    688,  12298,
          1695, 184592,  66058,    109, 235287,   5231, 156615,  56227,  66058,
           108,    141, 235287,  34428,   4134,    604,    573,   2971, 235303,
        235256,   5999,    578,  29703, 235265,    108,    141, 235287, 110165,
         56227,    708,    573,   2971, 235303, 235256,   1872,   4303,    576,
          4134, 235269,   5199,   2290,   6915,   5640, 235265,    108,    141,
        235287,  25280,  72780,   3707,   2343,   2127, 235269,  38317, 235269,
           578,  84475, 235265,    108, 235287,   5231,  49471,  66058,    108,
           141, 2

In [13]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
What are the macronutrients and what are their functions in the body?<end_of_turn>
<start_of_turn>model
Sure, here's a breakdown of the macronutrients and their functions in the body:

**Macronutrients:**

* **Carbohydrates:**
    * Provide energy for the body's cells and tissues.
    * Carbohydrates are the body's main source of energy, especially during physical activity.
    * Complex carbohydrates include starches, fibers, and sugars.
* **Protein:**
    * Builds and repairs tissues, enzymes, and hormones.
    * Protein is essential for muscle growth and repair, as well as for the production of antibodies.
    * Complete proteins include animal products, legumes, nuts, and seeds.
* **Fat:**
    * Provides energy, insulation, and the production of hormones.
    * Healthy fats include olive oil, avocado, nuts, and seeds.
    * Trans fats can raise cholesterol levels and increase the risk of heart disease.

**Functions of Macronutri

In [14]:
# Nutrition-style questions generated with GPT4
gpt4_questions = [
    "What are the macronutrients, and what roles do they play in the human body?",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in the human body.",
    "What role does fibre play in digestion? Name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management."
]

# Manually created question list
manual_questions = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins"
]

query_list = gpt4_questions + manual_questions
query_list

['What are the macronutrients, and what roles do they play in the human body?',
 'How do vitamins and minerals differ in their roles and importance for health?',
 'Describe the process of digestion and absorption of nutrients in the human body.',
 'What role does fibre play in digestion? Name five fibre containing foods.',
 'Explain the concept of energy balance and its importance in weight management.',
 'How often should infants be breastfed?',
 'What are symptoms of pellagra?',
 'How does saliva help with digestion?',
 'What is the RDI for protein per day?',
 'water soluble vitamins']

In [31]:
import random

query = random.choice(query_list)
print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_chunks(query=query,
                                              embeddings=embeddings)
scores, indices

Query: Describe the process of digestion and absorption of nutrients in the human body.


(tensor([0.7485, 0.7013, 0.6914, 0.6755, 0.6625]),
 tensor([125, 134, 380, 148, 681]))

### Augumenting prompt with context items

In [34]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""
    base_prompt = base_prompt.format(context=context,
                                     query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
         "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                           tokenize=False,
                                           add_generation_prompt=True)

    return prompt

query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_chunks(query=query,
                                              embeddings=embeddings)

# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format our prompt
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

Query: Describe the process of digestion and absorption of nutrients in the human body.
<bos><start_of_turn>user
Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.

Example 

In [35]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # from 0 to 1 and the lower the value, the more deterministic the text, the higher the value, the more creative
                             do_sample=True, # whether or not to use sampling
                             max_new_tokens=256)

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])
print(f"Query: {query}")
print(f"RAG answer:\m{output_text.replace(prompt, '')}")

Query: Describe the process of digestion and absorption of nutrients in the human body.
RAG answer:\m<bos>Sure, here's a summary of the process of digestion and absorption of nutrients in the human body:

**Step 1: Food intake**
- When we feel hungry, our bodies send a message to our brains that we need food.
- This message triggers the digestive system to start working.
- The digestive process involves two key levels: mechanical and chemical digestion.

**Step 2: Mechanical digestion**
- The mouth, pharynx, and esophagus facilitate food preparation.
- The mouth secretes digestive juices that break down carbohydrates, the pharynx secretes mucus to facilitate the passage of food, and the esophagus pushes food from the mouth to the stomach.

**Step 3: Chemical digestion**
- The stomach contains specialized enzymes that break down proteins into smaller peptides.
- The small intestine contains even more powerful enzymes that break down carbohydrates, fats, and proteins into smaller molecul

### Funtioning the LLM model

In [38]:
def ask(query: str,
        temperature: float=0.7,
        max_new_tokens:int=256,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """

    # RETRIEVAL
    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_chunks(query=query,
                                                  embeddings=embeddings)

    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu()

    # AUGMENTATION
    # Create the prompt and format it with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # GENERATION
    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Decode the tokens into text
    output_text = tokenizer.decode(outputs[0])

    # Format the answer
    if format_answer_text:
        # Replace prompt and special tokens
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "")

    # Only return the answer without context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [39]:
query = random.choice(query_list)
print(f"Query: {query}")
ask(query=query,
    temperature=0.2,
    return_answer_only=False)

Query: Describe the process of digestion and absorption of nutrients in the human body.


("Sure, here's a summary of the process of digestion and absorption of nutrients in the human body:\n\n**The process of digestion begins even before you put food into your mouth.** When you feel hungry, your body sends a message to your brain that it is time to eat. This triggers the release of digestive hormones from the pituitary gland, which stimulate the release of digestive enzymes from the salivary glands, pancreas, and liver.\n\n**The digestive system is one of the eleven organ systems of the human body.** It is composed of several hollow tube-shaped organs including the mouth, pharynx, esophagus, stomach, small intestine, large intestine (colon), rectum, and anus.\n\n**The digestive system functions on two levels:** mechanically to move and mix ingested food and chemically to break down large molecules. The smaller nutrient molecules can then be absorbed and processed by cells throughout the body for energy or used as building blocks for new cells.\n\n**The process of digestion