# Import Libraries and files 

In [1]:
# Basic Libraries
import os
import pandas as pd
import numpy as np

# Pre-processing
import fitz # install PyMuPDF
import requests
from tqdm import tqdm
from spacy.lang.en import English 
import re

# Embeddings
from sentence_transformers import util, SentenceTransformer

# RAG
import torch
import textwrap

In [2]:
# Get PDF document path
filename = "human-nutrition-text.pdf"

# Download PDF
if not os.path.exists(filename):
    print("File doesn't exist, downloading...")
    
    # The URL of the PDF you want to download
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # Send a GET request to the url
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Open the file and save it
        with open(filename,"wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been download and saved as {filename}")
    else: print(f"[INFO] Failed to download the file. Status Code: {response.status_code}")
        
else:
  print(f"File {filename} exists.")

File human-nutrition-text.pdf exists.


# Read Files

In [3]:
# Format text read from PDF
def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n"," ").strip()
    return cleaned_text

# Provide pdf name
def open_and_read_pdf(filename: str)->list[dict]:
    doc = fitz.open(filename)
    pages_and_texts=[]
    for page_number,page in tqdm(enumerate(doc)):
        text = page.get_text()
        formatted_text = text_formatter(text)

        # store pages information and texts in a dictionary
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                        "page_char_count": len(text),
                        "page_word_count": len(text.split(" ")),
                        "page_sentence_count_raw": len(text.split(". ")),
                        # "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                        "text": text})
        
    return pages_and_texts

def read_pages(pages,page_number:int):
    # insert pages_and_text and page number to retrieve the specified page in the pdf
    content  = pages[page_number+41]
    return content

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Text Processing

In [4]:
def text_processor(pages_and_texts,num_sentence_chunk_size=10,min_token_length=30):
    nlp = English()
    # Add a sentencizer pipeline
    nlp.add_pipe("sentencizer")

    # Sentenciaing texts on each page to sentences
    for item in tqdm(pages_and_texts):
        item["sentences"] = list(nlp(item["text"]).sents)
        
        # Make sure all sentences are strings
        item["sentences"] = [str(sentence) for sentence in item["sentences"]]
        
        # Count the sentences 
        item["page_sentence_count_spacy"] = len(item["sentences"])

    # Loop through pages and texts and split sentences into chunks
    for item in tqdm(pages_and_texts):
        item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                            slice_size=num_sentence_chunk_size)
        item["num_chunks"] = len(item["sentence_chunks"])

    # Split each chunk into its own item
    pages_and_chunks = []
    for item in tqdm(pages_and_texts):
        for sentence_chunk in item["sentence_chunks"]:
            chunk_dict = {}
            chunk_dict["page_number"] = item["page_number"]
            
            # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
            joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
            joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
            chunk_dict["sentence_chunk"] = joined_sentence_chunk

            # Get stats about the chunk
            chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
            chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
            chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
            
            pages_and_chunks.append(chunk_dict)

    df = pd.DataFrame(pages_and_chunks)
    pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
    return pages_and_chunks, pages_and_chunks_over_min_token_len

# Embedding

In [5]:
def embedding(texts,model_name="all-mpnet-base-v2",device="cuda"):

    # use "all-mpnet-base-v2"
    embedding_model = SentenceTransformer(model_name_or_path=model_name, 
                                        device=device)
    # # Send the model to the GPU
    # embedding_model.to("cuda") 

    # Create embeddings one by one on the GPU
    # pages_and_chunks_over_min_token_len
    for item in tqdm(texts):
        item["embedding"] = embedding_model.encode(item["sentence_chunk"])

    # # Turn text chunks into a single list
    # text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

    # # Embed all texts in batches (z-book: 34.4s)
    # text_chunk_embeddings = embedding_model.encode(text_chunks,
    #                                                batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
    #                                                convert_to_tensor=True) # optional to return embeddings as tensor instead of array

    # Save embeddings to file
    text_chunks_and_embeddings_df = pd.DataFrame(texts)
    embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
    text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)
    return text_chunks_and_embeddings_df

In [12]:
# Define embedding model
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device='cuda')

In [6]:
def read_embedding_from_file(file='text_chunks_and_embeddings_df.csv',device='cuda'):
    # Import texts and embedding df
    text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")
    # Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
    text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

    # Convert texts and embedding df to list of dicts
    pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

    # Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
    embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
    return pages_and_chunks,embeddings

# Retrieval

In [7]:
# Define helper function to print wrapped text 
def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [10]:
def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)

In [13]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Pass user query into the function,
    Embeds a query with model and returns top k (default=5) scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores based on embeddings passed into the function
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    """
    Return top 5 results based on dot product scores    
    """
    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

# Generation

In [8]:
# Pre-processing
pages_and_texts = open_and_read_pdf("human-nutrition-text.pdf")
pages_and_chunks,pages_and_chunks_over_min_token_len = text_processor(pages_and_texts)
read_pages(pages_and_chunks_over_min_token_len,258)

1208it [00:01, 783.56it/s]
100%|██████████| 1208/1208 [00:02<00:00, 571.75it/s]
100%|██████████| 1208/1208 [00:00<?, ?it/s]
100%|██████████| 1208/1208 [00:00<00:00, 38714.48it/s]


{'page_number': 196,
 'sentence_chunk': 'Potassium also is involved in protein synthesis, energy metabolism, \nand platelet function, and acts as a buffer in blood, playing a role in \nacid-base balance.\nImbalances of Potassium \nInsufficient potassium levels in the body (hypokalemia) can be \ncaused by a low dietary intake of potassium or by high sodium \nintakes, but more commonly it results from medications that \nincrease water excretion, mainly diuretics. The signs and symptoms \nof hypokalemia are related to the functions of potassium in nerve \ncells and consequently skeletal and smooth-muscle contraction.\nThe signs and symptoms include muscle weakness and cramps, \nrespiratory distress, and constipation. Severe potassium depletion \ncan cause the heart to have abnormal contractions and can even \nbe fatal. High levels of potassium in the blood, or hyperkalemia, \nalso affects the heart. It is a silent condition as it often displays \nno signs or symptoms. Extremely high level

In [9]:
# Read embedding from csv file
pages_and_chunks,embeddings = read_embedding_from_file()

### Checking local GPU memory availability

Let's find out what hardware we've got available and see what kind of model(s) we'll be able to load.

> **Note:** You can also check this with the `!nvidia-smi` command.

In [16]:
    # Get GPU available memory
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 4 GB
