### Import PDF Document

In [1]:
import os
import requests

#get PDF document path
pdf_path = "human-nutrition-text.pdf"

#download pdf
if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist, downloading...")

    #enter URL of pdf
    url="https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    #The local file name to save file
    filename = pdf_path

    #send a GET request to URL
    response = requests.get(url)

    #check if req successful
    if response.status_code == 200:
        #open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] the file has been downloaded and saved as {filename}")
    else:
        print(f"[INFO] failed to download file")
else:
    print(f"File {pdf_path} exists")

File human-nutrition-text.pdf exists


Opening pdf using pymupdf library

In [2]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str)->str:
    """minor text formatting"""
    cleaned_text = text.replace("\n", " ").strip()

    return cleaned_text
    
def open_and_read_pdf(pdf_path: str)->list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number - 41,
                               "page_char_count": len(text),
                               "word_count": len(text.split(" ")),
                               "page_sentence_count_raw": len(text.split(". ")),
                               "page_token_count": len(text)/4,
                               "text": text
                              })
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 1130,
  'page_char_count': 781,
  'word_count': 126,
  'page_sentence_count_raw': 5,
  'page_token_count': 195.25,
  'text': 'Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.  These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=568    1130  |  Undernutrition, Overnutrition, and Malnutrition'},
 {'page_number': 787,
  'page_char_count': 1366,
  'word_count': 260,
  

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


In [6]:
#split pages into sentences.
from spacy.lang.en import English

nlp = English()

#add a sentencizer pipeline
nlp.add_pipe("sentencizer")

#create a document instance
doc = nlp("This is a sentence. This is another sentence. I love Tharuniyaa.")

assert len(list(doc.sents)) == 3

list(doc.sents)

[This is a sentence., This is another sentence., I love Tharuniyaa.]

In [7]:
pages_and_texts[69]

{'page_number': 28,
 'page_char_count': 966,
 'word_count': 163,
 'page_sentence_count_raw': 11,
 'page_token_count': 241.5,
 'text': '3. Ovo-vegetarian. This type of vegetarian diet includes eggs but  not dairy products.  4. Vegan. This type of vegetarian diet does not include dairy,  eggs, or any type of animal product or animal by-product.  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.  These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this version of the text. You can  view it onl

In [8]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [9]:
random.sample(pages_and_texts, k = 1)

[{'page_number': 881,
  'page_char_count': 1666,
  'word_count': 286,
  'page_sentence_count_raw': 21,
  'page_token_count': 416.5,
  'text': 'example. Children who suffer from this condition experience an  adverse reaction to the lactose in milk products. It is a result of  the small intestine’s inability to produce enough of the enzyme  lactase, which is produced by the small intestine. Symptoms of  lactose intolerance usually affect the GI tract and can include  bloating, abdominal pain, gas, nausea, and diarrhea. An intolerance  is best managed by making dietary changes and avoiding any foods  that trigger the reaction.8  The Threat of Lead Toxicity  There is a danger of lead toxicity, or lead poisoning, among school- aged children. Lead is found in plumbing in old homes, in lead-based  paint, and occasionally in the soil. Contaminated food and water can  increase exposure and result in hazardous lead levels in the blood.  Children under age six are especially vulnerable. They may 

In [10]:
df = pd.DataFrame(pages_and_texts)
df.describe().round()

Unnamed: 0,page_number,page_char_count,word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.0,1148.0,199.0,11.0,287.0,10.0
std,349.0,560.0,96.0,7.0,140.0,6.0
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,261.0,762.0,134.0,5.0,190.0,5.0
50%,562.0,1232.0,216.0,10.0,308.0,10.0
75%,864.0,1604.0,272.0,15.0,401.0,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


In [11]:
### chunking text
num_sentences_chunk_size = int(df["page_sentence_count_spacy"].mean())

#recursively split lists of texts to chunk sized lists

def split_list(input_list: list[str],
               slice_size: int)-> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))

split_list(test_list, num_sentences_chunk_size)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [12]:
#loop through pages and texts and split sentences into chunks

for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list = item["sentences"], slice_size = num_sentences_chunk_size)

    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
random.sample(pages_and_texts, k=1)

[{'page_number': 88,
  'page_char_count': 1681,
  'word_count': 300,
  'page_sentence_count_raw': 13,
  'page_token_count': 420.25,
  'text': 'the iron-containing hemoglobin molecule in red blood cells serves  as the oxygen carrier.  Wastes Out  In the metabolism of macronutrients to energy, cells produce the  waste products carbon dioxide and water. As blood travels through  smaller and smaller vessels, the rate of blood flow is dramatically  reduced, allowing for efficient exchange of nutrients and oxygen for  cellular waste products through tiny capillaries. The kidneys remove  any excess water from the blood, and blood delivers the carbon  dioxide to the lungs where it is exhaled. Also, the liver produces  the waste product urea from the breakdown of amino acids and  detoxifies many harmful substances, all of which require transport  in the blood to the kidneys for excretion.  All for One, One for All  The eleven organ systems in the body completely depend on each  other for contin

In [14]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


### Splitting each chunk into it's own item

In [15]:
import re

#split each chunk into it's own item

pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item['sentence_chunks']:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        #join the sentences together into a paragraph-like structure

        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r' \1', joined_sentence_chunk)


        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)
'''
pages_and_chunks = []

for item in tqdm(pages_and_texts):
    current_chunk = ""
    current_chunk_token_count = 0
    current_chunk_info = {"page_number": item["page_number"],
                         "sentence_chunk": "",
                         "chunk_char_count": 0,
                         "chunk_word_count": 0,
                         "chunk_token_count": 0}
    for sentence_chunk in item["sentence_chunks"]:
        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r' \1', joined_sentence_chunk)
        chunk_token_count = len(joined_sentence_chunk) / 4

        if current_chunk_token_count + chunk_token_count > 384:
            current_chunk_info["sentence_chunk"] = current_chunk.strip()
            current_chunk_info["chunk_char_count"] = len(current_chunk)
            current_chunk_info["chunk_word_count"] = len(current_chunk.split(" "))
            current_chunk_info["chunk_token_count"] = current_chunk_token_count
            pages_and_chunks.append(current_chunk_info)

            current_chunk = joined_sentence_chunk
            current_chunk_token_count = chunk_token_count
            current_chunk_info  = {"page_number": item["page_number"],
                         "sentence_chunk": "",
                         "chunk_char_count": 0,
                         "chunk_word_count": 0,
                         "chunk_token_count": 0}
        else:
            current_chunk += " " + joined_sentence_chunk
            current_chunk_token_count += chunk_token_count

    if current_chunk:
        current_chunk_info["sentence_chunk"] = current_chunk.strip()
        current_chunk_info["chunk_char_count"] = len(current_chunk)
        current_chunk_info["chunk_word_count"] = len(current_chunk.split(" "))
        current_chunk_info["chunk_token_count"] = current_chunk_token_count
        pages_and_chunks.append(current_chunk_info)
'''

  0%|          | 0/1208 [00:00<?, ?it/s]

'\npages_and_chunks = []\n\nfor item in tqdm(pages_and_texts):\n    current_chunk = ""\n    current_chunk_token_count = 0\n    current_chunk_info = {"page_number": item["page_number"],\n                         "sentence_chunk": "",\n                         "chunk_char_count": 0,\n                         "chunk_word_count": 0,\n                         "chunk_token_count": 0}\n    for sentence_chunk in item["sentence_chunks"]:\n        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()\n        joined_sentence_chunk = re.sub(r\'\\.([A-Z])\', r\' \x01\', joined_sentence_chunk)\n        chunk_token_count = len(joined_sentence_chunk) / 4\n\n        if current_chunk_token_count + chunk_token_count > 384:\n            current_chunk_info["sentence_chunk"] = current_chunk.strip()\n            current_chunk_info["chunk_char_count"] = len(current_chunk)\n            current_chunk_info["chunk_word_count"] = len(current_chunk.split(" "))\n            current_chunk_info["c

In [16]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 758,
  'sentence_chunk': 'delivery of regionally grown farm produce to community institutions, farmers markets, and individuals. 4 Encourage worksites, medical centers, universities, and other community and business establishments to serve more fruits and vegetables in cafeterias and onsite eateries. 5 Support schools in developing healthy food messages to students by incorporating activities such as gardening into curricula. 6 Encourage the development and support of community and home gardens. 7 Have emergency food programs, including food banks and food rescue programs, increase their supply of fruits and vegetables. The seven strategies developed by the CDC are based on the idea that improving access to and availability of fruits and vegetables will lead to an increase in their consumption.',
  'chunk_char_count': 779,
  'chunk_word_count': 114,
  'chunk_token_count': 194.75}]

In [17]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,730.55,112.74,182.64
std,347.79,445.59,71.24,111.4
min,-41.0,12.0,3.0,3.0
25%,280.5,313.0,45.0,78.25
50%,586.0,743.0,115.0,185.75
75%,890.0,1112.0,173.0,278.0
max,1166.0,1823.0,297.0,455.75


In [18]:
# Show random chunks with under 30 tokens in length
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

chunk token count: 18.75 | Text: Published February 6, 2018. Accessed April 15, 2018. Comparing Diets | 1055
chunk token count: 11.75 | Text: Polan EU, Taylor DR. (2003), 782 | Introduction
chunk token count: 27.0 | Text: view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=165 226 | Popular Beverage Choices
chunk token count: 6.5 | Text: Fat-Soluble Vitamins | 537
chunk token count: 25.0 | Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=521  996 | The Major Types of Foodborne Illness


In [19]:
#filter out df for rows under 30 tokens

pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

### Embedding text chunks

In [20]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                     device="cpu")
# create a list of sentences
sentences = ["Transformers library provides an easy way of embedding.",
            "Sentences can be embedded one by one or in a list.",
            "I like horses!"]

embeddings = embedding_model.encode(sentences)

embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(f'Sentence: {sentence}')
    print(f'Embedding: {embedding}')



Sentence: Transformers library provides an easy way of embedding.
Embedding: [-5.57340011e-02  6.40663179e-03  4.91124252e-03  2.10787468e-02
  2.60457527e-02 -3.93987866e-03  2.07133256e-02  1.94761076e-03
  1.50001450e-02 -5.51366583e-02  2.70182043e-02  6.25496283e-02
 -3.87967750e-02  1.28000220e-02  3.05092148e-02 -6.30558804e-02
  1.99427288e-02  1.75520759e-02 -4.22759317e-02 -2.30977051e-02
 -1.51005005e-02 -3.94923054e-03  7.30106533e-02 -1.42055079e-02
  4.02332330e-03  2.79319994e-02 -4.20499593e-02 -4.00918648e-02
  4.42362614e-02  3.44449957e-03 -1.98470131e-02 -1.96271986e-02
  7.59227425e-02  6.29687356e-03  1.36537187e-06  9.31025203e-03
 -2.30744444e-02  2.87997536e-02  1.91687066e-02  2.41094120e-02
  5.69810271e-02 -8.54929164e-03 -2.02619471e-02  1.32503612e-02
 -9.55634937e-03 -1.86347850e-02  4.25398462e-02  6.16819132e-03
  8.44275653e-02 -3.61137800e-02 -1.27371456e-02 -2.70189848e-02
  7.39516038e-03  1.82170682e-02  8.67250666e-04  2.67479476e-02
 -5.63373370e

In [21]:
embeddings[0].shape

(768,)

In [22]:
embedding = embedding_model.encode("My favourite animal is the cow")
embedding

array([-8.87764711e-03,  8.35835114e-02, -2.81862803e-02, -3.71655039e-02,
        2.18684189e-02,  5.61196283e-02, -7.55760372e-02,  1.01799155e-02,
        1.48691228e-02, -2.08466798e-02, -2.89396364e-02,  4.55776379e-02,
       -3.03733237e-02, -1.41894128e-02, -1.61682982e-02, -3.85182947e-02,
        3.52857485e-02,  5.25924517e-03, -2.22115181e-02,  3.23855244e-02,
       -2.36056726e-02,  4.16592658e-02, -1.16740977e-02, -2.24949457e-02,
       -1.69876087e-02,  8.03107116e-03, -3.82865430e-03, -2.72515453e-02,
        2.71817148e-02,  2.64698267e-02, -6.16850033e-02, -8.03409293e-02,
        2.93563437e-02, -2.60433536e-02,  1.26088594e-06,  7.68714305e-03,
       -2.78717466e-02,  5.80591755e-03,  4.93111610e-02, -4.50255349e-03,
        3.92271951e-03,  1.44862225e-02, -1.33295488e-02,  1.35614304e-02,
        1.89375672e-02,  6.01764061e-02,  4.52522524e-02,  1.80459451e-02,
       -9.52361450e-02,  2.16227174e-02, -3.90326465e-03, -2.29934510e-02,
       -3.01052202e-02, -

In [23]:
%%time

##embedding_model.to("cpu")

##for item in tqdm(pages_and_chunks_over_min_token_len):
    ##item["embedding"] = embedding_model.encode(item["sentence_chunk"])

CPU times: total: 0 ns
Wall time: 0 ns


In [24]:
import torch
print(torch.cuda.is_available())

True


In [26]:
%%time

embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: total: 4min 59s
Wall time: 51.8 s


In [28]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

CPU times: total: 0 ns
Wall time: 0 ns


'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture to their fascinating food creations Adding

In [30]:
%%time

#embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32,
                                              convert_to_tensor=True)
text_chunk_embeddings

CPU times: total: 1min 6s
Wall time: 18 s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0279,  0.0333, -0.0215,  ..., -0.0050,  0.0206,  0.0317],
        ...,
        [ 0.0728,  0.0188, -0.0052,  ..., -0.0439, -0.0649, -0.0209],
        [ 0.0964, -0.0227,  0.0104,  ..., -0.0536, -0.0252, -0.0280],
        [ 0.0417, -0.0102, -0.0227,  ..., -0.0302, -0.0252, -0.0262]],
       device='cuda:0')

In [31]:
pages_and_chunks_over_min_token_len[419]

{'page_number': 277,
 'sentence_chunk': 'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture to

In [32]:
#save embeddings to a file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [33]:
#import saved csv file
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242899e-02 9.02281404e-02 -5.09547861e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156307e-02 5.92139401e-02 -1.66167300e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,765,116,191.25,[ 2.79099271e-02 3.32786553e-02 -2.14719474e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,939,144,234.75,[ 6.63120821e-02 4.21607494e-02 -8.27067625e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264568e-02 -8.49767309e-03 9.57160536e-...


For larger embeddings try using a vector database for storage instead of torch.tensor