Document Preprocessing and Embedding

In [2]:
import os 
import requests
pdf_path = "human-nutrition-text.pdf"

# Download the textbook PDF only when there is no local copy, so re-running
# the notebook does not hit the network again.
if not os.path.exists(pdf_path):
    print("[Info] File doesn't exist , downloading ...")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    file_name = pdf_path
    # Without a timeout, requests waits indefinitely on a stalled connection
    # and the cell would hang the kernel.
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        # Binary mode: the payload is a PDF, not text.
        with open(file_name, "wb") as file:
            file.write(response.content)
        print(f"[INFO] the file has been downloaded and saved as {file_name}")
    else:
        # Best-effort: report the HTTP status instead of raising.
        print(f"[INFO] Failed to download the file . Status Code : {response.status_code}")
else:
    print("[INFO] file already exists")

[INFO] file already exists


In [3]:
import fitz
from tqdm import tqdm

def text_formatter(text: str) -> str:
    """Flatten a block of text onto one line.

    Every newline character becomes a single space, then leading and
    trailing whitespace is trimmed. Runs of spaces inside the text are
    left as they are.
    """
    return " ".join(text.split("\n")).strip()
def open_read_pdf(pdf_path:str)->list[dict] : 
    """Read a PDF page by page and collect its text with rough size statistics.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        One dict per page, in document order, with the keys
        ``page_number`` (0-based index from ``enumerate``),
        ``page_char_count``, ``page_word_count``, ``page_sentence_count``,
        ``page_token_count`` and ``text`` (the page text after
        ``text_formatter``).

    Note:
        The word and sentence counts are plain ``str.split`` lengths, so an
        empty page still reports 1 for both, and consecutive spaces inflate
        the word count. They are meant for exploratory analysis only.
    """
    pages_and_texts = []
    # The context manager closes the document handle even if extraction
    # raises part-way through.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count
        # explicitly to get a real progress bar with percentage and ETA.
        for page_number, page in tqdm(enumerate(doc), total=doc.page_count):
            text = text_formatter(page.get_text())
            pages_and_texts.append({"page_number" : page_number,
                                    "page_char_count" : len(text),                ## for exploratory data analysis
                                    "page_word_count" : len(text.split(" ")),     ## for exploratory data analysis
                                    "page_sentence_count" : len(text.split(".")), ## for exploratory data analysis
                                    "page_token_count" : len(text)/4,             ## rough estimate: ~4 chars per token
                                    "text":text})

    return pages_and_texts
# Parse the whole PDF once; the resulting list of per-page dicts is reused by the cells below.
pages_and_texts = open_read_pdf(pdf_path)
# Peek at a single page record (index 50) as a sanity check.
pages_and_texts[50]

1208it [00:01, 688.76it/s]


{'page_number': 50,
 'page_char_count': 1320,
 'page_word_count': 215,
 'page_sentence_count': 4,
 'page_token_count': 330.0,
 'text': 'Minerals  Major Functions  Macro  Sodium  Fluid balance, nerve transmission, muscle contraction  Chloride  Fluid balance, stomach acid production  Potassium  Fluid balance, nerve transmission, muscle contraction  Calcium  Bone and teeth health maintenance, nerve transmission,  muscle contraction, blood clotting  Phosphorus  Bone and teeth health maintenance, acid-base balance  Magnesium  Protein production, nerve transmission, muscle  contraction  Sulfur  Protein production  Trace  Iron  Carries oxygen, assists in energy production  Zinc  Protein and DNA production, wound healing, growth,  immune system function  Iodine  Thyroid hormone production, growth, metabolism  Selenium  Antioxidant  Copper  Coenzyme, iron metabolism  Manganese  Coenzyme  Fluoride  Bone and teeth health maintenance, tooth decay  prevention  Chromium  Assists insulin in glucose m

In [4]:
import pandas as pd 
# One row per page; the keys of each page dict become the columns.
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,0,29,4,1,7.25,Human Nutrition: 2020 Edition
1,1,0,1,1,0.0,
2,2,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,3,212,32,3,53.0,Human Nutrition: 2020 Edition by University of...
4,4,797,145,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
# Summary statistics of the numeric per-page counts, rounded to whole numbers.
df.describe().round()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.0,1148.0,198.0,14.0,287.0
std,349.0,560.0,96.0,10.0,140.0
min,0.0,0.0,1.0,1.0,0.0
25%,302.0,762.0,134.0,8.0,190.0
50%,604.0,1232.0,214.0,13.0,308.0
75%,905.0,1604.0,271.0,19.0,401.0
max,1207.0,2308.0,429.0,82.0,577.0


Splitting pages into sentences

In [6]:
from spacy.lang.en import English
## Instantiate spaCy's English language class; `nlp` is reused by the sentence-splitting cell below.
nlp = English()
## Add spaCy's sentencizer component so the pipeline can split text into sentences
## (docs: https://spacy.io/api/sentencizer)
sentencizer = nlp.add_pipe("sentencizer")
## Small demo document to check that sentence splitting works as expected
doc = nlp("this is a senteence . this is another one . A third one !")
assert len(list(doc.sents)) == 3
list(doc.sents)

[this is a senteence ., this is another one ., A third one !]

In [7]:
# Show the first page record before sentence splitting adds new keys to it.
pages_and_texts[0]

{'page_number': 0,
 'page_char_count': 29,
 'page_word_count': 4,
 'page_sentence_count': 1,
 'page_token_count': 7.25,
 'text': 'Human Nutrition: 2020 Edition'}

In [8]:
for item in tqdm(pages_and_texts):
    # Run the page text through the sentencizer pipeline and keep each
    # sentence as a plain string instead of a spaCy object.
    item["sentences"] = [str(span) for span in nlp(item["text"]).sents]
    # Sentence count according to spaCy, stored next to the naive "."-split count.
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

100%|██████████| 1208/1208 [00:03<00:00, 383.54it/s]


In [9]:
# Inspect one page record (index 1098) after sentence splitting.
pages_and_texts[1098]

{'page_number': 1098,
 'page_char_count': 1841,
 'page_word_count': 307,
 'page_sentence_count': 26,
 'page_token_count': 460.25,
 'text': 'harmful microorganisms that can cause foodborne illnesses.  Therefore, people who primarily eat raw foods should thoroughly  clean all fruit and vegetables before eating them. Poultry and other  meats should always be cooked before eating.12  Vegetarian and Vegan Diets  Vegetarian and vegan diets have been followed for thousands of  years for different reasons, including as part of a spiritual practice,  to show respect for living things, for health reasons, or because of  environmental concerns. For many people, being a vegetarian is a  logical outgrowth of “thinking green.” A meat-based food system  requires more energy, land, and water resources than a plant-based  food system. This may suggest that the plant-based diet is more  sustainable than the average meat-based diet in the U.S.By avoiding  animal flesh, vegetarians hope to look after thei