In [2]:
import os
import requests

# Get PDF document path
pdf_path = "In_human_anat_final.pdf"

In [3]:
import fitz
from tqdm.auto import tqdm

pdf_path = "ln_human_anat_final.pdf"

def text_formatter(text: str) -> str: 
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()

    # Potentially more text formatting functions can go here
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = [] 
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number-10,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_setence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~4 characters
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -10,
  'page_char_count': 333,
  'page_word_count': 84,
  'page_setence_count_raw': 1,
  'page_token_count': 83.25,
  'text': 'LECTURE NOTES    For Nursing Students           Human Anatomy  and Physiology                  Nega Assefa  Alemaya University  Yosief Tsige  Jimma University      In collaboration with the Ethiopia Public Health Training Initiative, The Carter Center,  the Ethiopia Ministry of Health, and the Ethiopia Ministry of Education    2003'},
 {'page_number': -9,
  'page_char_count': 1267,
  'page_word_count': 234,
  'page_setence_count_raw': 9,
  'page_token_count': 316.75,
  'text': 'Funded under USAID Cooperative Agreement No. 663-A-00-00-0358-00.      Produced in collaboration with the Ethiopia Public Health Training Initiative, The Carter  Center, the Ethiopia Ministry of Health, and the Ethiopia Ministry of Education.                  Important Guidelines for Printing and Photocopying  Limited permission is granted free of charge to print or phot

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()


Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count,text
0,-10,333,84,1,83.25,LECTURE NOTES For Nursing Students ...
1,-9,1267,234,9,316.75,Funded under USAID Cooperative Agreement No. 6...
2,-8,1093,203,10,273.25,Human Anatomy and Physiology i Preface Th...
3,-7,306,62,3,76.5,Human Anatomy and Physiology ii - Review qu...
4,-6,659,112,5,164.75,Human Anatomy and Physiology iii Acknowledgm...


In [7]:
from spacy.lang.en import English

nlp = English()
 
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1eedf7df190>

In [8]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings (the default type is a spaCy datatype)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/428 [00:00<?, ?it/s]

In [9]:
import random
random.sample(pages_and_texts, k=1)

[{'page_number': 290,
  'page_char_count': 1146,
  'page_word_count': 230,
  'page_setence_count_raw': 24,
  'page_token_count': 286.5,
  'text': 'Human Anatomy and Physiology  291    Review Questions    1. How does the color of blood vary with the amount of  oxygenation?  2. Name the three main purposes of blood.  3. Name the two prime components of blood.  4. Name four main ingredients of blood plasma. What are their  Purposes?  5. What is the main function of erythrocytes? Leukocytes?  Platelets?   6. What are the names usually given to the four main blood  groups? What determines the different groupings?  7. Describe the three basic steps involved in the clotting  process.  8. What are the three layers of the heart wall?  9. What is a partition in the heart called? Name two.  10. Name the chambers of the heart and tell what each does.  11. Name the valves of the heart and explain the purpose of  each valve.  12. Why the myocardium need its own blood supply? Name the  arteries that 

In [10]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count,page_sentence_count_spacy
count,428.0,428.0,428.0,428.0,428.0,428.0
mean,203.5,972.77,191.65,8.77,243.19,8.7
std,123.7,370.1,66.41,5.35,92.52,5.45
min,-10.0,52.0,13.0,1.0,13.0,1.0
25%,96.75,695.5,151.0,4.0,173.88,4.0
50%,203.5,1067.0,201.0,9.0,266.75,9.0
75%,310.25,1268.25,235.25,12.0,317.06,12.0
max,417.0,1822.0,795.0,31.0,455.5,35.0


In [11]:
num_chunk_list_size = 10

def chunk_text(input_text: list[str], chunk_size: int= num_chunk_list_size) -> list[list[str]]:
    return [input_text[i: i+chunk_size] for i in range(0, len(input_text), chunk_size)]

In [12]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = chunk_text(item["sentences"])
    item["sentence_chunks_num"] = len(item["sentence_chunks"])

  0%|          | 0/428 [00:00<?, ?it/s]

In [13]:
rnd = random.sample(pages_and_texts, k=1)
rnd_chunk = rnd[0]["sentence_chunks"]
rnd_chunk


[['Human Anatomy and Physiology  323    The Small Intestine   The small intestine is the longest part of the digestive tract.',
  'It  is known as the small intestine because, although it is longer  than the large intestine, it is smaller in diameter, with an  average width of about 2.5 cm (1 inch).',
  'When relaxed to its  full length, the small intestine is about 6 m (20 feet) long.',
  'The  first 25 cm (10 inches) or so of the small intestine make up the  duodenum.',
  'Beyond the duodenum are two more divisions:  the jejunum, which forms the next two fifths of the small  intestine, and the ileum, which constitutes the remaining  portion.',
  '    The wall of the duodenum contains glands that secrete large  amounts or mucus to protect the small intestine from the  strongly acid chyme entering from the stomach.',
  'Cells of the  small intestine also secrete enzymes that digest proteins and  carbohydrates.',
  'In addition, digestive juices from the liver and  pancreas enter the sm

In [14]:
import re

page_chunks = []

for item in tqdm(pages_and_texts):
    for sentence in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_no"] = item["page_number"]

        joined_sentence = "".join(sentence).replace("  "," ").strip()
        joined_sentence = re.sub(r'\.([A-Z])', r'. \1', joined_sentence)

        chunk_dict["chunk_sentences"] = joined_sentence

        chunk_dict["character_count"] = len(joined_sentence)
        chunk_dict["word_count"] = len([word for word in joined_sentence.split(" ")])
        chunk_dict["token_count"] = len(joined_sentence)/4

        page_chunks.append(chunk_dict)

  0%|          | 0/428 [00:00<?, ?it/s]

In [15]:
random.sample(page_chunks, k=1)

[{'page_no': 79,
  'chunk_sentences': 'Human Anatomy and Physiology 80              Figure: 5.8 Skull anterior views, and the orbital cavity (source: Carola, R., Harley,J. P., Noback R. C., (1992), Human anatomy and physiology, Mc Graw hill inc, New York, 2 nd ed, pp 173)  The vertebral column  The vertebral column together with the sternum & ribs constitutes the skeleton of the trunk of the body. It composes 2/5th of the height of the body and has average length in male of 71 c.m.and in female 61 c.m. The adult vertebral column contains 26 vertebras. Prior to fusion of sacral & coccygeal vertebrae the total number is 33. It is a strong and flexible to either direction & rotated on it self. Encloses & protect spinal cord, supports the head and serves as a point of attachment for the ribs & muscles of the back.',
  'character_count': 782,
  'word_count': 153,
  'token_count': 195.5}]

In [16]:
df = pd.DataFrame(page_chunks)
min_token= 30
pages_chunks_over_min_token = df[df["token_count"] > min_token].to_dict(orient="records")
pages_chunks_over_min_token[:2]

[{'page_no': -10,
  'chunk_sentences': 'LECTURE NOTES  For Nursing Students      Human Anatomy and Physiology         Nega Assefa Alemaya University Yosief Tsige Jimma University   In collaboration with the Ethiopia Public Health Training Initiative, The Carter Center, the Ethiopia Ministry of Health, and the Ethiopia Ministry of Education  2003',
  'character_count': 307,
  'word_count': 58,
  'token_count': 76.75},
 {'page_no': -9,
  'chunk_sentences': 'Funded under USAID Cooperative Agreement No.663-A-00-00-0358-00.   Produced in collaboration with the Ethiopia Public Health Training Initiative, The Carter Center, the Ethiopia Ministry of Health, and the Ethiopia Ministry of Education.         Important Guidelines for Printing and Photocopying Limited permission is granted free of charge to print or photocopy all pages of this publication for educational, not-for-profit use by health care workers, students or faculty. All copies must retain all author credits and copyright notices i

In [22]:
text_chunks = [item["chunk_sentences"] for item in pages_chunks_over_min_token]
text_chunks

['LECTURE NOTES  For Nursing Students      Human Anatomy and Physiology         Nega Assefa Alemaya University Yosief Tsige Jimma University   In collaboration with the Ethiopia Public Health Training Initiative, The Carter Center, the Ethiopia Ministry of Health, and the Ethiopia Ministry of Education  2003',
 'Funded under USAID Cooperative Agreement No.663-A-00-00-0358-00.   Produced in collaboration with the Ethiopia Public Health Training Initiative, The Carter Center, the Ethiopia Ministry of Health, and the Ethiopia Ministry of Education.         Important Guidelines for Printing and Photocopying Limited permission is granted free of charge to print or photocopy all pages of this publication for educational, not-for-profit use by health care workers, students or faculty. All copies must retain all author credits and copyright notices included in the original document. Under no circumstances is it permissible to sell or distribute on a commercial basis, or to claim authorship of,

In [20]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

model.to("cuda")

text_chunk_embeddings = model.encode(text_chunks, batch_size=32, convert_to_tensor=True)



In [24]:
len(text_chunk_embeddings), len(text_chunks)

(559, 559)

In [19]:
import torch
print(torch.cuda.is_available())  

True
