In [2]:
import os
import pymupdf
import fitz
from PIL import Image
from IPython.display import display
import io
import re
import unicodedata
import pickle

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [4]:
#add a single document to database
#add a batch of documents to database
#retrieve name of documents in database

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # Try 500–1000 for LLM input
    chunk_overlap=200,     # Adds context to each chunk
    separators=["\n\n", "\n", ".", " ", ""]
)

vector_model = SentenceTransformer('all-MiniLM-L6-v2')

In [35]:
document_dir = 'documents/'
document_title = 'lamarsh_baratta-introduction_to_nuclear_engineering_textbook_3rd_edition.pdf'
#document_title = 'Nuclear Energy - R.Murray (2000).pdf'
test_doc = os.path.join(document_dir, document_title)

In [36]:
#1 extract text
text_list = []
pix_list = []

with fitz.open(test_doc) as doc:
    metadata =  doc.metadata
    
    for page in doc:
        text_list.append(page.get_text())
        #pix_list.append(page.get_pixmap())

In [6]:
def clean_pdf_text(raw_text):
    text = raw_text

    # 1. Normalize Unicode characters (e.g., accented letters to canonical form)
    text = unicodedata.normalize("NFKC", text)

    # 2. Replace common misencoded ligatures, symbols, and PDF artifacts
    replacements = {
        "ﬁ": "fi", "ﬂ": "fl", "ﬀ": "ff", "ﬃ": "ffi", "ﬄ": "ffl",
        "“": '"', "”": '"', "‘": "'", "’": "'",
        "—": "-", "–": "-", "•": "*",
        "…": "...", "±": "+/-",
        "µ": "micro",
        "′": "'", "″": '"',
        "fJ": "β", "y": "γ", "l0": "10",
        "\u00A0": " ",  # Non-breaking space
    }
    for k, v in replacements.items():
        text = text.replace(k, v)

    # 3. Remove control characters and invisible junk
    text = re.sub(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]", "", text)

    # 4. Fix broken words split across lines (hyphen + newline)
    text = re.sub(r'\xad\n', '', text)    # Soft hyphen + newline
    text = re.sub(r'-\n', '', text)       # Regular hyphen + newline
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  # Single newlines → space

    # 5. Collapse multiple spaces and tabs
    text = re.sub(r'[ \t]+', ' ', text)

    # 6. Collapse multiple newlines (paragraph breaks)
    text = re.sub(r'\n{2,}', '\n\n', text)

    # 7. Remove page numbers (common pattern: digits alone on a line)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # 8. Remove header/footer repeated lines (if detected)
    lines = text.splitlines()
    if len(lines) > 2:
        # Naive dedupe of lines that appear too often
        from collections import Counter
        line_counts = Counter(lines)
        text = "\n".join(
            line for line in lines if line_counts[line] < len(lines) * 0.5
        )

    # 9. Strip leading/trailing spaces on each line
    text = "\n".join(line.strip() for line in text.splitlines())

    # 10. Final cleanup: remove extra blank lines at start/end
    text = text.strip()

    return text

In [7]:
# clean text
text_list[105]
cleaned_text = clean_pdf_text(text_list[105])
cleaned_text

NameError: name 'text_list' is not defined

In [157]:
#chunk text by page

In [8]:
#build dictionary of chunks
def save_vectors(text_list, document_title, folder_path):
    vector_library = os.path.join(folder_path, "library.pkl")
    
    if os.path.exists(vector_library):
        with open(vector_library, "rb") as f:
            existing_library = pickle.load(f)

    else:
        existing_library = []

    existing_text = [entry['text'] for entry in existing_library]

    book = []
    chunk_count = 0

    for i, page in enumerate(text_list):  
        cleaned_page = clean_pdf_text(page)
        page_chunks = text_splitter.split_text(cleaned_page)  
    
        for j, chunk in enumerate(page_chunks):
        
            if chunk in existing_text:
                continue
        
            chunk_count += 1
            vector = vector_model.encode(chunk)
        
            entry = {
                "title" : document_title,
                "chunk_number" : chunk_count,
                "page" : i,
                "text" : chunk,
                "vector" : vector
            }

            book.append(entry)

    updated_library = existing_library + book

    with open(vector_library, "wb") as f:
        pickle.dump(updated_library, f)

In [9]:
def return_titles(library):
    unique_titles = list(set(item['title'] for item in library))
    return unique_titles

In [10]:
def add_pdf(pdf_path):
    text_list = []

    with fitz.open(pdf_path) as doc:
    
        for page in doc:
            text_list.append(page.get_text())

    document_title = os.path.basename(pdf_path)
    save_vectors(text_list, document_title)

In [11]:
def add_set(folder_path):
    
    for document_title in os.listdir(folder_path):
        document_path = os.path.join(folder_path, document_title)

        text_list = []
        
        with fitz.open(document_path) as doc:

            for page in doc:
                text_list.append(page.get_text())

        save_vectors(text_list, document_title, folder_path)

In [12]:
add_set('documents/unstructured/world_of_warcraft')

In [13]:
with open("documents/unstructured/world_of_warcraft/library.pkl", "rb") as f:
    library = pickle.load(f)

In [14]:
return_titles(library)

['World of Warcraft 2nd Edition.pdf']

In [15]:
len(library)

2185