In [1]:
# First, ensure you have the necessary library installed. Run this cell.
!pip install PyMuPDF

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Now, run this cell to extract the text from all your PDF files.
import fitz  # This is the PyMuPDF library
import os

# --- Configuration ---
# List of the PDF files in your Jupyter directory
pdf_filenames = [
    "privrulepd.pdf",                          # HIPAA Rule
    "ccpa_statute.pdf",                        # CCPA Statute
    "CELEX_02016R0679-20160504_EN_TXT.pdf"   # GDPR Text
]

# --- Function Definition ---
def extract_text_from_pdf(pdf_path):
    """
    Opens a PDF file and extracts the full text content from all pages.
    """
    try:
        doc = fitz.open(pdf_path)
        full_text = ""
        # Iterate through each page of the PDF
        for page in doc:
            full_text += page.get_text()
        doc.close()
        return full_text
    except Exception as e:
        print(f"Error processing file {pdf_path}: {e}")
        return ""

# --- Main Execution Logic ---
# A dictionary to store the extracted text, with filenames as keys
extracted_texts = {}

print("Starting text extraction process...")

# Loop through each filename, extract text, and store it
for filename in pdf_filenames:
    if os.path.exists(filename):
        print(f"-> Processing: {filename}...")
        extracted_texts[filename] = extract_text_from_pdf(filename)
    else:
        print(f"-> File not found: {filename}. Please check the name.")

print("\nExtraction complete!")

# --- Verification Step ---
# Print the first 500 characters of each extracted document to verify
print("-" * 20)
for filename, text in extracted_texts.items():
    print(f"\n--- Sample from: {filename} ---")
    if text:
        print(text[:500] + "...")
    else:
        print("No text was extracted.")
    print("-" * 20)

Starting text extraction process...
-> Processing: privrulepd.pdf...
MuPDF error: format error: cmsOpenProfileFromMem failed

-> Processing: ccpa_statute.pdf...
-> Processing: CELEX_02016R0679-20160504_EN_TXT.pdf...

Extraction complete!
--------------------

--- Sample from: privrulepd.pdf ---
VerDate Aug<2,>2002 
19:04 Aug 13, 2002
Jkt 197001
PO 00000
Frm 00001
Fmt 4717
Sfmt 4717
E:\FR\FM\14AUR4.SGM
pfrm17
PsN: 14AUR4"1985 National Archives and Records Administration" seal
Wednesday, 
August 14, 2002 
Part V 
Department of 
Health and Human 
Services 
Office of the Secretary 
45 CFR Parts 160 and 164 
Standards for Privacy of Individually 
Identifiable Health Information; Final 
Rule 
 
 
 
VerDate Aug<2,>2002 
19:04 Aug 13, 2002
Jkt 197001
PO 00000
Frm 00002
Fmt 4701
Sfmt 4700
E:\FR...
--------------------

--- Sample from: ccpa_statute.pdf ---
Page 1 of 65 
CALIFORNIA CONSUMER PRIVACY ACT OF 2018 
effective 01/01/2025 – SB 1223, AB 1008, AB 1824 update 
posted to cppa.ca.gov Januar

In [3]:
import re
import json

# --- Cleaning Function Definitions ---

def clean_hipaa_text(text):
    """Applies specific cleaning rules for the HIPAA PDF (privrulepd.pdf)."""
    # Remove Federal Register headers and footers
    text = re.sub(r'Federal Register / Vol\. \d+, No\. \d+ /.+', '', text)
    # Remove garbled extraction metadata (e.g., VerDate, Jkt, Fmt)
    text = re.sub(r'VerDate.+', '', text)
    # Remove page numbers that are on their own line
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # Remove content from the first page's seal and header
    text = re.sub(r'ARCHIVES AND RECORDS.*ADMINISTRATION', '', text, flags=re.DOTALL)
    text = re.sub(r'Wednesday,.*Rule', '', text, flags=re.DOTALL)
    # A more general pattern to catch file metadata lines
    text = re.sub(r'E:\\FR\\FM\\.+', '', text)
    return text

def clean_ccpa_text(text):
    """Applies specific cleaning rules for the CCPA PDF (ccpa_statute.pdf)."""
    # Remove "Page X of 65" footers
    text = re.sub(r'Page \d+ of \d+', '', text)
    # Remove the initial title and contents section
    text = re.sub(r'CALIFORNIA CONSUMER PRIVACY ACT OF 2018.*?(?=1798\.100\.)', '', text, flags=re.DOTALL)
    return text

def clean_gdpr_text(text):
    """Applies specific cleaning rules for the GDPR PDF (CELEX...)."""
    # Remove the disclaimer header on the first page
    text = re.sub(r'This text is meant purely as a documentation tool.*?embedded in this document', '', text, flags=re.DOTALL)
    # Remove regulation code headers
    text = re.sub(r'02016R0679-EN-.*?\d+-\d+', '', text)
    # Remove artifacts like ►B, ▼B, ►C1 etc.
    text = re.sub(r'►[A-Z]\d*|▼[A-Z]\d*', '', text)
    return text

def clean_general_text(text):
    """Applies general cleaning rules to any text."""
    # Normalize whitespace: replace multiple spaces/tabs with a single space
    text = re.sub(r'[ \t]+', ' ', text)
    # Replace multiple newlines with a double newline to preserve paragraph structure
    text = re.sub(r'\n\s*\n', '\n\n', text)
    # Remove leading/trailing whitespace from the whole text
    text = text.strip()
    return text


# --- Main Execution Logic ---
# A dictionary to hold the cleaned text
cleaned_texts = {}

print("Starting text cleaning process...")

# Get the filenames from the previous step
pdf_filenames = list(extracted_texts.keys())

for filename in pdf_filenames:
    raw_text = extracted_texts[filename]
    
    # Apply specific cleaning function based on the filename
    if "privrulepd.pdf" in filename:
        print(f"-> Applying HIPAA cleaning rules for {filename}...")
        cleaned_text = clean_hipaa_text(raw_text)
    elif "ccpa_statute.pdf" in filename:
        print(f"-> Applying CCPA cleaning rules for {filename}...")
        cleaned_text = clean_ccpa_text(raw_text)
    elif "CELEX" in filename: # Using a unique part of the GDPR filename
        print(f"-> Applying GDPR cleaning rules for {filename}...")
        cleaned_text = clean_gdpr_text(raw_text)
    else:
        cleaned_text = raw_text # No specific rules for this file
        
    # Apply general cleaning to the result of the specific cleaning
    cleaned_texts[filename] = clean_general_text(cleaned_text)

print("\nCleaning complete!")


# --- Verification Step ---
# Print the first 1000 characters of each cleaned document to verify the changes
print("-" * 20)
for filename, text in cleaned_texts.items():
    print(f"\n--- Cleaned Sample from: {filename} ---")
    if text:
        print(text[:1000] + "...")
    else:
        print("No text after cleaning.")
    print("-" * 20)

# Optional: Save the cleaned text to files for inspection
for filename, text in cleaned_texts.items():
    output_filename = filename.replace('.pdf', '_cleaned.txt')
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(text)
print("\nCleaned text has been saved to .txt files for your review.")

Starting text cleaning process...
-> Applying HIPAA cleaning rules for privrulepd.pdf...
-> Applying CCPA cleaning rules for ccpa_statute.pdf...
-> Applying GDPR cleaning rules for CELEX_02016R0679-20160504_EN_TXT.pdf...

Cleaning complete!
--------------------

--- Cleaned Sample from: privrulepd.pdf ---
19:04 Aug 13, 2002
Jkt 197001
PO 00000
Frm 00001
Fmt 4717
Sfmt 4717

pfrm17
PsN: 14AUR4"1985 National Archives and Records Administration" seal
. 
List of Subjects 
45 CFR Part 160 
Electronic transactions, Employer 
benefit plan, Health, Health care, Health 
facilities, Health insurance, Health 
records, Medicaid, Medical research, 
Medicare, Privacy, Reporting and record 
keeping requirements. 
45 CFR Part 164 
Electronic transactions, Employer 
benefit plan, Health, Health care, Health 
facilities, Health insurance, Health 
records, Medicaid, Medical research, 
Medicare, Privacy, Reporting and record 
keeping requirements. 
Dated: August 6, 2002. 
Tommy G. Thompson, 
Secretary. 
Fo

In [4]:
# Install the library needed for the text splitter
!pip install langchain

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.76-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.9 (from langchain)
  Downloading langchain_text_splitters-0.3.11-py3-none-any.whl.metadata (1.8 kB)
Collecting langsmith>=0.1.17 (from langchain)
  Downloading langsmith-0.4.27-py3-none-any.whl.metadata (14 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading sqlalchemy-2.0.43-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<1.0.0,>=0.3.72->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson>=3.9.14 (from langsmith>=0.1.17->langchain)


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import fitz  # PyMuPDF
import re
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Step 1: Reload Original Text (to start fresh) ---
pdf_filenames = [
    "privrulepd.pdf",
    "ccpa_statute.pdf",
    "CELEX_02016R0679-20160504_EN_TXT.pdf"
]

extracted_texts = {}
for filename in pdf_filenames:
    doc = fitz.open(filename)
    full_text = ""
    for page in doc:
        full_text += page.get_text()
    doc.close()
    extracted_texts[filename] = full_text

# --- Step 2: Improved, More Aggressive Cleaning ---

def final_clean_text(text, doc_type):
    """Applies a final, robust cleaning pass."""
    
    # General cleaning
    text = re.sub(r'[ \t]+', ' ', text) # Normalize whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text) # Normalize paragraph breaks
    
    if doc_type == 'hipaa':
        # Remove headers, footers, and initial metadata
        text = re.sub(r'Federal Register / Vol\. \d+, No\. \d+ /.+', '', text)
        text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE) # Page numbers
        start_match = re.search(r'I\. Background', text)
        if start_match:
            text = text[start_match.start():]
            
    elif doc_type == 'ccpa':
        # Aggressively remove the entire table of contents
        text = re.sub(r'Contents.*?(?=1798\.100\. General Duties)', '', text, flags=re.DOTALL)
        text = re.sub(r'Page \d+ of \d+', '', text) # Remove page footers
        
    elif doc_type == 'gdpr':
        # Remove disclaimer and document codes
        text = re.sub(r'This text is meant purely as a documentation tool.*?(?=CHAPTER I)', '', text, flags=re.DOTALL)
        text = re.sub(r'L \d+/\d+', '', text)
        text = re.sub(r'[\(][\d]+[\)]', '', text) # remove numbers in brackets
        text = re.sub(r'►[A-Z]\d*|▼[A-Z]\d*', '', text) # Remove artifacts
        
    return text.strip()


# --- Step 3: Robust Chunking and Metadata Extraction ---

heading_patterns = {
    'hipaa': r'^[A-Z]\. .+',
    'ccpa': r'^\d{4}\.\d{3,}\..+',
    'gdpr': r'^Article \d+\n'
}

doc_types = {
    "privrulepd.pdf": 'hipaa',
    "ccpa_statute.pdf": 'ccpa',
    "CELEX_02016R0679-20160504_EN_TXT.pdf": 'gdpr'
}

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    separators=["\n\n", "\n", ". ", " ", ""]
)

final_chunks = []
print("Starting final, refined processing...")

for filename, raw_text in extracted_texts.items():
    doc_type = doc_types[filename]
    
    # Apply final cleaning
    clean_text = final_clean_text(raw_text, doc_type)
    
    # Find all headings
    heading_pattern = heading_patterns[doc_type]
    headings = [match.group(0).strip() for match in re.finditer(heading_pattern, clean_text, re.MULTILINE)]
    
    # Split the document by these headings
    sections = re.split(heading_pattern, clean_text, flags=re.MULTILINE)
    
    # The first element is the text before the first heading
    doc_intro = sections[0].strip()
    if doc_intro:
        intro_chunks = text_splitter.split_text(doc_intro)
        for chunk in intro_chunks:
            final_chunks.append({"text": chunk, "metadata": {"source_document": filename, "heading": "Introduction"}})

    # Process text under each heading
    for i, section_text in enumerate(sections[1:]):
        section_text = section_text.strip()
        if not section_text:
            continue
        
        heading = headings[i]
        chunks = text_splitter.split_text(section_text)
        for chunk in chunks:
            final_chunks.append({"text": chunk, "metadata": {"source_document": filename, "heading": heading}})

print("\nRefined processing complete!")

# --- Verification Step ---
print(f"Total chunks created: {len(final_chunks)}")
print("\n--- Sample of CORRECT final structured data ---")
# Print a few representative chunks
print(json.dumps(final_chunks[10], indent=2))
print("-" * 20)
print(json.dumps(final_chunks[100], indent=2))
print("-" * 20)
print(json.dumps(final_chunks[200], indent=2))
print("-" * 20)

# --- Save the final output ---
output_filename = "processed_chunks.jsonl"
with open(output_filename, 'w', encoding='utf-8') as f:
    for chunk in final_chunks:
        f.write(json.dumps(chunk) + '\n')

print(f"\nSuccessfully saved all chunks to {output_filename}. Phase 1 is now truly complete!")

MuPDF error: format error: cmsOpenProfileFromMem failed

Starting final, refined processing...

Refined processing complete!
Total chunks created: 1376

--- Sample of CORRECT final structured data ---
{
  "text": "on August 21\u201323, 2001, and January 24\u2013 \n25, 2002, and provided \nrecommendations to the Department \nbased on these hearings. The NCVHS \nserves as the statutory advisory body to \nthe Secretary of HHS with respect to the \ndevelopment and implementation of the \nRules required by the Administrative \nSimplification provisions of HIPAA, \nincluding the privacy standards. \nThrough the hearings, the NCVHS \nspecifically solicited public input on \nissues related to certain key standards in \nthe Privacy Rule: consent, minimum \nnecessary, marketing, fundraising, and \nresearch. The resultant public testimony \nand subsequent recommendations \nsubmitted to the Department by the \nNCVHS also served to inform the \ndevelopment of these proposed \nmodifications. \nII. O