In [5]:
import fitz
import re
pdf_path = "../dataset/pdfs/toxicMeter.pdf" 
doc = fitz.open(pdf_path)

In [2]:
import fitz  # PyMuPDF
from collections import Counter

def extract_font_details(pdf_path):
    doc = fitz.open(pdf_path)  # Open the PDF
    font_data = []  # Store all (font_name, font_size) occurrences

    for page_num in range(1,len(doc)):
        page = doc[page_num]
        text_instances = page.get_text("dict")["blocks"]  # Extract text blocks

        for block in text_instances:
            if "lines" in block:  # Ensure block contains text
                for line in block["lines"]:
                    for span in line["spans"]:  # Spans contain font information
                        font_name = span["font"]
                        font_size = round(span["size"])  # Round font size for consistency
                        font_data.append((font_name, font_size))

    if not font_data:
        return {"error": "No font data found in the document."}

    # Count occurrences of font sizes
    font_size_counts = Counter(size for _, size in font_data)
    
    # Determine body font size (most frequent)
    most_common_font_size, _ = font_size_counts.most_common(1)[0]  

    # Determine heading font size
    sorted_font_sizes = sorted(font_size_counts.items(), key=lambda x: x[0], reverse=True)  # Sort by size desc

    heading_font_size = None
    for i, (size, count) in enumerate(sorted_font_sizes):
        if i == 0 and count < 5:  # Drop max size if occurrences < 5
            continue
        heading_font_size = size
        break

    if not heading_font_size:
        heading_font_size = most_common_font_size  # Fallback to body size if no valid heading font

    # Identify corresponding font names
    body_font = next((font for font, size in font_data if size == most_common_font_size), "Unknown")
    heading_font = next((font for font, size in font_data if size == heading_font_size), "Unknown")

    # Return extracted font details
    return {
        "body_font_type": body_font,
        "body_font_size": most_common_font_size,
        "heading_font_type": heading_font,
        "heading_font_size": heading_font_size
    }

# Example usage
 # Replace with the actual PDF path
font_details = extract_font_details(pdf_path)
print(font_details)


{'body_font_type': 'TimesNewRomanPSMT', 'body_font_size': 12, 'heading_font_type': 'TimesNewRomanPS-BoldMT', 'heading_font_size': 18}


In [2]:
import fitz  # PyMuPDF

def extract_toc_info(pdf_path):
    doc = fitz.open(pdf_path)
    text = " ".join([page.get_text("text") for page in doc])  # Extract all text
    
    # Check for Table of Contents presence
    toc_present = "Table of Contents" in text or "TABLE OF CONTENTS" in text

    # Extract fonts from the first few pages
    font_sizes = []
    for page in doc[:5]:  # Check first 5 pages for heading fonts
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line["spans"]:
                    font_sizes.append(round(span["size"]))

    # Determine heading and subheading font sizes
    if font_sizes:
        heading_font_size = max(font_sizes)  # Largest font used
        subheading_font_size = sorted(set(font_sizes), reverse=True)[1] if len(set(font_sizes)) > 1 else heading_font_size
    else:
        heading_font_size = subheading_font_size = None

    return {
        "table_of_contents": {
            "toc_present": toc_present,
            "heading_font_size": heading_font_size,
            "subheading_font_size": subheading_font_size
        }
    }

toc_info = extract_toc_info("toxicMeter.pdf")
print(toc_info)


{'table_of_contents': {'toc_present': True, 'heading_font_size': 16, 'subheading_font_size': 14}}


In [5]:
from collections import Counter
def extract_lof_info(pdf_path):
    doc = fitz.open(pdf_path)
    text = " ".join([page.get_text("text") for page in doc])
    
    # Check for List of Figures presence
    lof_present = "List of Figures" in text or "LIST OF FIGURES" in text

    # Extract font size of figure captions (Search for 'Figure' keyword)
    caption_sizes = []
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line["spans"]:
                    if "Figure" in span["text"]:  # Detect captions
                        caption_sizes.append(round(span["size"]))

    #Find the most frequent figure caption font size
    font_counts = Counter(caption_sizes)
    figure_caption_font_size = font_counts.most_common(1)[0][0] if font_counts else None

    return {
        "list_of_figures": {
            "lof_present": lof_present,
            "figure_caption_font_size": figure_caption_font_size
        }
    }

lof_info = extract_lof_info("toxicMeter.pdf")
print(lof_info)


{'list_of_figures': {'lof_present': True, 'figure_caption_font_size': 12}}


In [15]:
import fitz  # PyMuPDF
import re
from collections import Counter

def extract_abbreviations_info(pdf_path):
    doc = fitz.open(pdf_path)
    abbreviations_page = None
    heading_font_size = None
    abbreviations_dict = {}

    # Step 1: Identify general heading font size from the first few pages
    font_sizes = []
    for page in doc[:5]:  # Check first 5 pages to identify common heading font
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" not in block:
                continue  # Skip blocks without text lines
            for line in block["lines"]:
                for span in line["spans"]:
                    font_sizes.append(round(span["size"]))

    # Determine most common heading font size
    general_heading_font_size = max(font_sizes) if font_sizes else None

    if general_heading_font_size is None:
        return {
            "abbreviations_section": {
                "abbreviations_section_present": False,
                "abbreviations_sorted": "N/A",
                "abbreviations": {}
            }
        }

    # Step 2: Locate "List of Abbreviations" with the matched heading font size
    for page_num, page in enumerate(doc):
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if span["size"] == general_heading_font_size and (
                        "List of Abbreviations" in span["text"] or "ABBREVIATIONS" in span["text"]
                    ):
                        abbreviations_page = page_num
                        break
            if abbreviations_page is not None:
                break
        if abbreviations_page is not None:
            break

    if abbreviations_page is None:
        return {
            "abbreviations_section": {
                "abbreviations_section_present": False,
                "abbreviations_sorted": "N/A",
                "abbreviations": {}
            }
        }

    # Step 3: Extract abbreviations from the identified page
    text = doc[abbreviations_page].get_text("dict")
    capture_abbreviations = False  # Start capturing only after heading

    abbreviation_pattern = re.compile(r"^([A-Z0-9\-]+)[:\s]+(.+)")  # Match "ABC: Definition"

    for block in text["blocks"]:
        if "lines" not in block:
            continue  # Skip empty blocks
        for line in block["lines"]:
            for span in line["spans"]:
                font_size = span["size"]
                content = span["text"].strip()

                # Step 4: Detect start of abbreviations section
                if font_size == general_heading_font_size and ("List of Abbreviations" in content or "ABBREVIATIONS" in content):
                    capture_abbreviations = True
                    continue

                # Step 5: Capture abbreviations only after the heading
                if capture_abbreviations:
                    match = abbreviation_pattern.match(content)
                    if match:
                        abbrev, definition = match.groups()
                        abbreviations_dict[abbrev] = definition

    # Step 6: Check if abbreviations are sorted
    sorted_status = "asc" if list(abbreviations_dict.keys()) == sorted(abbreviations_dict.keys()) else "unsorted"

    return {
        "abbreviations_section": {
            "abbreviations_section_present": True,
            "abbreviations_sorted": sorted_status,
            "abbreviations": abbreviations_dict
        }
    }

# Run the function
abbr_info = extract_abbreviations_info("toxicMeter.pdf")
print(abbr_info)


{'abbreviations_section': {'abbreviations_section_present': False, 'abbreviations_sorted': 'N/A', 'abbreviations': {}}}


# NEW SECTION

In [1]:
import fitz
import re
pdf_path = "../dataset/pdfs/toxicMeter.pdf" 
doc = fitz.open(pdf_path)

Table of Content Extractor

In [38]:
import fitz  # PyMuPDF
import re
import json

def table_of_content_extractor(doc):
    toc_text = ""
    toc_found = False  # Flag to track if ToC has started
    potential_toc = []  # Store potential ToC lines
    toc_fonts = []  # Store font size information
    toc_heading_size = None  # Track ToC heading font size (only once)
    subheading_sizes = set()  # Track unique subheading font sizes

    # Iterate through first few pages (ToC is usually at the beginning)
    for page_num in range(min(10, len(doc))):  # Scan first 10 pages
        text_blocks = doc[page_num].get_text("dict")["blocks"]

        for block in text_blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_size = round(span["size"])  # Extract font size
                        line_text = span["text"].strip()

                        # Detect ToC heading (ensure we only capture the first occurrence)
                        if re.search(r"\b(Table\s*of\s*Contents|Contents|Index)\b", line_text, re.IGNORECASE):
                            if not toc_found:  # Only set once
                                toc_found = True  
                                toc_heading_size = font_size  # Store only the first ToC heading font size
                                toc_fonts.append(("Heading", line_text, font_size))
                            toc_text += line_text + "\n"
                            continue

                        # If ToC has started, keep extracting until a matching font size is detected
                        if toc_found:
                            # Stop when encountering a section heading of the same size as the ToC heading
                            if font_size == toc_heading_size:
                                return {
                                    "table_of_contents": {
                                        "toc_present": True,
                                        "heading_font_size": toc_heading_size,
                                        "subheading_font_size": max(subheading_sizes) if subheading_sizes else None
                                    }
                                }

                            # Identify ToC subheadings based on detected text patterns
                            if re.match(r"^\d+(\.\d+)*\s+[A-Za-z\s]+", line_text):  
                                subheading_sizes.add(font_size)  # Store unique subheading font sizes
                                toc_fonts.append(("Subheading", line_text, font_size))
                            else:
                                toc_fonts.append(("Regular", line_text, font_size))

                            potential_toc.append(line_text)
                            toc_text += line_text + "\n"

    # If no ToC found, return False
    return {
        "table_of_contents": {
            "toc_present": False,
            "heading_font_size": None,
            "subheading_font_size": None
        }
    }

# Example usage
toc_data = table_of_content_extractor(doc)

# Print the JSON output
print(json.dumps(toc_data, indent=2))


{
  "table_of_contents": {
    "toc_present": true,
    "heading_font_size": 16,
    "subheading_font_size": 12
  }
}


List of Figure Extractor

In [41]:
import fitz  # PyMuPDF
import re
import json

def list_of_figures_extractor(pdf_path):
    doc = fitz.open(pdf_path)
    lof_text = ""
    lof_found = False  # Flag to track if LoF has started
    figure_caption_sizes = set()  # Store unique font sizes of figure captions
    lof_heading_size = None  # Track LoF heading font size

    # Iterate through first few pages (LoF is usually at the beginning)
    for page_num in range(min(10, len(doc))):  # Scan first 10 pages
        text_blocks = doc[page_num].get_text("dict")["blocks"]

        for block in text_blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_size = round(span["size"])  # Extract font size
                        line_text = span["text"].strip()

                        # Detect LoF heading (ensuring we only capture the first occurrence)
                        if re.search(r"\b(List\s*of\s*Figures|Figures|Figure Index)\b", line_text, re.IGNORECASE):
                            if not lof_found:  # Only set once
                                lof_found = True  
                                lof_heading_size = font_size  # Store LoF heading font size
                            lof_text += line_text + "\n"
                            continue

                        # If LoF has started, keep extracting until a matching font size is detected
                        if lof_found:
                            # Identify figure captions (likely smaller font size than heading)
                            if re.match(r"^(Figure|Fig\.|Table)\s+\d+[:.\s]", line_text):  
                                figure_caption_sizes.add(font_size)  # Store caption font size

                            # Stop when encountering a section heading of the same size as the LoF heading,
                            # BUT only if we have already captured some figure captions.
                            if font_size == lof_heading_size and figure_caption_sizes:
                                return {
                                    "list_of_figures": {
                                        "lof_present": True,
                                        "figure_caption_font_size": max(figure_caption_sizes)  # Return largest detected caption font
                                    }
                                }

                            lof_text += line_text + "\n"

    # If no LoF found, return False
    return {
        "list_of_figures": {
            "lof_present": False,
            "figure_caption_font_size": None
        }
    }

# Example usage
lof_data = list_of_figures_extractor(pdf_path)

# Print the JSON output
print(json.dumps(lof_data, indent=2))


{
  "list_of_figures": {
    "lof_present": true,
    "figure_caption_font_size": 12
  }
}


Abbreviations Extractor

In [46]:
import fitz  # PyMuPDF
import re
import json

def extract_abbreviations_section(pdf_path):
    doc = fitz.open(pdf_path)
    abbreviations_text = ""
    abbreviations_found = False  # Flag to track if abbreviations section has started
    abbreviations = []  # Store extracted abbreviations
    abbreviation_heading_size = None  # Store heading font size

    # Iterate through first few pages (Abbreviations is usually at the beginning or middle)
    for page_num in range(min(15, len(doc))):  # Scan first 15 pages
        text_blocks = doc[page_num].get_text("dict")["blocks"]

        for block in text_blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_size = round(span["size"])  # Extract font size
                        line_text = span["text"].strip()

                        # Detect Abbreviations heading
                        if re.search(r"\b(Abbreviations|List of Abbreviations|Acronyms)\b", line_text, re.IGNORECASE):
                            if not abbreviations_found:  # Capture only the first heading
                                abbreviations_found = True  
                                abbreviation_heading_size = font_size  # Store the font size of this section
                            abbreviations_text += line_text + "\n"
                            continue

                        # If Abbreviations section has started, keep extracting until a new section heading is found
                        if abbreviations_found:
                            # Stop when encountering a section heading of the same size as the Abbreviations heading
                            if font_size == abbreviation_heading_size:
                                return {
                                    "abbreviations_section": {
                                        "abbreviations_section_present": True,
                                        "abbreviations_sorted": check_sorting_order(abbreviations)  # Determine sorting order
                                    }
                                }

                            # Alternative method to extract abbreviations
                            extracted_abbreviation = extract_abbreviation_from_line(line_text)
                            print(extracted_abbreviation)
                            if extracted_abbreviation:
                                abbreviations.append(extracted_abbreviation)

                            abbreviations_text += line_text + "\n"

    # If no abbreviations section found, return False
    return {
        "abbreviations_section": {
            "abbreviations_section_present": False,
            "abbreviations_sorted": None
        }
    }

def extract_abbreviation_from_line(line_text):
    """
    Extract abbreviations using different patterns:
    1. AI (Artificial Intelligence)
    2. CNN: Convolutional Neural Network
    3. NLP – Natural Language Processing
    """
    print(line_text)
    # Look for patterns like "AI (Artificial Intelligence)"
    match_parentheses = re.match(r"^([A-Z]{2,})\s*\((.+?)\)$", line_text)
    if match_parentheses:
        return match_parentheses.group(1)  # Extract "AI"

    # Look for patterns like "CNN: Convolutional Neural Network"
    match_colon = re.match(r"^([A-Z]{2,})\s*:\s*(.+)$", line_text)
    if match_colon:
        return match_colon.group(1)  # Extract "CNN"

    # Look for patterns like "NLP – Natural Language Processing"
    match_dash = re.match(r"^([A-Z]{2,})\s*[-–—]\s*(.+)$", line_text)
    if match_dash:
        return match_dash.group(1)  # Extract "NLP"

    return None  # No match found

def check_sorting_order(abbreviations):
    """Determine if abbreviations are sorted in ascending, descending, or no order."""
    if abbreviations == sorted(abbreviations):
        return "asc"
    elif abbreviations == sorted(abbreviations, reverse=True):
        return "desc"
    else:
        return "none"


abbreviations_data = extract_abbreviations_section(pdf_path)

# Print the JSON output
print(json.dumps(abbreviations_data, indent=2))


{
  "abbreviations_section": {
    "abbreviations_section_present": true,
    "abbreviations_sorted": "asc"
  }
}


In [47]:
import fitz  # PyMuPDF
import re
import json

def extract_abbreviations_section(pdf_path):
    doc = fitz.open(pdf_path)
    abbreviations_text = ""
    abbreviations_found = False  # Flag to track if abbreviations section has started
    abbreviations = []  # Store extracted abbreviations

    # Iterate through the first few pages (Abbreviations section is usually in the early pages)
    for page_num in range(min(15, len(doc))):  # Scan first 15 pages
        text_blocks = doc[page_num].get_text("dict")["blocks"]

        for block in text_blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_size = round(span["size"])  # Extract font size
                        line_text = span["text"].strip()

                        # Detect Abbreviations heading
                        if re.search(r"\b(Abbreviations|List of Abbreviations|Acronyms)\b", line_text, re.IGNORECASE):
                            abbreviations_found = True  # Start collecting abbreviations
                            abbreviations_text += line_text + "\n"
                            continue

                        # If Abbreviations section has started, extract abbreviations
                        if abbreviations_found:
                            extracted_abbreviation = extract_abbreviation_from_line(line_text)
                            if extracted_abbreviation:
                                abbreviations.append(extracted_abbreviation)

                            abbreviations_text += line_text + "\n"

                            # Detect potential new section heading (Stop extraction here)
                            if is_new_section_heading(line_text):
                                return {
                                    "abbreviations_section": {
                                        "abbreviations_section_present": True,
                                        "abbreviations_sorted": check_sorting_order(abbreviations)  # Determine sorting order
                                    }
                                }

    # If no abbreviations section found, return False
    return {
        "abbreviations_section": {
            "abbreviations_section_present": False,
            "abbreviations_sorted": None
        }
    }

def extract_abbreviation_from_line(line_text):
    """
    Extract abbreviations from a line by detecting:
    1. AI - Artificial Intelligence
    2. CNN : Convolutional Neural Network
    3. NLP – Natural Language Processing
    """
    match = re.match(r"^([A-Z0-9]{2,})\s*[-–—:]\s*(.+)$", line_text)
    if match:
        return match.group(1)  # Extract only the short form (e.g., AI, CNN, NLP)

    return None  # No abbreviation found

def is_new_section_heading(line_text):
    """
    Detect if a line is likely a new section heading:
    - Fully capitalized words (e.g., "INTRODUCTION", "METHODS")
    - Very short headings (1-3 words)
    """
    if line_text.isupper() and len(line_text.split()) <= 5:
        return True  # Likely a new section heading

    return False  # Continue extracting abbreviations

def check_sorting_order(abbreviations):
    """Determine if abbreviations are sorted in ascending, descending, or no order."""
    if abbreviations == sorted(abbreviations):
        return "asc"
    elif abbreviations == sorted(abbreviations, reverse=True):
        return "desc"
    else:
        return "none"

# Example usage
abbreviations_data = extract_abbreviations_section(pdf_path)

# Print the JSON output
print(json.dumps(abbreviations_data, indent=2))


{
  "abbreviations_section": {
    "abbreviations_section_present": false,
    "abbreviations_sorted": null
  }
}


FIGURE PLACEMENT EXTRACTION

In [49]:
import fitz  # PyMuPDF
import json
from collections import Counter

def figure_data_extractor(pdf_path):
    doc = fitz.open(pdf_path)
    page_width = doc[0].rect.width  # Get the width of the first page for alignment checks

    figure_placements = []  # Store placement data
    caption_positions = []  # Store caption positions
    caption_font_sizes = []  # Store caption font sizes

    for page_num in range(len(doc)):  # Loop through all pages
        page = doc[page_num]
        images = page.get_images(full=True)  # Extract all images (figures)

        for img_index, img in enumerate(images):
            xref = img[0]  # Get image reference
            bbox = page.get_image_rects(xref)[0]  # Get bounding box of image (figure)

            # Determine figure alignment
            fig_x0, fig_y0, fig_x1, fig_y1 = bbox
            fig_width = fig_x1 - fig_x0
            page_center_x = page_width / 2

            if abs((fig_x0 + fig_x1) / 2 - page_center_x) < fig_width * 0.1:
                placement = "center"
            elif fig_x0 < page_width * 0.3:
                placement = "left"
            else:
                placement = "right"

            # Find figure caption (text near the figure)
            figure_caption, caption_font_size, caption_position = find_figure_caption(page, bbox)

            if caption_font_size:
                caption_font_sizes.append(caption_font_size)

            if placement:
                figure_placements.append(placement)

            if caption_position:
                caption_positions.append(caption_position)

    # Determine most frequent values
    most_common_placement = most_frequent(figure_placements)
    most_common_caption_position = most_frequent(caption_positions)
    most_common_caption_font_size = most_frequent(caption_font_sizes)

    return {
        "figure_placement": {
            "figure_caption_font_size": most_common_caption_font_size,
            "figure_placement": most_common_placement,
            "figure_caption_position": most_common_caption_position
        }
    }

def find_figure_caption(page, bbox):
    """
    Find the caption text near the figure.
    - If text appears *below* the figure, return "below"
    - If text appears *above* the figure, return "above"
    """
    fig_x0, fig_y0, fig_x1, fig_y1 = bbox
    caption_text = None
    caption_font_size = None
    caption_position = None

    for block in page.get_text("dict")["blocks"]:
        if "lines" in block:
            for line in block["lines"]:
                for span in line["spans"]:
                    text_y0 = line["bbox"][1]  # Get Y-position of text
                    font_size = round(span["size"])  # Extract font size
                    line_text = span["text"].strip()

                    # Check if text is directly below the figure
                    if fig_y1 < text_y0 < fig_y1 + font_size * 3:
                        caption_text = line_text
                        caption_font_size = font_size
                        caption_position = "below"
                        return caption_text, caption_font_size, caption_position

                    # Check if text is directly above the figure
                    if fig_y0 - font_size * 3 < text_y0 < fig_y0:
                        caption_text = line_text
                        caption_font_size = font_size
                        caption_position = "above"
                        return caption_text, caption_font_size, caption_position

    return caption_text, caption_font_size, caption_position

def most_frequent(lst):
    """Find the most common element in a list"""
    if not lst:
        return None
    return Counter(lst).most_common(1)[0][0]

# Example usage
figure_placement_data = figure_data_extractor(pdf_path)

# Print the JSON output
print(json.dumps(figure_placement_data, indent=2))


{
  "figure_placement": {
    "figure_caption_font_size": 12,
    "figure_placement": "center",
    "figure_caption_position": "above"
  }
}


TABLE PLACEMENT EXTRACTION

In [50]:
import fitz  # PyMuPDF
import json
from collections import Counter

def extract_table_placement(pdf_path):
    doc = fitz.open(pdf_path)
    page_width = doc[0].rect.width  # Get the width of the first page for alignment checks

    table_placements = []  # Store table alignment
    caption_positions = []  # Store caption positions
    caption_font_sizes = []  # Store caption font sizes

    for page_num in range(len(doc)):  # Loop through all pages
        page = doc[page_num]
        tables = find_tables(page)  # Find potential table bounding boxes

        for table_bbox in tables:
            # Determine table alignment
            table_x0, table_y0, table_x1, table_y1 = table_bbox
            table_width = table_x1 - table_x0
            page_center_x = page_width / 2

            if abs((table_x0 + table_x1) / 2 - page_center_x) < table_width * 0.1:
                placement = "center"
            elif table_x0 < page_width * 0.3:
                placement = "left"
            else:
                placement = "right"

            # Find table caption (text near the table)
            table_caption, caption_font_size, caption_position = find_table_caption(page, table_bbox)

            if caption_font_size:
                caption_font_sizes.append(caption_font_size)

            if placement:
                table_placements.append(placement)

            if caption_position:
                caption_positions.append(caption_position)

    # Determine most frequent values
    most_common_placement = most_frequent(table_placements)
    most_common_caption_position = most_frequent(caption_positions)
    most_common_caption_font_size = most_frequent(caption_font_sizes)

    return {
        "table_placement": {
            "table_caption_font_size": most_common_caption_font_size,
            "table_placement": most_common_placement,
            "table_caption_position": most_common_caption_position
        }
    }

def find_tables(page):
    """
    Identify potential tables by detecting large structured text blocks.
    - Looks for multiple consecutive text lines forming a structured shape.
    """
    table_bboxes = []

    for block in page.get_text("dict")["blocks"]:
        if "lines" in block and len(block["lines"]) > 2:  # More than 2 rows indicates a possible table
            bbox = block["bbox"]
            table_bboxes.append(bbox)

    return table_bboxes

def find_table_caption(page, bbox):
    """
    Find the caption text near the table.
    - If text appears *below* the table, return "below"
    - If text appears *above* the table, return "above"
    """
    table_x0, table_y0, table_x1, table_y1 = bbox
    caption_text = None
    caption_font_size = None
    caption_position = None

    for block in page.get_text("dict")["blocks"]:
        if "lines" in block:
            for line in block["lines"]:
                for span in line["spans"]:
                    text_y0 = line["bbox"][1]  # Get Y-position of text
                    font_size = round(span["size"])  # Extract font size
                    line_text = span["text"].strip()

                    # Check if text is directly below the table
                    if table_y1 < text_y0 < table_y1 + font_size * 3:
                        caption_text = line_text
                        caption_font_size = font_size
                        caption_position = "below"
                        return caption_text, caption_font_size, caption_position

                    # Check if text is directly above the table
                    if table_y0 - font_size * 3 < text_y0 < table_y0:
                        caption_text = line_text
                        caption_font_size = font_size
                        caption_position = "above"
                        return caption_text, caption_font_size, caption_position

    return caption_text, caption_font_size, caption_position

def most_frequent(lst):
    """Find the most common element in a list"""
    if not lst:
        return None
    return Counter(lst).most_common(1)[0][0]

# Example usage
table_placement_data = extract_table_placement(pdf_path)

# Print the JSON output
print(json.dumps(table_placement_data, indent=2))


{
  "table_placement": {
    "table_caption_font_size": 12,
    "table_placement": "right",
    "table_caption_position": "above"
  }
}


TEXT ALIGNMENT

In [56]:
import fitz  # PyMuPDF
import json
from collections import Counter

def extract_text_alignment(pdf_path):
    doc = fitz.open(pdf_path)

    alignment_counts = []  # Store detected alignments

    for page_num in range(len(doc)):  # Loop through all pages
        page = doc[page_num]
        text_blocks = page.get_text("dict")["blocks"]

        for block in text_blocks:
            if "lines" in block:
                left_margins = []
                right_margins = []

                for line in block["lines"]:
                    x0, _, x1, _ = line["bbox"]  # Get left & right positions of the line
                    left_margins.append(x0)
                    right_margins.append(x1)

                # Compute alignment by analyzing variation in margins
                left_variation = max(left_margins) - min(left_margins) if left_margins else 0
                right_variation = max(right_margins) - min(right_margins) if right_margins else 0

                if left_variation < 5 and right_variation < 5:
                    alignment_counts.append("Justified")
                elif left_variation < 5:
                    alignment_counts.append("Left")
                elif right_variation < 5:
                    alignment_counts.append("Right")
                else:
                    alignment_counts.append("Mixed")

    # Determine most frequent text alignment
    most_common_alignment = most_frequent(alignment_counts)

    return {
        "text_alignment": {
            "text_alignment": most_common_alignment
        }
    }

def most_frequent(lst):
    """Find the most common element in a list"""
    if not lst:
        return None
    return Counter(lst).most_common(1)[0][0]

# Example usage
text_alignment_data = extract_text_alignment(pdf_path)

# Print the JSON output
print(json.dumps(text_alignment_data, indent=2))


{
  "text_alignment": {
    "text_alignment": "Justified"
  }
}


MARGIN EXTRACTOR

In [58]:
import fitz  # PyMuPDF
import json

def extract_margins(pdf_path):
    doc = fitz.open(pdf_path)
    margin_values = {"left": [], "right": [], "top": [], "bottom": []}

    for page in doc:
        page_width, page_height = page.rect.width, page.rect.height  # Page size in points
        text_blocks = page.get_text("dict")["blocks"]

        # Initialize extreme values for text placement
        leftmost = page_width
        rightmost = 0
        topmost = page_height
        bottommost = 0

        for block in text_blocks:
            if "lines" in block:
                for line in block["lines"]:
                    x0, y0, x1, y1 = line["bbox"]  # Bounding box (left, top, right, bottom)
                    leftmost = min(leftmost, x0)
                    rightmost = max(rightmost, x1)
                    topmost = min(topmost, y0)
                    bottommost = max(bottommost, y1)

        # Calculate margins in inches (1 inch = 72 points)
        left_margin = round(leftmost / 72, 2)
        right_margin = round((page_width - rightmost) / 72, 2)
        top_margin = round(topmost / 72, 2)
        bottom_margin = round((page_height - bottommost) / 72, 2)

        margin_values["left"].append(left_margin)
        margin_values["right"].append(right_margin)
        margin_values["top"].append(top_margin)
        margin_values["bottom"].append(bottom_margin)

    # Compute average margins across pages
    avg_left_margin = round(sum(margin_values["left"]) / len(margin_values["left"]), 2)
    avg_right_margin = round(sum(margin_values["right"]) / len(margin_values["right"]), 2)
    avg_top_margin = round(sum(margin_values["top"]) / len(margin_values["top"]), 2)
    avg_bottom_margin = round(sum(margin_values["bottom"]) / len(margin_values["bottom"]), 2)

    return {
        "margins": {
            "left_margin_inch": round(avg_left_margin),
            "right_margin_inch": round(avg_right_margin),
            "top_margin_inch": round(avg_top_margin),
            "bottom_margin_inch": round(avg_bottom_margin)
        }
    }

# Example usage
margins_data = extract_margins(pdf_path)

# Print the JSON output
print(json.dumps(margins_data, indent=2))


{
  "margins": {
    "left_margin_inch": 1,
    "right_margin_inch": 1,
    "top_margin_inch": 2,
    "bottom_margin_inch": 1
  }
}


FONT EXTRACTOR

In [60]:
import fitz  # PyMuPDF
import json
from collections import Counter

def extract_font_data(pdf_path):
    doc = fitz.open(pdf_path)
    # skip first page from doc
    doc = doc[1:]
    font_sizes = []
    font_types = []
    heading_fonts = []
    
    for page in doc:
        text_blocks = page.get_text("dict")["blocks"]
        
        for block in text_blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_size = round(span["size"])  # Extract font size
                        font_type = span["font"]  # Extract font type
                        font_sizes.append(font_size)
                        font_types.append(font_type)

    # Identify the most common font type & size for body text
    most_common_body_font = most_frequent(font_types)
    most_common_body_size = most_frequent(font_sizes)

    # Identify the most common font & size for headings (largest text)
    max_font_size = max(font_sizes) if font_sizes else None
    heading_fonts = [font for font, size in zip(font_types, font_sizes) if size == max_font_size]
    most_common_heading_font = most_frequent(heading_fonts)

    return {
        "font_type_size": {
            "body_font_type": most_common_body_font,
            "body_font_size": most_common_body_size,
            "heading_font_type": most_common_heading_font,
            "heading_font_size": max_font_size
        }
    }

def most_frequent(lst):
    """Find the most common element in a list"""
    if not lst:
        return None
    return Counter(lst).most_common(1)[0][0]

# Example usage
font_data = extract_font_data(pdf_path)

# Print the JSON output
print(json.dumps(font_data, indent=2))


{
  "font_type_size": {
    "body_font_type": "TimesNewRomanPSMT",
    "body_font_size": 12,
    "heading_font_type": "TimesNewRomanPS-BoldMT",
    "heading_font_size": 18
  }
}


REFERENCE FORMATTINGS

In [61]:
import fitz  # PyMuPDF
import re
import json
from collections import Counter

def extract_references_formatting(pdf_path):
    doc = fitz.open(pdf_path)
    references_text = ""
    references_found = False  # Track if References section is found
    references_list = []  # Store extracted references
    reference_format = None

    for page_num in range(len(doc)):  # Loop through pages
        text_blocks = doc[page_num].get_text("text").split("\n")  # Extract text line by line

        for line in text_blocks:
            line = line.strip()

            # Detect References section heading
            if re.search(r"\b(References|Bibliography|Works Cited)\b", line, re.IGNORECASE):
                references_found = True
                continue  # Move to the next line

            # If References section is found, collect reference entries
            if references_found:
                if re.match(r"^\[?\d+\]?", line) or re.search(r"\(\d{4}\)", line) or re.match(r"^\w+\.", line):
                    references_list.append(line)

            # Stop extraction when encountering a new section (empty line or unrelated text)
            if references_found and line == "":
                break

    # Determine reference format
    reference_format = detect_reference_format(references_list)

    # Check citation consistency
    consistent_format = len(set(reference_format)) == 1 if reference_format else False

    return {
        "references_formatting": {
            "references_format": reference_format[0] if reference_format else None,
            "citations_consistent": consistent_format
        }
    }

def detect_reference_format(references):
    """
    Detect the reference format: IEEE, APA, or MLA.
    """
    formats_detected = []

    for ref in references:
        if re.match(r"^\[\d+\]", ref):  # IEEE format (numbered)
            formats_detected.append("IEEE")
        elif re.search(r"\(\d{4}\)", ref):  # APA format (year in parentheses)
            formats_detected.append("APA")
        elif re.match(r"^\w+\.", ref) and not re.search(r"\(\d{4}\)", ref):  # MLA format (author + title)
            formats_detected.append("MLA")

    return list(set(formats_detected))  # Return unique formats detected

# Example usage
references_data = extract_references_formatting(pdf_path)

# Print the JSON output
print(json.dumps(references_data, indent=2))


{
  "references_formatting": {
    "references_format": null,
    "citations_consistent": false
  }
}


In [62]:
import fitz  # PyMuPDF
import re
import json
from collections import Counter

def extract_references_format(pdf_path):
    doc = fitz.open(pdf_path)
    references_found = False
    references_text = []
    
    # Scan pages to find the References section
    for page in doc:
        text_blocks = page.get_text("dict")["blocks"]

        for block in text_blocks:
            if "lines" in block:
                for line in block["lines"]:
                    line_text = line["spans"][0]["text"].strip()

                    # Detect References section
                    if re.search(r"\b(References|Bibliography|Citations)\b", line_text, re.IGNORECASE):
                        references_found = True
                        continue  # Move to extracting references

                    # Extract reference lines until a new section starts
                    if references_found:
                        if is_new_section_heading(line_text):
                            break
                        references_text.append(line_text)

    # Classify Reference Format
    reference_format = classify_reference_format(references_text)
    consistency = check_reference_consistency(references_text, reference_format)

    return {
        "references_formatting": {
            "references_format": reference_format,
            "citations_consistent": consistency
        }
    }

def classify_reference_format(references):
    """
    Classifies the reference format based on common citation styles:
    - IEEE: [1] Author, "Title," etc.
    - APA: (Author, Year)
    - MLA: Author. "Title." etc.
    """
    ieee_pattern = re.compile(r"^\[\d+\]\s+.+")
    apa_pattern = re.compile(r".+\(\d{4}\)\..+")
    mla_pattern = re.compile(r"^[A-Z][a-z]+,\s[A-Z][a-z]+\..+")

    ieee_count = sum(1 for ref in references if ieee_pattern.match(ref))
    apa_count = sum(1 for ref in references if apa_pattern.match(ref))
    mla_count = sum(1 for ref in references if mla_pattern.match(ref))

    # Determine dominant format
    if ieee_count > apa_count and ieee_count > mla_count:
        return "IEEE"
    elif apa_count > ieee_count and apa_count > mla_count:
        return "APA"
    elif mla_count > ieee_count and mla_count > apa_count:
        return "MLA"
    else:
        return "Unknown"

def check_reference_consistency(references, detected_format):
    """Checks if most references follow the detected citation format."""
    if detected_format == "Unknown":
        return False
    format_counts = Counter([classify_reference_format([ref]) for ref in references])
    return format_counts[detected_format] / max(1, len(references)) > 0.8  # At least 80% must match

def is_new_section_heading(line_text):
    """
    Detects if a line is likely a new section heading:
    - Fully capitalized words (e.g., "INTRODUCTION", "CONCLUSION")
    - Short phrases (1-5 words)
    """
    return line_text.isupper() and len(line_text.split()) <= 5


references_data = extract_references_format(pdf_path)

# Print the JSON output
print(json.dumps(references_data, indent=2))


{
  "references_formatting": {
    "references_format": "Unknown",
    "citations_consistent": false
  }
}


In [63]:
import fitz  # PyMuPDF
import re
import json
from collections import Counter

def extract_references_formatting(pdf_path):
    doc = fitz.open(pdf_path)
    references_text = ""
    references_found = False  # Track if References section is found
    references_list = []  # Store extracted references

    for page_num in range(len(doc)):  # Loop through pages
        text_blocks = doc[page_num].get_text("text").split("\n")  # Extract text line by line

        for line in text_blocks:
            line = line.strip()

            # Detect References section heading
            if re.search(r"\b(References|Bibliography|Works Cited)\b", line, re.IGNORECASE):
                references_found = True
                continue  # Move to the next line

            # If References section is found, collect reference entries
            if references_found:
                if re.match(r"^\[\d+\]", line) or re.search(r"\(\d{4}\)", line) or re.match(r"^\w+\.", line):
                    references_list.append(line)

            # Stop extraction when encountering a new section heading (fully uppercase or unrelated keywords)
            if references_found and re.match(r"^[A-Z\s]+$", line) and len(line.split()) <= 5:
                break

    # Determine reference format
    detected_formats = detect_reference_format(references_list)

    # Check citation consistency
    consistent_format = len(set(detected_formats)) == 1 if detected_formats else False

    return {
        "references_formatting": {
            "references_format": detected_formats[0] if detected_formats else None,
            "citations_consistent": consistent_format
        }
    }

def detect_reference_format(references):
    """
    Detect the reference format: IEEE, APA, or MLA.
    """
    formats_detected = []

    for ref in references:
        if re.match(r"^\[\d+\]", ref):  # IEEE format (numbered)
            formats_detected.append("IEEE")
        elif re.search(r"\(\d{4}\)", ref):  # APA format (year in parentheses)
            formats_detected.append("APA")
        elif re.match(r"^\w+\.", ref) and not re.search(r"\(\d{4}\)", ref):  # MLA format (author + title)
            formats_detected.append("MLA")

    return list(set(formats_detected))  # Return unique formats detected

# Example usage
references_data = extract_references_formatting(pdf_path)

# Print the JSON output
print(json.dumps(references_data, indent=2))


{
  "references_formatting": {
    "references_format": "MLA",
    "citations_consistent": false
  }
}


ABBREVIATIONS

In [8]:
import fitz  # PyMuPDF
import re
import json

def abbreviations_extractor(pdf_path):
    doc = fitz.open(pdf_path)
    abbreviations = []  # Store extracted abbreviations
    abbreviations_found = False  # Track if we are in the abbreviations list

    for page_num in range(min(15, len(doc))):  # Scan first 15 pages
        text_blocks = doc[page_num].get_text("text").split("\n")  # Extract text line by line
        lines = [line.strip() for line in text_blocks if line.strip()]  # Remove empty lines

        for line in lines:
            # Detect Abbreviations heading
            if re.search(r"\b(Abbreviations|List of Abbreviations|Acronyms)\b", line, re.IGNORECASE):
                abbreviations_found = True  # Start collecting abbreviations
                continue  # Move to the next line

            # If in abbreviations section, extract valid abbreviation-long form pairs
            if abbreviations_found:
                match = re.match(r"^([A-Z0-9]{2,})\s*[-–—:]\s*(.+)$", line)  # Abbreviation - Long form
                if match:
                    abbreviations.append(match.group(1))  # Store the abbreviation
                else:
                    # If a capitalized word appears without a valid format, stop extraction (end of section)
                    if re.match(r"^[A-Z\s]+$", line) and len(line.split()) <= 5:
                        break

    # Determine sorting order
    abbreviations_sorted = check_sorting_order(abbreviations)
    print(abbreviations)
    return {
        "abbreviations_section": {
            "abbreviations_section_present": True if abbreviations else False,
            "abbreviations_sorted": abbreviations_sorted
        }
    }

def check_sorting_order(abbreviations):
    """Determine if abbreviations are sorted in ascending, descending, or no order."""
    if abbreviations == sorted(abbreviations):
        return "asc"
    elif abbreviations == sorted(abbreviations, reverse=True):
        return "desc"
    else:
        return "none"

# Example usage
abbreviations_data = abbreviations_extractor(pdf_path)

# Print the JSON output
print(json.dumps(abbreviations_data, indent=2))


['AI', 'API', 'BERT', 'CNN', 'CSV', 'GPU', 'HTTP', 'JSON', 'LR', 'ML', 'MNB', 'NLP', 'NLTK', 'ORM', 'REST', 'ROC', 'TF']
{
  "abbreviations_section": {
    "abbreviations_section_present": true,
    "abbreviations_sorted": "asc"
  }
}
