# Extract Tables

In [None]:
import pdfplumber

In [None]:
import camelot
# Path of the PDF that the cells below read tables and page text from
syllabus = "files/SWE30003_Unit Outline_2024_S1.pdf"
# Alternate PDF path; not referenced by any of the visible cells
test = "files/foo.pdf"

In [None]:
import numpy as np
from matplotlib import pyplot as plt

# Detect the tables on page 5 of the syllabus
tables = camelot.read_pdf(syllabus, pages="5", parallel=True)

# Visual sanity check: draw the contour plot for the first detected table
camelot.plot(tables[0], kind="contour")
plt.show()

# Extract Table Headings

In [None]:
def extract_table_heading(table : camelot.core.Table, page : pdfplumber.page.Page, top_margin : int = 40):
    """
    Extract the text found in the strip directly above a table.

    The strip spans the table's width (plus a little left padding) and
    reaches `top_margin` units upwards from the table's top edge, clamped
    to the top of the page.

    Args:
        table (Table): Camelot table whose heading is wanted.
        page (Page): pdfplumber page on which the table sits.
        top_margin (int): Height of the strip above the table, in page co-ordinate units.

    Returns:
        text: Text extracted from the strip, or "" when the strip has no height
              (the table touches the top of the page, or `top_margin` is 0).
    """

    # Bafflingly, the table's bounding box is a pseudo-private attribute
    x1, y1, x2, y2 = table._bbox

    # Translate Camelot bbox co-ordinates to pdfplumber
    # Camelot has bottom-left origin whilst pdfplumber has top-left,
    # so the table's top edge (Camelot y2) becomes a distance from the page top
    table_top = page.height - y2

    # Get the area directly above the table.
    # Clamp to the page top: pdfplumber refuses a bbox that pokes outside the page
    strip_top = max(table_top - top_margin, 0)
    strip_bottom = table_top

    # To capture first letter we must add left-side padding to bbox
    x1 = max(x1 - 5, 0)

    # Nothing above the table to read (pdfplumber also refuses a zero-area bbox)
    if strip_bottom <= strip_top:
        return ""

    bbox = (x1, strip_top, x2, strip_bottom)

    return page.within_bbox(bbox).extract_text()

In [None]:
page_number = 5 # Counting from 1

tables = camelot.read_pdf(syllabus, pages=str(page_number), parallel=True)

# Open the PDF in a context manager so the file handle is released once the heading has been read
with pdfplumber.open(syllabus) as pdf:
    # Annoying inconsistency: Camelot takes 1-indexed page numbers whilst `pdf.pages` is a 0-indexed list
    page = pdf.pages[page_number - 1]

    # Heading of the second table detected on the page
    heading = extract_table_heading(tables[1], page)

print(heading)

In [None]:
# Re-read the document with pages="all" so `tables` covers every page rather than a single one
tables = camelot.read_pdf(syllabus, pages="all", parallel=True)

In [None]:
def group_contiguous_tables(tables : camelot.core.TableList) -> list[list[camelot.core.Table]]:
    """
    Collect runs of neighbouring tables that look like one table split by page breaks.

    Two neighbouring tables are treated as parts of the same table when they
    have the same number of columns. Tables that do not match either
    neighbour are left out of the result entirely.

    Args:
        tables (TableList): List of Camelot Tables, in document order.

    Returns:
        contiguous_tables (list[list[Table]]): Runs of two or more tables to be combined.
    """

    runs = []
    current_run = []
    previous_width = None

    for table in tables:
        width = table.shape[1]

        if current_run and width == previous_width:
            # Same column count as the table before it: extend the current run
            current_run.append(table)
        else:
            # Column count changed: keep the finished run only if it joined several tables
            if len(current_run) > 1:
                runs.append(current_run)
            current_run = [table]

        previous_width = width

    # Flush the run that was still open when the tables ran out
    if len(current_run) > 1:
        runs.append(current_run)

    return runs

In [None]:
# Show which of the extracted tables would be merged (the function is named group_contiguous_tables)
group_contiguous_tables(tables)