# Extract Tables

In [None]:
import pdfplumber

In [None]:
import camelot
# Path of the PDF that the cells below hand to Camelot and pdfplumber
syllabus = "files/SWE30003_Unit Outline_2024_S1.pdf"
# Second PDF path; not referenced by any of the code visible in this notebook
test = "files/foo.pdf"

In [None]:
import numpy as np

# Detect the tables on page 5 of the syllabus (Camelot page numbers are given as strings)
# NOTE(review): parallel=True presumably spreads page processing over several CPU cores - confirm against the Camelot docs
tables = camelot.read_pdf(syllabus, pages="5", parallel=True)

from matplotlib import pyplot as plt

# Draw the first detected table with Camelot's "contour" plot kind, then show the figure
camelot.plot(tables[0], kind="contour")
plt.show()

# Extract Table Headings

In [None]:
def extract_table_heading(table : camelot.core.Table, page : pdfplumber.page.Page, top_margin : int = 40):
    """
    Extract the text sitting directly above a Camelot table (assumed to be its heading).

    Args:
        table (Table): Camelot table whose heading is wanted.
        page (Page): pdfplumber page the table was detected on.
        top_margin (int): Height of the strip above the table to read text from.

    Returns:
        text: The text pdfplumber extracts from the strip above the table.
              An empty string if there is no room above the table (or top_margin is not positive).
    """

    # Bafflingly, the table's bounding box is a pseudo-private attribute
    x1, y1, x2, y2 = table._bbox

    # Translate Camelot bbox co-ordinates to pdfplumber
    # Camelot has bottom-left origin whilst pdfplumber has top-left
    table_top = page.height - y2

    # Get the area directly above the table.
    # Clamp to the top of the page: for a table closer than top_margin to the top edge
    # the strip would otherwise start at a negative co-ordinate, and pdfplumber
    # refuses (by default) a bbox that is not fully inside the page.
    strip_top = max(table_top - top_margin, 0)
    strip_bottom = table_top

    # Nothing above the table to read; pdfplumber also rejects zero-area boxes
    if strip_bottom <= strip_top:
        return ""

    # To capture first letter we must add left-side padding to bbox
    x1 = max(x1 - 5, 0)

    heading_bbox = (x1, strip_top, x2, strip_bottom)

    return page.within_bbox(heading_bbox).extract_text()

In [None]:
page_number = 5 # Counting from 1

tables = camelot.read_pdf(syllabus, pages=str(page_number), parallel=True)

# Open the PDF in a context manager so the file handle is released as soon as the heading is read.
# NOTE: `pdf` and `page` stay bound after the block, but the underlying file is closed by then.
with pdfplumber.open(syllabus) as pdf:
    page = pdf.pages[page_number - 1] # Annoying inconsistency: pdfplumber's page list is 0-indexed whilst Camelot is 1-indexed

    # Heading of the second table detected on the page
    heading = extract_table_heading(tables[1], page)

print(heading)

In [None]:
# Re-read the syllabus across every page; this rebinds `tables` to the document-wide result
tables = camelot.read_pdf(syllabus, pages="all", parallel=True)

In [None]:
def matching_columns(a : camelot.core.Table, b: camelot.core.Table):
    """Return True when tables `a` and `b` have the same number of columns (second entry of `shape`)."""
    columns_a = a.shape[1]
    columns_b = b.shape[1]
    return columns_a == columns_b

def group_contiguous_tables(tables : camelot.core.TableList, matching_function = matching_columns) -> list[list[camelot.core.Table]]:
    """
    Collect runs of neighbouring tables that look like one table split into several pieces
    (e.g. by page breaks).

    Each table is compared with the one immediately before it in `tables`. An unbroken chain of
    matches forms one group. Tables that match neither neighbour are left out of the result.
    With the default matching function, "matching" means having the same number of columns.

    Args:
        tables (TableList): List of Camelot Tables, in document order.
        matching_function (Callable): Called as matching_function(previous, current); a truthy
            result means the two tables belong to the same group.

    Returns:
        groups (list[list[Table]]): Groups of two or more tables to be combined.
    """

    groups = []
    run = []  # Tables of the group currently being built

    for position in range(1, len(tables)):
        previous, current = tables[position - 1], tables[position]

        if not matching_function(previous, current):
            # Chain broken: keep whatever was collected and start afresh
            if run:
                groups.append(run)
            run = []
            continue

        # The first match of a chain also contributes the earlier table of the pair
        if not run:
            run.append(previous)
        run.append(current)

    # A chain that reaches the final table has not been stored yet
    if run:
        groups.append(run)

    return groups

In [None]:
# Group the tables of the whole document; the notebook displays the resulting groups
group_contiguous_tables(tables)