# Extract Tables

In [None]:
import pdfplumber

In [None]:
import camelot
# Path to the unit-outline PDF which the cells below read tables and text from.
syllabus = "files/SWE30003_Unit Outline_2024_S1.pdf"
# Path to a second PDF; no other visible cell references it.
test = "files/foo.pdf"

In [None]:
import numpy as np

# Detect the tables on page 5 of the syllabus PDF with Camelot.
tables = camelot.read_pdf(syllabus, pages="5", parallel=True)

from matplotlib import pyplot as plt

# Draw Camelot's "contour" plot for the first detected table, then show the figure.
camelot.plot(tables[0], kind="contour")
plt.show()

# Extract Table Headings

In [None]:
def extract_table_heading(table : camelot.core.Table, page : pdfplumber.page.Page, top_margin : int = 40):
    """
    Extract the text found in the strip directly above a Camelot table (e.g. its heading).

    Args:
        table (camelot.core.Table): Table whose heading should be extracted.
        page (pdfplumber.page.Page): The pdfplumber page on which the table sits.
        top_margin (int): Height of the strip above the table which is searched for text.

    Returns:
        text: Whatever pdfplumber's `extract_text` returns for the strip, or an empty string
            when there is no room above the table (the strip would have no height).
    """

    # Bafflingly, the table's bounding box is a pseudo-private attribute
    x1, y1, x2, y2 = table._bbox

    # Translate Camelot bbox co-ordinates to pdfplumber
    # Camelot has bottom-left origin whilst pdfplumber has top-left
    table_top = page.height - y2

    # Get the area directly above the table, clamped so it never extends past the top of the page
    # (a table closer than `top_margin` to the top would otherwise produce a negative y value)
    strip_top = max(table_top - top_margin, 0)
    strip_bottom = table_top

    # To capture first letter we must add left-side padding to bbox
    x1 = max(x1 - 5, 0)

    # A strip without any height contains nothing to crop or extract
    if strip_bottom <= strip_top:
        return ""

    return page.within_bbox((x1, strip_top, x2, strip_bottom)).extract_text()

In [None]:
page_number = 5 # Counting from 1

# Detect the tables on the chosen page with Camelot.
tables = camelot.read_pdf(syllabus, pages=str(page_number), parallel=True)

# Open the same PDF with pdfplumber inside a context manager so that the file is
# closed again once the heading has been extracted.
with pdfplumber.open(syllabus) as pdf:
    # Annoying inconsistency: pdfplumber page numbers are 0-indexed whilst Camelot is 1-indexed
    page = pdf.pages[page_number - 1]

    # Extract the text sitting above the second table detected on the page.
    heading = extract_table_heading(tables[1], page)

print(heading)

# Combine Tables

In [None]:
# Re-read the syllabus, this time detecting tables on pages 1 to 4; this replaces the earlier `tables`.
tables = camelot.read_pdf(syllabus, pages="1-4", parallel=True)

In [None]:
from copy import deepcopy
import pandas as pd

def vstack_tables(x : camelot.core.Table, y : camelot.core.Table) -> camelot.core.Table:
    """
    Vertically concatenate two Camelot Table objects which have the same number of columns.
    Returns a new Table (a deep copy of `x`) with rows/cells/data from `y` appended below it.
    Neither `x` nor `y` is modified.

    Args:
        x (camelot.core.Table): The first (top) table.
        y (camelot.core.Table): The second (bottom) table to append under `x`.

    Returns:
        combined (camelot.core.Table): A new Table object containing rows from x followed by rows from y.

    Raises:
        ValueError: If the two tables do not have the same number of columns.
    """

    if len(x.cols) != len(y.cols): raise ValueError("Both tables must have the same number of columns")

    # Copy tables to prevent originals being modified
    a, b = deepcopy(x), deepcopy(y)

    # Bounding boxes are ordered (x1, y1, x2, y2), where y1 is the bottom and y2 is the top
    ax1, ay1, ax2, ay2 = a._bbox
    bx1, by1, bx2, by2 = b._bbox

    # Obtain vertical distance to shift table B so its top aligns with A's bottom
    vertical_distance = ay1 - by2

    # Offset the position of table B's rows so that they are directly underneath Table A's
    b.rows = [(row[0] + vertical_distance, row[1] + vertical_distance) for row in b.rows]

    # Do the same for table B's cells
    # NOTE(review): only y1/y2 are shifted; any other co-ordinates stored on a cell are left as-is
    for cell_row in b.cells:
        for cell in cell_row:
            cell.y1 += vertical_distance
            cell.y2 += vertical_distance

    # Append table B's rows/cells/data to A
    a.rows.extend(b.rows)
    a.cells.extend(b.cells)
    a.data.extend(b.data)

    # Recalculate attributes
    a.df = pd.DataFrame(a.data)
    a.shape = a.df.shape

    # Apply vertical shift to B's bbox y-values
    by1 += vertical_distance
    by2 += vertical_distance

    # Combine to create merged bbox covering both tables, keeping the (x1, y1, x2, y2) order
    a._bbox = (min(ax1, bx1), min(ay1, by1), max(ax2, bx2), max(ay2, by2))

    return a

In [None]:
def matching_columns(a : camelot.core.Table, b: camelot.core.Table):
    """
    Check whether two tables have the same number of columns.

    Args:
        a (camelot.core.Table): The first table.
        b (camelot.core.Table): The second table.

    Returns:
        Result of comparing the second entry of `a.shape` with the second entry of `b.shape`.
    """
    columns_in_a = a.shape[1]
    columns_in_b = b.shape[1]
    return columns_in_a == columns_in_b

def group_contiguous_tables(tables : camelot.core.TableList, matching_function = matching_columns) -> list[list[camelot.core.Table]]:
    """
    Collect runs of neighbouring tables which `matching_function` considers contiguous.

    Each table is compared with the one immediately before it. Whilst consecutive pairs keep
    matching they are gathered into the same run; the first non-matching pair closes the run.
    A table which matches neither of its neighbours does not appear in any run.
    By default, two tables match when they have the same number of columns.

    Args:
        tables (TableList): Camelot Tables, compared pairwise in the order given.
        matching_function (Callable): Called as `matching_function(previous, current)`;
            a truthy result means the two tables belong to the same run.

    Returns:
        groups (list[list[Table]]): One list per run, each holding the tables of that run in order.
    """

    runs, current_run = [], []

    # Start from the second table so that every table has a predecessor to compare against
    for position in range(1, len(tables)):
        previous, current = tables[position - 1], tables[position]

        if not matching_function(previous, current):
            # The run (if any) ends here; keep it and start afresh
            if current_run:
                runs.append(current_run)
            current_run = []
            continue

        # An empty run means `previous` has not been recorded yet
        if not current_run:
            current_run.append(previous)
        current_run.append(current)

    # Keep a run which was still open when the tables ran out
    if current_run:
        runs.append(current_run)

    return runs

def stack_contiguous_tables(tables : camelot.core.TableList) -> camelot.core.TableList:
    """
    Merge every group of contiguous tables into a single table.

    Args:
        tables (TableList): Camelot Tables to group and merge.

    Returns:
        stacked (TableList): One merged table per group found by `group_contiguous_tables`.
            NOTE(review): tables which end up in no group are not part of the result —
            confirm that dropping them is intended.
    """
    from functools import reduce

    table_groups = group_contiguous_tables(tables)

    # Fold each group pairwise from the top down, since `vstack_tables` joins exactly two
    # tables whilst a group may contain more than two
    stacked = [reduce(vstack_tables, group) for group in table_groups]

    return camelot.core.TableList(stacked)

In [None]:
# Replace `tables` with the result of merging its contiguous tables.
tables = stack_contiguous_tables(tables)

In [None]:
from copy import deepcopy
import pandas as pd

# NOTE(review): an earlier cell also defines `vstack_tables`; once this cell runs, this definition is the one in effect.
def vstack_tables(x : camelot.core.Table, y : camelot.core.Table) -> camelot.core.Table:
    """
    Vertically concatenate two Camelot Table objects which have the same number of columns.
    Returns a new Table (a deep copy of `x`) with rows/cells/data from `y` appended below it.
    Neither `x` nor `y` is modified.

    Args:
        x (camelot.core.Table): The first (top) table.
        y (camelot.core.Table): The second (bottom) table to append under `x`.

    Returns:
        combined (camelot.core.Table): A new Table object containing rows from x followed by rows from y.

    Raises:
        ValueError: If the two tables do not have the same number of columns.
    """

    if len(x.cols) != len(y.cols): raise ValueError("Both tables must have the same number of columns")

    # Copy tables to prevent originals being modified
    a, b = deepcopy(x), deepcopy(y)

    # Bounding boxes are ordered (x1, y1, x2, y2), where y1 is the bottom and y2 is the top
    ax1, ay1, ax2, ay2 = a._bbox
    bx1, by1, bx2, by2 = b._bbox

    # Obtain vertical distance to shift table B so its top aligns with A's bottom
    vertical_distance = ay1 - by2

    # Offset the position of table B's rows so that they are directly underneath Table A's
    b.rows = [(row[0] + vertical_distance, row[1] + vertical_distance) for row in b.rows]

    # Do the same for table B's cells
    # NOTE(review): only y1/y2 are shifted; any other co-ordinates stored on a cell are left as-is
    for cell_row in b.cells:
        for cell in cell_row:
            cell.y1 += vertical_distance
            cell.y2 += vertical_distance

    # Append table B's rows/cells/data to A
    a.rows.extend(b.rows)
    a.cells.extend(b.cells)
    a.data.extend(b.data)

    # Recalculate attributes
    a.df = pd.DataFrame(a.data)
    a.shape = a.df.shape

    # Apply vertical shift to B's bbox y-values
    by1 += vertical_distance
    by2 += vertical_distance

    # Combine to create merged bbox covering both tables, keeping the (x1, y1, x2, y2) order
    a._bbox = (min(ax1, bx1), min(ay1, by1), max(ax2, bx2), max(ay2, by2))

    return a

In [None]:
# Merge each group of tables into a single table.
# NOTE(review): `groups` is not assigned at top level by any visible cell — presumably it is
# left over from an earlier notebook session; verify before running the notebook top to bottom.
lists = [vstack_tables(*group)for group in groups]

In [None]:
# Wrap the merged tables in a Camelot TableList and evaluate to its first entry
# (as the last expression of a notebook cell, this is what gets shown as the cell output).
camelot.core.TableList(lists)[0]

In [None]:
# Build a fresh Table which re-uses the first table's columns and rows,
# then copy its bounding box, page and order across.
table2 = camelot.core.Table(tables[0].cols, tables[0].rows)
table2._bbox = tables[0]._bbox
table2.page = tables[0].page
table2.order = tables[0].order
table2