# Extract Tables

In [None]:
import pdfplumber

In [None]:
import camelot
# Path of the unit-outline PDF that the cells below extract tables from.
syllabus = "files/SWE30003_Unit Outline_2024_S1.pdf"
# Path of a second PDF; no other visible cell references it.
test = "files/foo.pdf"

In [None]:
import numpy as np

# Run Camelot's table extraction on page 5 of the syllabus PDF
# (the page selection is passed as a string).
tables = camelot.read_pdf(syllabus, pages="5", parallel=True)

from matplotlib import pyplot as plt

# Draw Camelot's "contour" plot for the first table found on that page
# and display the resulting matplotlib figure.
camelot.plot(tables[0], kind="contour")
plt.show()

# Extract Table Headings

In [None]:
def extract_table_heading(
    table: "camelot.core.Table",
    page: "pdfplumber.page.Page",
    top_margin: int = 40,
) -> str:
    """Return the text found in the strip directly above a Camelot table.

    The table's bounding box is converted from Camelot's bottom-left-origin
    coordinates to pdfplumber's top-left-origin coordinates, and the text
    inside a strip of height ``top_margin`` sitting on top of the table is
    extracted from ``page``.

    Args:
        table: Camelot table whose heading is wanted. Only its ``_bbox``
            attribute, read as ``(x1, y1, x2, y2)``, is used.
        page: The pdfplumber page the table was extracted from. Its
            ``height`` and ``within_bbox`` members are used.
        top_margin: Height of the strip above the table to search.

    Returns:
        The text extracted from the strip, or an empty string when the table
        touches the top of the page and there is no room above it.
    """
    # Camelot only exposes the bounding box through a pseudo-private attribute.
    left, _bottom, right, top = table._bbox

    # Camelot measures y upwards from the bottom edge, pdfplumber downwards
    # from the top edge, so the table's top edge is flipped against the page
    # height.
    table_top = page.height - top

    # Strip directly above the table. It is clamped to the page's top edge:
    # pdfplumber rejects a box that extends outside the page, which would
    # otherwise happen for a table closer than ``top_margin`` to the top.
    strip_top = max(table_top - top_margin, 0)

    # Widen the strip a little to the left so that the first letter of the
    # heading is not cut off (clamped to the page's left edge).
    strip_left = max(left - 5, 0)

    # No vertical room above the table: an empty crop box is not a valid
    # pdfplumber region, and there is no heading to read anyway.
    if table_top <= strip_top:
        return ""

    return page.within_bbox((strip_left, strip_top, right, table_top)).extract_text()

In [None]:
page_number = 5  # Counting from 1

tables = camelot.read_pdf(syllabus, pages=str(page_number), parallel=True)

# Open the PDF in a context manager so that the underlying file handle is
# released once the heading has been read.
with pdfplumber.open(syllabus) as pdf:
    # pdfplumber's page list is 0-indexed whilst Camelot page numbers are
    # 1-indexed, hence the "- 1".
    page = pdf.pages[page_number - 1]

    # Heading of the second table that Camelot found on this page.
    heading = extract_table_heading(tables[1], page)

print(heading)

# Combine Tables

In [None]:
# Re-run the extraction with pages="all" so that `tables` holds the tables of
# the whole document rather than of a single page.
tables = camelot.read_pdf(syllabus, pages="all", parallel=True)

In [None]:
from copy import deepcopy
from typing import Callable, List

import pandas as pd
from camelot.core import Table, TableList

def vstack_tables(x: Table, y: Table) -> Table:
    """Stack ``y`` underneath ``x`` and return the result as a new Table.

    Neither input is modified: both are deep-copied first. The copy of ``y``
    is moved vertically so that its top edge sits on the bottom edge of the
    copy of ``x``; its rows, cells and data are then appended to the copy of
    ``x``, whose ``df``, ``shape`` and ``_bbox`` are refreshed.

    Args:
        x (Table): The upper table.
        y (Table): The lower table, placed below ``x``.

    Returns:
        Table: A copy of ``x`` extended with the (shifted) contents of ``y``.

    Raises:
        ValueError: If ``x`` and ``y`` differ in their number of columns.
    """
    if len(x.cols) != len(y.cols):
        raise ValueError("Both tables must have the same number of columns")

    # Work on private copies so the caller's tables stay untouched.
    upper, lower = deepcopy(x), deepcopy(y)

    # Bounding boxes are read as (x1, y1, x2, y2); y1 is the bottom edge and
    # y2 the top edge.
    upper_x1, upper_y1, upper_x2, upper_y2 = upper._bbox
    lower_x1, lower_y1, lower_x2, lower_y2 = lower._bbox

    # Offset that moves the lower table's top edge onto the upper table's
    # bottom edge.
    offset = upper_y1 - lower_y2

    # Move the lower table's row boundaries ...
    lower.rows = [(edge_a + offset, edge_b + offset) for edge_a, edge_b in lower.rows]

    # ... and every one of its cells by the same amount.
    for cell in (c for cell_row in lower.cells for c in cell_row):
        cell.y1 += offset
        cell.y2 += offset

    # Append the lower table's content below the upper table's.
    upper.rows.extend(lower.rows)
    upper.cells.extend(lower.cells)
    upper.data.extend(lower.data)

    # Derived attributes have to follow the enlarged content.
    upper.df = pd.DataFrame(upper.data)
    upper.shape = upper.df.shape

    # The merged bounding box is the union of the upper box and the shifted
    # lower box.
    upper._bbox = (
        min(upper_x1, lower_x1),
        min(upper_y1, lower_y1 + offset),
        max(upper_x2, lower_x2),
        max(upper_y2, lower_y2 + offset),
    )

    return upper


def matching_columns(a: Table, b: Table) -> bool:
    """Tell whether two Camelot tables are equally wide.

    The width of a table is taken from the second entry of its ``shape``.

    Args:
        a (Table): One table.
        b (Table): The table to compare it with.

    Returns:
        bool: True when both tables report the same column count, otherwise
        False.
    """
    width_a = a.shape[1]
    width_b = b.shape[1]
    return width_a == width_b


def group_contiguous_tables(
    tables: TableList,
    matching_function: Callable[[Table, Table], bool] = matching_columns,
) -> List[List[Table]]:
    """Collect runs of neighbouring tables that belong together.

    Every pair of neighbours in ``tables`` is passed to ``matching_function``.
    An unbroken chain of matching pairs forms one group; a non-matching pair
    ends the chain. A table that matches neither of its neighbours does not
    appear in any group.

    Args:
        tables (TableList): Camelot tables in document order.
        matching_function (Callable[[Table, Table], bool], optional):
            Predicate deciding whether two neighbouring tables belong to the
            same group. Defaults to ``matching_columns``.

    Returns:
        List[List[Table]]: The groups found, in order. Each group holds at
        least two tables.
    """
    found: List[List[Table]] = []
    run: List[Table] = []

    neighbours = zip(tables, tables[1:])
    for left, right in neighbours:
        if not matching_function(left, right):
            # The chain is broken here: store what has been collected so far.
            if run:
                found.append(run)
                run = []
            continue

        # A fresh run starts with the left-hand table of its first match.
        run = run or [left]
        run.append(right)

    # A run that reaches the end of the list still has to be stored.
    if run:
        found.append(run)

    return found


def stack_contiguous_tables(tables: TableList) -> TableList:
    """Stack groups of contiguous tables into single tables.

    Groups contiguous tables (using :func:`group_contiguous_tables`) and
    vertically concatenates each group, top to bottom, by folding
    :func:`vstack_tables` over it. A group may therefore contain any number
    of tables, e.g. one table continued over three pages.

    Only tables that :func:`group_contiguous_tables` places in a group appear
    in the result.

    Args:
        tables (TableList): A list of Camelot Table objects.

    Returns:
        TableList: A new TableList with one merged table per group.
    """
    # Local import keeps this cell self-contained.
    from functools import reduce

    table_groups = group_contiguous_tables(tables)

    # vstack_tables accepts exactly two tables, so a group is merged pairwise
    # from the top down; a one-element group is returned unchanged by reduce.
    merged_tables: List[Table] = [
        reduce(vstack_tables, group) for group in table_groups
    ]

    return TableList(merged_tables)


In [None]:
# Show the first extracted table's DataFrame before any merging.
tables[0].df

In [None]:
# Merge contiguous tables with the helper defined above.
tables_stacked = stack_contiguous_tables(tables)

In [None]:
# Show the DataFrame of the first merged table.
tables_stacked[0].df

In [None]:
from copy import deepcopy
import pandas as pd

def vstack_tables(x: "camelot.core.Table", y: "camelot.core.Table") -> "camelot.core.Table":
    """
    Vertically concatenate two Camelot Table objects which have the same number of columns.
    Returns a new Table (a deep copy of `x`) with rows/cells/data from `y` appended below it.

    Args:
        x (camelot.core.Table): The first (top) table.
        y (camelot.core.Table): The second (bottom) table to append under `x`.

    Returns:
        combined (camelot.core.Table): A new Table object containing rows from x followed by rows from y.

    Raises:
        ValueError: If `x` and `y` have different numbers of columns.
    """
    if len(x.cols) != len(y.cols):
        raise ValueError("Both tables must have the same number of columns")

    # Copy tables to prevent originals being modified
    a, b = deepcopy(x), deepcopy(y)

    # Bounding boxes are (x1, y1, x2, y2) with y1 the bottom and y2 the top edge
    ax1, ay1, ax2, ay2 = a._bbox
    bx1, by1, bx2, by2 = b._bbox

    # Vertical distance to shift table B so its top aligns with A's bottom
    vertical_distance = ay1 - by2

    # Offset the position of table B's rows so that they are directly underneath table A's
    b.rows = [(row[0] + vertical_distance, row[1] + vertical_distance) for row in b.rows]

    # Do the same for table B's cells
    for cell_row in b.cells:
        for cell in cell_row:
            cell.y1 += vertical_distance
            cell.y2 += vertical_distance

    # Append table B's rows/cells/data to A
    a.rows.extend(b.rows)
    a.cells.extend(b.cells)
    # NOTE(review): if `data` is a computed property on camelot's Table, this
    # extend only touches a temporary list and the appended cells above are
    # what actually carries B's content - confirm against the Camelot source.
    a.data.extend(b.data)

    # Recalculate attributes
    a.df = pd.DataFrame(a.data)
    a.shape = a.df.shape

    # Recompute bounding box to cover both tables, with B's y-values shifted.
    # The tuple must keep the (x1, y1, x2, y2) order used when reading it.
    by1 += vertical_distance
    by2 += vertical_distance
    a._bbox = (min(ax1, bx1), min(ay1, by1), max(ax2, bx2), max(ay2, by2))

    return a

In [None]:
from functools import reduce

# NOTE(review): `groups` is not assigned in any visible cell; presumably it is
# the result of group_contiguous_tables(tables) - confirm before running the
# notebook top to bottom.
# vstack_tables takes exactly two tables, so each group is merged pairwise
# with reduce; unpacking with * would fail for groups of three or more.
lists = [reduce(vstack_tables, group) for group in groups]

In [None]:
# Wrap the merged tables in a Camelot TableList and show its first entry.
camelot.core.TableList(lists)[0]

In [None]:
# Build a new Table on the same column/row grid as the first extracted table,
# then copy over its bounding box, page and order.
table2 = camelot.core.Table(tables[0].cols, tables[0].rows)
table2._bbox = tables[0]._bbox
table2.page = tables[0].page
table2.order = tables[0].order
table2