# Extract Connections List and Device tag lists

In [6]:
import pymupdf  # import PyMuPDF

# Fail fast: the extraction code below calls Page.find_tables, so stop here
# if the installed PyMuPDF does not provide it.
if not hasattr(pymupdf.Page, "find_tables"):
    raise RuntimeError("This PyMuPDF version does not support the table feature")

In [7]:
# Open the sample document and keep one page at hand for experiments.
doc = pymupdf.open("pdfs/sample.pdf")
page = doc[29]  # zero-based page index

# Visualization functions
These helpers should be moved into a common debug library.

In [323]:
def show_as_image(item, title=""):
    """Render ``item`` to a pixmap and show it with matplotlib.

    ``item`` must provide ``get_pixmap(dpi=...)``.  The pixmap samples are
    interpreted as an ``h x w x 3`` array of bytes, and the axes are scaled
    by ``72 / DPI`` so that they read in points rather than pixels.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    resolution = 300  # rendering resolution in dots per inch

    # %matplotlib inline
    pixmap = item.get_pixmap(dpi=resolution)
    image_shape = [pixmap.h, pixmap.w, 3]
    pixels = np.ndarray(image_shape, dtype=np.uint8, buffer=pixmap.samples_mv)

    plt.figure(dpi=resolution)  # figure uses the same DPI as the pixmap
    plt.title(title)
    right_edge = pixmap.w * 72 / resolution
    bottom_edge = pixmap.h * 72 / resolution
    _ = plt.imshow(pixels, extent=(0, right_edge, bottom_edge, 0))


def show_table_overview(page, tables):
    """Draw the bounding boxes of all tables on ``page`` and display it.

    Header cells are outlined in red, body cells in blue and the table
    bounding box in green.  For every table its header names and the
    ``external`` flag of the header are printed.  The rectangles are drawn
    onto ``page`` itself.
    """
    for index, table in enumerate(tables):
        header_cells = [cell for cell in table.header.cells if cell is not None]
        for cell in header_cells:
            page.draw_rect(cell, color=pymupdf.pdfcolor["red"], width=2)

        for row in table.rows:
            for cell in row.cells:
                if cell is None:
                    continue  # no rectangle for this cell
                page.draw_rect(cell, color=(0, 0, 1), width=1)  # blue

        page.draw_rect(table.bbox, color=pymupdf.pdfcolor["green"])
        header = table.header
        print(f"Table {index} column names: {header.names}, external: {header.external}")
    show_as_image(page, "Table & Header BBoxes")


def show_text_overview(page, text):
    """Outline every text item on ``page`` in red and display the page.

    Args:
        page: Page to draw on; the rectangles are added to the page itself.
        text: Iterable of tuples whose first four items are the bounding
            box ``(x0, y0, x1, y1)``.
    """
    for block in text:
        # Only the bounding box is needed.  Slicing keeps this independent
        # of the tuple length and avoids rebinding the ``text`` parameter.
        x0, y0, x1, y1 = block[:4]
        rect = pymupdf.Rect(x0, y0, x1, y1)
        page.draw_rect(rect, color=(1, 0, 0), width=1)  # red rectangle
    show_as_image(page, "Text BBoxes")


def print_table_overview(tables):
    """Print the header column names of each table, prefixed by its index."""
    for position, table in enumerate(tables):
        names = table.header.names
        print(f"Table {position} column names: {names}")

# Core logic code
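Call `extract` with a list of pages and a `TableType`.
It returns a single DataFrame with a header (or `None` if nothing could be extracted).
If a page fails, the error is only PRINTED and the data from that page is skipped (TODO: proper error handling).
An unsupported table type is NOT handled gracefully: it raises an assertion error.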

In [14]:
import pandas as pd
import numpy as np

# Reference page size (width, height) in points.  getClipRect divides the
# clip coordinates it receives by these values before scaling them to the
# actual page size.
PAPER_A3 = (1191.05, 842.39)


def detect_overlaps(text_blocks):
    """Find all pairs of text items whose bounding boxes intersect.

    Each item is an 8-tuple starting with ``(x0, y0, x1, y1, text)``.
    Boxes that merely touch along an edge are not counted as overlapping.

    Returns:
        List of ``(text_a, text_b, bbox_a, bbox_b)`` tuples, one per
        overlapping pair, where ``a`` precedes ``b`` in ``text_blocks``.
    """
    found = []

    for i, first in enumerate(text_blocks):
        ax0, ay0, ax1, ay1, a_text, _, _, _ = first
        for j, second in enumerate(text_blocks):
            bx0, by0, bx1, by1, b_text, _, _, _ = second
            if j <= i:
                continue  # each unordered pair is examined once
            apart_horizontally = ax1 <= bx0 or bx1 <= ax0
            apart_vertically = ay1 <= by0 or by1 <= ay0
            if apart_horizontally or apart_vertically:
                continue
            found.append(
                (a_text, b_text, (ax0, ay0, ax1, ay1), (bx0, by0, bx1, by1))
            )

    return found


# TODO: temporary workaround for now (then try to fix the errors)
def detect_row_overlaps(table, overlaps):
    """
    Given a table and a list of overlapping text tuples,
    determine which table rows are affected.

    Args:
        table: Table object exposing ``rows``; every row provides ``bbox``.
        overlaps: Tuples ``(text1, text2, rect1, rect2)`` in the format
            returned by ``detect_overlaps``.

    Returns:
        List of zero-based positions in ``table.rows`` whose bounding box
        intersects at least one of the overlapping rectangles.  Each row is
        reported once, in order of first detection.
    """
    if not overlaps:
        return []

    affected_rows = []
    seen = set()
    # Row rectangles do not depend on the overlap pair - build them once.
    row_rects = [pymupdf.Rect(row.bbox) for row in table.rows]

    for _text1, _text2, rect1, rect2 in overlaps:
        first = pymupdf.Rect(rect1)
        second = pymupdf.Rect(rect2)
        for r, rect in enumerate(row_rects):
            if r in seen:
                continue  # already reported for an earlier overlap
            if rect.intersects(first) or rect.intersects(second):
                seen.add(r)
                affected_rows.append(r)

    return affected_rows


def demote_header(df, header=None):
    """Turn the column names of ``df`` into its first data row.

    The result uses ``header`` as column names; when ``header`` is omitted,
    every column is named with an empty string.  The row index is renumbered.
    """
    new_names = header if header is not None else [""] * len(df.columns)
    former_header = pd.DataFrame([df.columns], columns=new_names)
    body = pd.DataFrame(df.values, columns=new_names)
    return pd.concat([former_header, body], ignore_index=True)


def promote_header(df):
    """Return a new frame that uses the first row of ``df`` as column names."""
    first_row = df.values[0]
    remaining_rows = df.values[1:]
    return pd.DataFrame(remaining_rows, columns=first_row)


def forward_fill(df, column, replacement="="):
    """Replace each ``replacement`` marker in ``column`` by the value above it.

    The column of ``df`` is overwritten in place and ``df`` is returned.

    Raises:
        ValueError: if a marker appears before any value was seen.
    """
    last_seen = None
    filled = []
    for cell in df[column]:
        if cell != replacement:
            last_seen = cell
        elif last_seen is None:
            raise ValueError(
                f"Forward fill table column {column} has {replacement} before any value was specified"
            )
        # Here last_seen is either the cell itself or the value carried down.
        filled.append(last_seen)
    df[column] = filled
    return df


def getClipRect(w, h, x0, y0, w0, h0):
    """Scale a rectangle given in reference-page coordinates to a real page.

    ``x0`` and ``w0`` are divided by the reference width ``PAPER_A3[0]`` and
    multiplied by ``w``; ``y0`` and ``h0`` are divided by the reference
    height ``PAPER_A3[1]`` and multiplied by ``h``.  The four results are
    passed to ``pymupdf.Rect`` in that order.

    NOTE(review): despite their names, ``w0``/``h0`` appear to be the far
    corner (x1, y1) of the rectangle rather than a width/height - confirm.
    """
    ref_width, ref_height = PAPER_A3
    left = x0 / ref_width * w
    top = y0 / ref_height * h
    right = w0 / ref_width * w
    bottom = h0 / ref_height * h
    return pymupdf.Rect(left, top, right, bottom)


def extract(pages, what):
    """Extract tables of kind ``what`` from ``pages`` into one DataFrame.

    The processor registered for ``what`` in ``options`` is run on every
    page.  A page whose processor raises or returns ``None`` is reported via
    ``print`` and skipped.  If ``header_map`` has an entry for ``what`` with
    a matching number of columns, the column names are replaced by it.

    Returns:
        The concatenated DataFrame, or ``None`` when ``pages`` is empty or no
        page produced a table.

    Raises:
        AssertionError: if no processor is registered for ``what`` or the
            concatenated frame has a different column count than the first
            page's table.
    """
    print(f"Extracting '{what}' from {len(pages)} pages...")
    if len(pages) == 0:
        return None

    processor = options.get(what, None)
    assert (
        processor is not None
    ), f"Specified table type '{what}' does not have a processor"

    # Collect one frame per successfully processed page.
    collected = []
    for page in pages:
        page_no = page.number + 1
        print(f"Extracting '{what}' from page #{page_no}")
        try:
            frame = processor(page)
        except ValueError as ve:
            print(f"ValueError extracting '{what}' from page #{page_no}: {ve}")
        except Exception as e:
            print(f"Unexpected error extracting '{what}' from page #{page_no}: {e}")
        else:
            if frame is None:
                print(f"Could not extract '{what}' from page #{page_no}: got None")
            else:
                collected.append(frame)

    if len(collected) == 0:
        return None

    df = pd.concat(collected, ignore_index=True)
    assert df.shape[1] == collected[0].shape[1], "Table headers do not match"

    # translate table header (for now just to english)
    if what not in header_map:
        return df
    translated = header_map[what]
    if len(translated) == df.shape[1]:
        df.columns = translated
    else:
        print(
            f"Internationalization error: table shape mismatch: {len(translated)} vs {df.shape[1]}"
        )
    return df


def extract_connection_list(page):
    """Extract the connection list from ``page`` as a single DataFrame.

    The second table found on the page is used; if a third table exists it
    is appended below it, preceded by its own header names as a data row.
    All parts share the original header names of the second table.

    Raises:
        ValueError: if fewer than two tables are found, or if the two
            tables to combine have a different number of columns.
    """
    tables = list(page.find_tables())
    if len(tables) < 2:
        raise ValueError("No required tables found on the page")

    # TODO selection logic
    t1 = tables[1]
    t2 = tables[2] if len(tables) > 2 else None

    # Preserve header - to_pandas breaks empty fields
    header = list(t1.header.names)
    df1 = t1.to_pandas()
    # Restore the original names so every part concatenated below has
    # identical columns; otherwise renamed columns would not line up.
    df1.columns = header

    # Single table case
    if t2 is None:
        return df1

    # Two tables case - combine them
    if t1.col_count != t2.col_count:
        raise ValueError(
            f"Column count mismatch between tables: {t1.col_count} vs {t2.col_count}"
        )

    # Read the header names of the second table before calling to_pandas on it.
    header_row = pd.DataFrame([t2.header.names], columns=header)
    df2 = pd.DataFrame(t2.to_pandas().values, columns=header)

    return pd.concat([df1, header_row, df2], ignore_index=True)


def extract_device_tag_list_de(page):
    """Extract the German device list from a landscape ``page``.

    Tables are searched only inside a fixed clip area scaled to the page
    size; the first table found is returned as a DataFrame.

    Raises:
        ValueError: if the page is not wider than high, or no table is
            found in the clip area.
    """
    width, height = page.rect.width, page.rect.height
    if height >= width:
        raise ValueError(
            f"Album orientation expected, found: width={width}, height={height}."
        )

    # TODO cliprect check
    clip_area = getClipRect(width, height, 33, 132, 1170, 780)
    found = list(page.find_tables(clip=clip_area))
    if len(found) == 0:
        raise ValueError("No tables found in the specified clip area on the page")
    first_table = found[0]
    return first_table.to_pandas()


def extract_device_tag_list(page):
    """Extract the device tag list from ``page``.

    The second table found on the page is used, and its first data row
    becomes the header of the returned DataFrame.

    Raises:
        ValueError: if the page has fewer than two tables.
    """
    tables = list(page.find_tables())
    # The second table is read below, so a single table is not enough.
    if len(tables) < 2:
        raise ValueError("No required tables found on the page")
    # TODO selection logic
    t = tables[1]
    df = t.to_pandas()
    #
    return pd.DataFrame(df.values[1:], columns=df.values[0])  # use 1st row as header


def extract_cable_overview(page):
    """Extract the cable overview table from a landscape ``page``.

    The first table inside a fixed clip area is used.  Its first data row
    becomes the header, columns with an empty or ``None`` name are dropped,
    and the second remaining column is split at spaces into the new columns
    ``from`` and ``to`` (appended at the end).

    Raises:
        ValueError: if the page is not wider than high, no table is found,
            or the split does not yield exactly two columns.
    """
    w = page.rect.width
    h = page.rect.height
    if w <= h:
        raise ValueError(f"Album orientation expected, found: width={w}, height={h}.")

    tables = list(page.find_tables(clip=getClipRect(w, h, 33, 33, 1170, 780)))
    if not tables:
        raise ValueError("No tables found on the page")
    df = tables[0].to_pandas()

    # use 1st row as header
    df = pd.DataFrame(df.values[1:], columns=df.values[0])

    # drop empty/None cols
    df = df.drop(columns=[col for col in df.columns if col is None or col == ""])

    # disjoin "from to" column
    col_to_drop = df.columns[1]
    split_cols = df[col_to_drop].str.split(" ", expand=True)
    n_parts = split_cols.shape[1]
    if n_parts != 2:
        # The check is for exactly two parts, so the message covers both
        # directions of the mismatch.
        raise ValueError(
            f"Expected exactly 2 columns after split, got {n_parts}: "
            "fewer means the 'from to' separator is missing, "
            "more means some name had spaces in it!"
        )
    df[["from", "to"]] = split_cols
    df = df.drop(columns=[col_to_drop])
    #
    return df


def extract_cable_plan(page):
    """Extract a cable plan from a landscape ``page`` into one DataFrame.

    The result places side by side: the first table of the left clip area,
    the first table of the right clip area, a ``Cabel`` column holding the
    cable tag read from the page header area, and the two conductor columns
    ``Source conductor`` / ``Target conductor`` read from the middle area.

    Raises:
        ValueError: if the page is not wider than high, a required table is
            missing, or no cable tag text is found.
    """
    w = page.rect.width
    h = page.rect.height
    if w <= h:
        raise ValueError(f"Album orientation expected, found: width={w}, height={h}.")
    #
    # Will not work for German!
    # extract connections
    tables_left = list(page.find_tables(clip=getClipRect(w, h, 10, 98, 400, 780)))
    if not tables_left:
        raise ValueError("No required tables found on the page")
    df_left = tables_left[0].to_pandas()
    tables_right = list(page.find_tables(clip=getClipRect(w, h, 790, 98, 1185, 780)))
    if not tables_right:
        raise ValueError("No required tables found on the page")
    df_right = tables_right[0].to_pandas()
    df = pd.concat([df_left, df_right], axis=1)
    #
    # extract current device
    # 80 for English; 136 German!
    # Strip surrounding whitespace so the tag carries no trailing newline and
    # whitespace-only text is treated as "not found".
    text = page.get_text("text", clip=getClipRect(w, h, 114, 29, 400, 47)).strip()
    if not text:
        raise ValueError("Failed to detect Cabel Tag")
    df["Cabel"] = text
    # other TODO unstable
    # tables_main = page.find_tables(strategy="text", clip=getClipRect(w, h, 33, 132, 1170, 203))
    # 120 for English; 230 German!
    tables_typ = list(
        page.find_tables(strategy="text", clip=getClipRect(w, h, 441, 120, 750, 780))
    )
    typ = None
    if tables_typ:
        typ_header = list(
            tables_typ[0].header.names
        )  # Preserve header - to_pandas breaks it
        typ = tables_typ[0].to_pandas()
        typ.columns = typ_header  # restore original names (.to_pandas breaks it)
        # The detected header is actually the first data row - push it down.
        typ = demote_header(typ, ["Source conductor", "Target conductor"])
        typ = typ.iloc[::2].reset_index(drop=True)  # remove empty rows
    else:
        # fallback: read the plain text and pair up consecutive non-empty lines
        text_typ = page.get_text("text", clip=getClipRect(w, h, 441, 120, 750, 780))
        rows = [line for line in text_typ.split("\n") if line.strip()]  # flat list
        rows = [
            rows[i : i + 2] for i in range(0, len(rows), 2)
        ]  # Group every 2 items into a row
        typ = pd.DataFrame(rows, columns=["Source conductor", "Target conductor"])
    df = pd.concat([df, typ], axis=1)

    return df


def extract_topology(page):
    """Extract the topology table from a landscape ``page``.

    Returns the first table found inside a fixed clip area as a DataFrame.

    Raises:
        ValueError: if the page is not wider than high, or no table is found.
    """
    width, height = page.rect.width, page.rect.height
    if height >= width:
        raise ValueError(
            f"Album orientation expected, found: width={width}, height={height}."
        )

    clip_area = getClipRect(width, height, 33, 75, 1170, 780)
    found = list(page.find_tables(clip=clip_area))
    if len(found) == 0:
        raise ValueError("No required tables found on the page")

    first_table = found[0]
    return first_table.to_pandas()


def extract_wires_part_list(page):
    """Extract the wires parts list from a landscape ``page``.

    The first table inside a fixed clip area is used and its first data row
    becomes the header of the returned DataFrame.

    Raises:
        ValueError: if the page is not wider than high, or no table is found.
    """
    width, height = page.rect.width, page.rect.height
    if height >= width:
        raise ValueError(
            f"Album orientation expected, found: width={width}, height={height}."
        )

    clip_area = getClipRect(width, height, 33, 70, 1170, 780)
    found = list(page.find_tables(clip=clip_area))
    if len(found) == 0:
        raise ValueError("No required tables found on the page")

    raw = found[0].to_pandas()
    # The first row holds the real column names.
    column_names = raw.values[0]
    body = raw.values[1:]
    return pd.DataFrame(body, columns=column_names)


def extract_cable_diagram(page):
    """Extract all cable blocks of a cable diagram page into one DataFrame.

    The page yields one big table in which several cable blocks follow each
    other.  A block starts with two consecutive "info" rows (columns 1 and 2
    empty); the last space-separated token of the first info row's first cell
    is taken as the cable name.  The next row is the block header, and the
    rows up to the next pair of info rows (or the end) are the block's data.
    Every block gets a ``Cable`` column with its cable name.

    Raises:
        ValueError: if the page is not wider than high, no table is found,
            a block's info rows are not followed by a header row, or no
            block is detected at all.
    """
    w = page.rect.width
    h = page.rect.height
    if w <= h:
        raise ValueError(f"Album orientation expected, found: width={w}, height={h}.")
    #
    tables = list(page.find_tables(clip=getClipRect(w, h, 33, 70, 1170, 780)))
    if not tables:
        raise ValueError("No required tables found on the page")
    # No way I can separate tables by coords - need to detect here
    df = tables[0].to_pandas()
    df = df.replace({None: np.nan})
    n_rows = len(df)

    def is_info_row(idx):
        # Bounds-checked, so probing one row past the end is safe.
        return idx < n_rows and df.iloc[idx, 1:3].isna().all()

    def starts_block(idx):
        # Detect start of a block: two rows with col2 & col3 missing
        return is_info_row(idx) and is_info_row(idx + 1)

    i = 0
    blocks = []
    while i < n_rows:
        if not starts_block(i):
            i += 1
            continue

        cable_name = df.iloc[i, 0].split(" ")[-1]  # Here cable name is located
        i += 2
        # Ignore all info as duplicated and hard to extract

        if i >= n_rows:
            raise ValueError(
                f"Cable block '{cable_name}' has no header row after its info rows"
            )
        header = df.iloc[i].tolist()
        i += 1

        # Gather table rows until next info block or end
        rows = []
        while i < n_rows and not starts_block(i):
            rows.append(df.iloc[i].tolist())
            i += 1

        table = pd.DataFrame(rows, columns=header)
        table["Cable"] = cable_name
        blocks.append(table)

    if not blocks:
        raise ValueError("No cable blocks detected in the table")
    return pd.concat(blocks, ignore_index=True)


def extract_terminal_diagram(page):
    """Extract a terminal diagram from a landscape ``page`` into one DataFrame.

    Five fixed clip areas are read: cable info and cable/color assignment on
    the left, the terminal connections in the centre, and cable info and
    cable/color assignment on the right.  For every connection row the
    assigned cables and colors of each side are joined with ``"; "`` and
    placed left and right of the centre columns.  The strip name (second
    line of the centre table's first column name) is inserted as the first
    column ``Strip``.

    Rows of the centre table that intersect overlapping words are dropped
    from all three connection tables, with a warning printed.

    Raises:
        ValueError: if the page is not wider than high, one of the five
            tables is missing, or the left/right assignments do not have as
            many rows as the connection table.
    """
    w = page.rect.width
    h = page.rect.height
    if w <= h:
        raise ValueError(f"Album orientation expected, found: width={w}, height={h}.")
    # Left side Cables
    tables = list(page.find_tables(clip=getClipRect(w, h, 20, 33, 410, 237)))
    if not tables:
        raise ValueError("No required tables found on the page: left cable info")
    header = list(tables[0].header.names)  # preserve header as to_pandas breaks it
    df = tables[0].to_pandas()
    df.columns = header  # restore original names (.to_pandas breaks it)
    df_l_cables = demote_header(df)
    # Left side Cable Connections
    tables = list(page.find_tables(clip=getClipRect(w, h, 20, 237, 410, 780)))
    if not tables:
        raise ValueError("No required tables found on the page: left color info")
    df_l_conn = tables[0].to_pandas()
    # Connections
    tables = list(page.find_tables(clip=getClipRect(w, h, 410, 33, 780, 780)))
    if not tables:
        raise ValueError("No required tables found on the page: connection info")
    overlaps = detect_overlaps(
        page.get_text("words", clip=getClipRect(w, h, 410, 33, 780, 780))
    )
    overlapping_rows = []
    if overlaps:
        overlapping_rows = detect_row_overlaps(tables[0], overlaps)
    df_center = tables[0].to_pandas()
    # Right side Cables
    tables = list(page.find_tables(clip=getClipRect(w, h, 780, 33, 1170, 237)))
    if not tables:
        raise ValueError("No required tables found on the page: right cable info")
    header = list(tables[0].header.names)  # preserve header as to_pandas breaks it
    df = tables[0].to_pandas()
    df.columns = header  # restore original names (.to_pandas breaks it)
    df_r_cables = demote_header(df)
    # Right side Cable Connections
    tables = list(page.find_tables(clip=getClipRect(w, h, 780, 237, 1170, 780)))
    if not tables:
        raise ValueError("No required tables found on the page: right color info")
    df_r_conn = tables[0].to_pandas()

    # here I have to do preprocessing to make a single df
    strip_info = df_center.columns[0]
    strip_name = strip_info.splitlines()[1]

    df = promote_header(df_center)

    # For now I will just delete overlapping rows
    if overlapping_rows:
        print("OVERLAP DETECTED! ERROR HANDLING IS NOT DONE YET! FOR NOW JUST IGNORED!")
        # -2 as promote_header & index counts from 1 in detect_row_overlaps
        adjusted_rows = [i - 2 for i in overlapping_rows]
        df = df.drop(index=adjusted_rows).reset_index(drop=True)
        df_l_conn = df_l_conn.drop(index=adjusted_rows).reset_index(drop=True)
        df_r_conn = df_r_conn.drop(index=adjusted_rows).reset_index(drop=True)

    #
    def transform_dataframe(df_cables, df_conn):
        # Build one row per connection: joined cable tags, joined colors,
        # then the values of all columns whose name is not a number.
        rows = []
        number_cols = [col for col in df_conn.columns if col.isdigit()]
        non_number_cols = [col for col in df_conn.columns if col not in number_cols]
        columns = ["Cable", "Color"] + non_number_cols
        # for each connection
        for _, row in df_conn.iterrows():
            non_number_values = row[non_number_cols].tolist()
            cable_info_list = []
            color_list = []
            for col in number_cols:
                color = row[col]
                if pd.notna(color) and color.strip() != "":
                    # assume it is convertible to int as we did isdigit
                    cable_index = int(col) - 1
                    # TODO might be wrong as cable tag might have spaces (?)
                    cable_info = df_cables.iloc[cable_index, 1].split(" ")[0]
                    # Extract a TAG from cable_info
                    cable_info_list.append(cable_info)
                    color_list.append(color)
            rows.append(
                ["; ".join(cable_info_list), "; ".join(color_list)] + non_number_values
            )
        return pd.DataFrame(rows, columns=columns)

    # Check if the number of rows in the transformed dataframes matches the number of rows in df
    left_transformed = transform_dataframe(df_l_cables, df_l_conn)
    right_transformed = transform_dataframe(df_r_cables, df_r_conn)
    if left_transformed.shape[0] != df.shape[0]:
        raise ValueError(
            f"Left cable assignment ({left_transformed.shape[0]}) does not match connections ({df.shape[0]})"
        )
    if right_transformed.shape[0] != df.shape[0]:
        raise ValueError(
            f"Right cable assignment ({right_transformed.shape[0]}) does not match connections ({df.shape[0]})"
        )

    # Prepend left_transformed, append right_transformed
    df = pd.concat(
        [
            left_transformed.reset_index(drop=True),
            df.reset_index(drop=True),
            right_transformed.reset_index(drop=True),
        ],
        axis=1,
    )

    # clean empty rows
    df = df.where(df != "").dropna(how="all")
    # insert strip name as 1st column
    df.insert(0, "Strip", strip_name)

    return df


from enum import Enum


class TableType(Enum):
    """Kinds of tables that ``extract`` can be asked for.

    The members are used as keys of ``options`` (processor per table kind)
    and ``header_map`` (replacement column names); the values are
    descriptive labels.
    """

    CONNECTION_LIST = "connection list"
    DEVICE_TAG_LIST = "device tag list"
    DEVICE_LIST_DE = "artikelstückliste"
    CABLE_OVERVIEW = "Cable overview | Kabelübersicht"
    CABLE_PLAN = "Cable plan | Kabelplan"
    TOPOLOGY = "Topology: Routed cables / connections"
    WIRES_PART_LIST = "Wires parts list"
    TERMINAL_DIAGRAM = "Terminal diagram | Klemmenplan"
    CABLE_DIAGRAM = "Cable diagram"


# Dispatch table used by extract(): maps each table kind to the function
# that extracts it from a single page.
options = {
    TableType.CONNECTION_LIST: extract_connection_list,
    TableType.DEVICE_TAG_LIST: extract_device_tag_list,
    TableType.DEVICE_LIST_DE: extract_device_tag_list_de,
    TableType.CABLE_OVERVIEW: extract_cable_overview,
    TableType.CABLE_PLAN: extract_cable_plan,
    TableType.TOPOLOGY: extract_topology,
    TableType.WIRES_PART_LIST: extract_wires_part_list,
    TableType.TERMINAL_DIAGRAM: extract_terminal_diagram,
    TableType.CABLE_DIAGRAM: extract_cable_diagram,
}

# Replacement column names applied by extract() after concatenation.  An
# entry is used only when its length equals the number of columns in the
# extracted frame; otherwise extract() prints a mismatch message.
header_map = {
    TableType.CABLE_OVERVIEW: [
        "Cable designation",
        "Cable type",
        "Conductors",
        "ø",
        "Length",
        "Function text",
        "From",
        "To",
    ],
}

# Examples

In [31]:
# Use the TableExtractor class from the table_extractor module.
from table_extractor import TableExtractor


table_extractor = TableExtractor()

In [33]:
from page_processor import PageType

# Single-page slice 152:153, reported as page #153 in the output.
table_extractor.extract(doc[152:153], PageType.CABLE_PLAN_DE)

Extracting 'PageType.CABLE_PLAN_DE' from 1 pages...
Extracting 'PageType.CABLE_PLAN_DE' from page #153


Unnamed: 0,ocation designation,Source,Function text source,Stopper,L1\n[mm],Part number,Part number.1,L2\n[mm],Stopper.1,Function text target,Target,Location designation,Cabel,Source conductor,Target conductor
0,"Station ""Work""",+B2.X1-XZ1:23,"Drive, ""Grinding, right""",,10 mm,,,10 mm,,"Drive, ""Grinding, right""",=TM2+B2-MA1:U1,"Station ""Work""",+B2.X1-WZ5\n,1,1
1,"Station ""Work""",+B2.X1-XZ1:24,=,,10 mm,,,10 mm,,=,=TM2+B2-MA1:V1,"Station ""Work""",+B2.X1-WZ5\n,2,2
2,"Station ""Work""",+B2.X1-XZ1:25,=,,10 mm,,,10 mm,,=,=TM2+B2-MA1:W1,"Station ""Work""",+B2.X1-WZ5\n,3,3
3,"Station ""Work""",+B2.X1-XZ1:26,=,,10 mm,,,10 mm,,=,=TM2+B2-MA1:PE,"Station ""Work""",+B2.X1-WZ5\n,GNYE,GNYE
4,"Station ""Work""",+B2.X1-XZ1:28,=,,,,,,,,=TM2+B2-MA1:11,"Station ""Work""",+B2.X1-WZ5\n,5,5
5,"Station ""Work""",+B2.X1-XZ1:29,=,,,,,,,,=TM2+B2-MA1:12,"Station ""Work""",+B2.X1-WZ5\n,6,6
6,"Station ""Work""",+B2.X1-XZ1:27,=,,,,,,,"Drive, ""Grinding, right""",+B2.X1-WZ5:SH1,"Station ""Work""",+B2.X1-WZ5\n,SH1,SH1
7,"Station ""Work""",+B2.X1-XZ1:30,=,,,,,,,=,+B2.X1-WZ5:SH,"Station ""Work""",+B2.X1-WZ5\n,SH,SH


In [398]:
# Cable diagrams spread over four pages, concatenated into one frame.
extract(doc[72:76], TableType.CABLE_DIAGRAM)

Extracting 'TableType.CABLE_DIAGRAM' from 4 pages...
Extracting 'TableType.CABLE_DIAGRAM' from page #73
Extracting 'TableType.CABLE_DIAGRAM' from page #74
Extracting 'TableType.CABLE_DIAGRAM' from page #75
Extracting 'TableType.CABLE_DIAGRAM' from page #76


Unnamed: 0,Function text,Page / column,Target designation from,Connection\npoint,Conductor,Target designation to,Connection\npoint.1,Page / column.1,Function text.1,Cable
0,24 V device power supply,=GAB1+A2&EFS1/1.1,+A2-XD1,4,BU,+A1-XD3,4,=GAB1+A2&EFS1/1.1,24 V device power supply,+A2-WD1
1,=,=GAB1+A2&EFS1/1.1,+A2-XD1,1,BN,+A1-XD3,1,=GAB1+A2&EFS1/1.1,=,+A2-WD1
2,=,=GAB1+A2&EFS1/1.1,+A2-XD1,2,BK,+A1-XD3,2,=GAB1+A2&EFS1/1.1,=,+A2-WD1
3,=,=GAB1+A2&EFS1/1.1,+A2-XD1,3,GY,+A1-XD3,3,=GAB1+A2&EFS1/1.1,=,+A2-WD1
4,=,=GAB1+A2&EFS1/1.1,+A2-XD1,5,GNYE,+A1-XD3,5,=GAB1+A2&EFS1/1.1,=,+A2-WD1
...,...,...,...,...,...,...,...,...,...,...
59,=,=TM1+A1&EFS1/1.9,+A1-XG1,26,BU,+A2-XG5,50,=TM1+A1&EFS1/1.9,=,+A2-WG2
60,,,,,GNYE,,,,,+A2-WG2
61,"Start ""Grinding, right""",=TM2+A1&EFS1/1.9,+A1-XG1,29,BN,+A2-XG5,55,=TM2+A1&EFS1/1.9,"Start ""Grinding, right""",+A2-WG3
62,=,=TM2+A1&EFS1/1.9,+A1-XG1,30,BU,+A2-XG5,56,=TM2+A1&EFS1/1.9,=,+A2-WG3


In [None]:
# NOTE(review): the recorded output below lists a single page (#170), i.e.
# it appears to come from the commented-out call, not from the active one.
extract(doc[58:66], TableType.TERMINAL_DIAGRAM)
# extract(doc[169:170], TableType.TERMINAL_DIAGRAM)

Extracting 'TableType.TERMINAL_DIAGRAM' from 1 pages...
Extracting 'TableType.TERMINAL_DIAGRAM' from page #170
OVERLAP DETECTED! ERROR HANDLING IS NOT DONE YET! FOR NOW JUST IGNORED!


Unnamed: 0,Strip,Cable,Color,Function text,Target designation,Connection point,Terminal designation,Jumpers,PLC connection point,Target designation.1,Connection point.1,Cable.1,Color.1,Page / column
0,+B2.Y1-XD1,+B2.Y1-WD1,BK,Power supply valve terminal,+A2-XD7,16.0,1,,,+B2.Y1-XDB1,1:1,+B2.Y1-WD3,BK,=K2+B2.Y1&EFS1/3.3
1,+B2.Y1-XD1,+B2.Y1-WD1,GNYE,=,+A2-XD7,19.0,4,,,+B2.Y1-XDB1,PE:PE,+B2.Y1-WD3,GNYE,=K2+B2.Y1&EFS1/3.3
2,+B2.Y1-XD1,,,,,,5,,,=K2+B2.Y1-XD1,PE,,,=K2+B2.Y1&EFS1/3.2


In [402]:
# Wires parts list spread over four pages.
df = extract(doc[40:44], TableType.WIRES_PART_LIST)
df

Extracting 'TableType.WIRES_PART_LIST' from 4 pages...
Extracting 'TableType.WIRES_PART_LIST' from page #41
Extracting 'TableType.WIRES_PART_LIST' from page #42
Extracting 'TableType.WIRES_PART_LIST' from page #43
Extracting 'TableType.WIRES_PART_LIST' from page #44


Unnamed: 0,Source,Target,Part number,Cross-section,Color,Length,Bundle,Bundle groups,Conductor\nend source,Conductor\nend target,Routing direction\nsource,Routing direction\ntarget,Routing track
0,+A1-XD1:1,=GAA+A1-FC1:1,LAPP.4150701,6,BK,"2,31 m",7,,Stripping,End sleeve,"Move up, to the left","Move up, to the left",+A1-U12;+A1-U5;+A1-U11;+A1-U9;+A1-U10;+A1-TA1;...
1,=GAA+A1-FC1:1,=EA+A1-FB1:1,LAPP.4150709,6,OG,"0,534 m",5,,End sleeve,End sleeve,"Move up, to the left","Move up, to the right",+A1-U7
2,=EA+A1-FB1:1,=EA+A1-FB3:1,LAPP.4150709,6,OG,"0,369 m",5,,End sleeve,End sleeve,"Move up, to the right","Move up, to the left",+A1-U7
3,=GAA+A1-FC1:2,=GAA+A1-FC2:1,LAPP.4150601,4,BK,"1,066 m",6,,End sleeve,End sleeve,"Move down, to the left","Move up, to the left",+A1-U8;+A1-U5;+A1-U7
4,=GAA+A1-FC2:1,=GAB1+A1-FC1:1/L1,LAPP.4150601,4,BK,"1,499 m",6,,End sleeve,End sleeve,"Move up, to the left","Move up, to the left",+A1-U7;+A1-U5;+A1-U8;+A1-U9;+A1-U10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,,,,,,,,,,,,,
224,,,,,,,,,,,,,
225,,,,,,,,,,,,,
226,,,,,,,,,,,,,


In [403]:
# Same call as in the previous cell (re-run).
df = extract(doc[40:44], TableType.WIRES_PART_LIST)
df

Extracting 'TableType.WIRES_PART_LIST' from 4 pages...
Extracting 'TableType.WIRES_PART_LIST' from page #41
Extracting 'TableType.WIRES_PART_LIST' from page #42
Extracting 'TableType.WIRES_PART_LIST' from page #43
Extracting 'TableType.WIRES_PART_LIST' from page #44


Unnamed: 0,Source,Target,Part number,Cross-section,Color,Length,Bundle,Bundle groups,Conductor\nend source,Conductor\nend target,Routing direction\nsource,Routing direction\ntarget,Routing track
0,+A1-XD1:1,=GAA+A1-FC1:1,LAPP.4150701,6,BK,"2,31 m",7,,Stripping,End sleeve,"Move up, to the left","Move up, to the left",+A1-U12;+A1-U5;+A1-U11;+A1-U9;+A1-U10;+A1-TA1;...
1,=GAA+A1-FC1:1,=EA+A1-FB1:1,LAPP.4150709,6,OG,"0,534 m",5,,End sleeve,End sleeve,"Move up, to the left","Move up, to the right",+A1-U7
2,=EA+A1-FB1:1,=EA+A1-FB3:1,LAPP.4150709,6,OG,"0,369 m",5,,End sleeve,End sleeve,"Move up, to the right","Move up, to the left",+A1-U7
3,=GAA+A1-FC1:2,=GAA+A1-FC2:1,LAPP.4150601,4,BK,"1,066 m",6,,End sleeve,End sleeve,"Move down, to the left","Move up, to the left",+A1-U8;+A1-U5;+A1-U7
4,=GAA+A1-FC2:1,=GAB1+A1-FC1:1/L1,LAPP.4150601,4,BK,"1,499 m",6,,End sleeve,End sleeve,"Move up, to the left","Move up, to the left",+A1-U7;+A1-U5;+A1-U8;+A1-U9;+A1-U10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,,,,,,,,,,,,,
224,,,,,,,,,,,,,
225,,,,,,,,,,,,,
226,,,,,,,,,,,,,


In [404]:
# Cable overview from a single page; columns are renamed via header_map.
df = extract(doc[55:56], TableType.CABLE_OVERVIEW)
df

Extracting 'TableType.CABLE_OVERVIEW' from 1 pages...
Extracting 'TableType.CABLE_OVERVIEW' from page #56


Unnamed: 0,Cable designation,Cable type,Conductors,ø,Length,Function text,From,To
0,+A1-WD1,NYY-O,2X,6.0,2472.0,Power supply,+A1-WE2,+A2-WE2
1,,,,,,,+A1-WE1,+A2-WE1
2,+A1-WD2,,3G,15.0,2385.0,Enclosure light 1,+A1-XD2,=EA+A1-EA1
3,,,,,,,,


In [405]:
# Topology table spread over two pages.
df = extract(doc[21:23], TableType.TOPOLOGY)
df

Extracting 'TableType.TOPOLOGY' from 2 pages...
Extracting 'TableType.TOPOLOGY' from page #22
Extracting 'TableType.TOPOLOGY' from page #23


Unnamed: 0,Designation,Part number,Type number,Length [m],Source,Source connection length [m],Routing track,Target,Target connection length [m]
0,+A1-WD1,EPL.2x6CABLE,NYY-O,247,+A1-WE2;+A1-WE1,38,+A1-U2;+-U85;+-U87;+A2-U2,+A2-WE2;+A2-WE1,284.0
1,+A2-WD1,LAPP.00100664,ÖLFLEX® CLASSIC 100,224,+A2-XD1,47,+-U133;+A1-U2;+-U85;+-U87;+A2-U2;+-U140,+A1-XD3,47.0
2,+A2-WD2,LAPP.00100654,ÖLFLEX® CLASSIC 100,224,+A2-XD1,47,+-U133;+A1-U2;+-U85;+-U87;+A2-U2;+-U140,+A1-XD3,47.0
3,+A2-WD3,LAPP.0010064,ÖLFLEX® CLASSIC 100,224,+A2-XD5,47,+-U132;+A1-U2;+-U85;+-U87;+A2-U2;+-U143,+A1-XD2,47.0
4,+A2-WD4,LAPP.0010064,ÖLFLEX® CLASSIC 100,224,+A2-XD5,47,+-U132;+A1-U2;+-U85;+-U87;+A2-U2;+-U143,+A1-XD2,47.0
5,+A2-WG1,LAPP.0010036,ÖLFLEX® CLASSIC 100,224,+A1-XG1,47,+-U135;+A1-U2;+-U85;+-U87;+A2-U2;+-U141,+A2-XG1,47.0
6,+A2-WG2,LAPP.00100224,ÖLFLEX® CLASSIC 100,224,+A1-XG1,47,+-U135;+A1-U2;+-U85;+-U87;+A2-U2;+-U142,+A2-XG5,47.0
7,+A2-WG3,LAPP.00100224,ÖLFLEX® CLASSIC 100,224,+A1-XG1,47,+-U135;+A1-U2;+-U85;+-U87;+A2-U2;+-U142,+A2-XG5,47.0
8,+B1-WG2,LAPP.00101264,ÖLFLEX® CLASSIC 100,402,=F+B1-SG1,51,+-U144;+A2-U2;+-U87;+-U85;+-U82;+B1.X1-U101;+-...,+A2-XG4,47.0
9,+B1.X1-WD1,LAPP.0011028,ÖLFLEX® 140,340,+B1.X1-XD1,47,+-U145;+A2-U2;+-U87;+-U85;+-U82;+-U94,+A2-XD7,47.0


In [93]:
# Cable overview taken from a second document, two pages.
doc_de2 = pymupdf.open("pdfs/machine_building_de.pdf")
df = extract(doc_de2[88:90], TableType.CABLE_OVERVIEW)
df
# extract_cable_overview(doc_de2[88])

Extracting 'TableType.CABLE_OVERVIEW' from 2 pages...
Extracting 'TableType.CABLE_OVERVIEW' from page #89
Table 0 column names: ['', '', None, None, None, None, None, None]
Extracting 'TableType.CABLE_OVERVIEW' from page #90
Table 0 column names: ['', '', None, None, None, None, None, None]


Unnamed: 0,Cable designation,Cable type,Conductors,ø,Length,Function text,From,To
0,=GAA+UC1-WXD1,YAKY,3,15.0,3.0,Schaltschrankbeleuchtung,=GAA+UC1-XDA3,=GAA+UC1-EAA1
1,=GQA+UC1-WDB1,ÖLFLEX® CLASSIC 110,3,15.0,10.0,Hauptventil ein,=GQA+UC1-XDA1,=GQA+BB1-QM1-MB1
2,=GQA+BB1-WGA1,SAC-3P-M12MR-M12FR/.../...,3,34.0,5.0,Luftdruck ok,=GQA+BB1-BPA1,=K2+BB1-KE1
3,=K1+UC1-WGB1,Ethernet Flexible Kabel,4,22.0,0.712,,=K1+UC1-KE1-X1,=K1+UC1-KEC1-X1
4,=K2+BB1-WDB1,ÖLFLEX® CLASSIC 110,2,1.0,10.0,,=K2+UC1-XDA1,=K2+BB1-KE1
5,=K2+BB1-WDB2,ÖLFLEX® CLASSIC 110,2,1.0,10.0,,=K2+BB1-KE1,=K2+BB1-KE2
6,=K2+BB1-WGB1,Ethernet Flexible Kabel,4,22.0,5.0,,=K1+UC1-KEC1-X6,=K2+BB1-KE1-X4
7,=K2+BB1-WGB2,Ethernet Flexible Kabel,4,22.0,4.0,,=K2+BB1-KE1-X5,=K2+BB1-KE2-X4
8,=F+UC1-WGB1,Ethernet Flexible Kabel,4,22.0,0.76,,=K1+UC1-KE1-X2,=F+UC1-KE1-KEC1-X1
9,=FQJ+UC1-WGA1,"C-M12F05-05X034PV10,0-MA-100184",5,34.0,10.0,,=FQJ+UC1-XGA1,=FQJ+BB1-BYB1


In [7]:
# Device tag list spread over three pages.
df = extract(doc[52:55], TableType.DEVICE_TAG_LIST)
df

Extracting 'TableType.DEVICE_TAG_LIST' from 3 pages...
Extracting 'TableType.DEVICE_TAG_LIST' from page #53
Extracting 'TableType.DEVICE_TAG_LIST' from page #54
Extracting 'TableType.DEVICE_TAG_LIST' from page #55


Unnamed: 0,Device tag,Quantity,Designation,Type number,Manufacturer,Part number
0,+A1-U3,1,Baying enclosure system VX25,VX.8886000,RIT,RIT.8886000
1,+A1-U4,1,"TS Cable duct for mounting plate, WHD: 60x2000...",TS.8800752,RIT,RIT.8800752
2,+A1-U5,1,"TS Cable duct for mounting plate, WHD: 60x2000...",TS.8800752,RIT,RIT.8800752
3,+A1-U6,1,"TS Cable duct for mounting plate, WHD: 60x2000...",TS.8800752,RIT,RIT.8800752
4,+A1-U7,1,"TS Cable duct for mounting plate, WHD: 30x2000...",TS.8800750,RIT,RIT.8800750
...,...,...,...,...,...,...
58,=TM2+A1-FC1,1,CIRCUIT-BREAKER SCREW CONNECTION 16A,3RV2011-4AA10,SIE,SIE.3RV2011-4AA10
59,=TM2+A1-TA1,1,Vector oriented drive inverter,MC07B0015-5A3-4-00,SEW,SEW.MC07B0015-5A3-4-00
60,=GL3+A1-FC1,1,CIRCUIT-BREAKER SPRING-L. CONN. 4A,3RV2011-1EA25,SIE,SIE.3RV2011-1EA25
61,=GL3+A1-QA1,1,,3RH2911-1FA22-0MA0 + 3RH2911-1FA22-0MA0,SIE,3RT2015-1BB41+1FA22


In [8]:
# Connection list spread over eleven pages.
df = extract(doc[29:40], TableType.CONNECTION_LIST)
df

Extracting 'TableType.CONNECTION_LIST' from 11 pages...
Extracting 'TableType.CONNECTION_LIST' from page #30
Extracting 'TableType.CONNECTION_LIST' from page #31
Extracting 'TableType.CONNECTION_LIST' from page #32
Extracting 'TableType.CONNECTION_LIST' from page #33
Extracting 'TableType.CONNECTION_LIST' from page #34
Extracting 'TableType.CONNECTION_LIST' from page #35
Extracting 'TableType.CONNECTION_LIST' from page #36
Extracting 'TableType.CONNECTION_LIST' from page #37
Extracting 'TableType.CONNECTION_LIST' from page #38
Extracting 'TableType.CONNECTION_LIST' from page #39
Extracting 'TableType.CONNECTION_LIST' from page #40


Unnamed: 0,Connection,Target 1,Target 2,Color,Cross-section
0,,+A1-XD1:1,=GAA+A1-FC1:1,BK,6
1,,=GAA+A1-FC1:1,=EA+A1-FB1:1,OG,6
2,,=EA+A1-FB1:1,=EA+A1-FB3:1,OG,6
3,,=GAA+A1-FC1:2,=GAA+A1-FC2:1,BK,4
4,,=GAA+A1-FC2:1,=GAB1+A1-FC1:1/L1,BK,4
...,...,...,...,...,...
594,,+B3.X1-XZ1:9,=K1+B3.X1-KE1:3:2.1,GY,075
595,,+B3.X1-XZ1:10,=K1+B3.X1-KE1:3:2.2,GY,075
596,,+B3.X1-XZ1:11,=K1+B3.X1-KE1:3:2.3,GY,075
597,,+C2-XD1:1,=S2+C2-PH1:X1:X1.1,GY,15


Now the German documents.

In [9]:
# Device list (DEVICE_LIST_DE processor) from the energy document, 11 pages.
doc_de = pymupdf.open("pdfs/energy_de.pdf")

df = extract(doc_de[279:290], TableType.DEVICE_LIST_DE)
df

Extracting 'TableType.DEVICE_LIST_DE' from 11 pages...
Extracting 'TableType.DEVICE_LIST_DE' from page #280
Extracting 'TableType.DEVICE_LIST_DE' from page #281
Extracting 'TableType.DEVICE_LIST_DE' from page #282
Extracting 'TableType.DEVICE_LIST_DE' from page #283
Extracting 'TableType.DEVICE_LIST_DE' from page #284
Extracting 'TableType.DEVICE_LIST_DE' from page #285
Extracting 'TableType.DEVICE_LIST_DE' from page #286
Extracting 'TableType.DEVICE_LIST_DE' from page #287
Extracting 'TableType.DEVICE_LIST_DE' from page #288
Extracting 'TableType.DEVICE_LIST_DE' from page #289
Extracting 'TableType.DEVICE_LIST_DE' from page #290


Unnamed: 0,Referenzindikator,ERP-Nummer,Betrag,Bezeichnung,Typnummer,Hersteller,Bestellnummer
0,WBB1,,10 m,Mittelspannungskabel N2XSY 6/10kV 1x35/16 (rm)...,"N2XSY 6/10kV, 12/20kV, 18/30kV",Helukabel,32400
1,WBB2,,10 m,Mittelspannungskabel N2XSY 6/10kV 1x35/16 (rm)...,"N2XSY 6/10kV, 12/20kV, 18/30kV",Helukabel,32400
2,WBB3,,10 m,Mittelspannungskabel N2XSY 6/10kV 1x35/16 (rm)...,"N2XSY 6/10kV, 12/20kV, 18/30kV",Helukabel,32400
3,WBB4,,1 m,Mittelspannungskabel N2XSY 6/10kV 1x35/16 (rm)...,"N2XSY 6/10kV, 12/20kV, 18/30kV",Helukabel,32400
4,WBB5,,1 m,Mittelspannungskabel N2XSY 6/10kV 1x35/16 (rm)...,"N2XSY 6/10kV, 12/20kV, 18/30kV",Helukabel,32400
...,...,...,...,...,...,...,...
300,=B2.HD1-SSJ1,,1\n1\n1,"Knebelschalter, O-I, schwarz, weiss\nSchildträ...",3SU1100-2BF60-1BA0\n3SU1900-0AS10-0AA0\n3SU190...,Siemens\nSiemens\nSiemens,3SU1100-2BF60-1BA0\n3SU1900-0AS10-0AA0\n3SU190...
301,=B2.HD1-XDA1,EES00003,1,Endhalter,CLIPFIX 35,Phoenix Contact,3022218
302,=B2.KL1-QBB1,,1\n3,"SV NH-Sicherungs-Lastschaltleiste, Gr. 1, 250 ...",SV.9677100\n3NA7132,Rittal\nSiemens,9677100\n3NA7132
303,=B2.KL2-QBB1,,1\n3,"SV NH-Sicherungs-Lastschaltleiste, Gr. 1, 250 ...",SV.9677100\n3NA7132,Rittal\nSiemens,9677100\n3NA7132


In [10]:
# Device list (DEVICE_LIST_DE processor) from the machine building document,
# 7 pages.
doc_de2 = pymupdf.open("pdfs/machine_building_de.pdf")

df = extract(doc_de2[137:144], TableType.DEVICE_LIST_DE)
df

Extracting 'TableType.DEVICE_LIST_DE' from 7 pages...
Extracting 'TableType.DEVICE_LIST_DE' from page #138
Extracting 'TableType.DEVICE_LIST_DE' from page #139
Extracting 'TableType.DEVICE_LIST_DE' from page #140
Extracting 'TableType.DEVICE_LIST_DE' from page #141
Extracting 'TableType.DEVICE_LIST_DE' from page #142
Extracting 'TableType.DEVICE_LIST_DE' from page #143
Extracting 'TableType.DEVICE_LIST_DE' from page #144


Unnamed: 0,Referenzindikator,ERP-Nummer,Betrag,Bezeichnung,Typnummer,Hersteller,Bestellnummer
0,+UC1-U1,,1,Anreih-Schranksystem VX25,VX.8806000,Rittal,8806000
1,+UC1-U2,,1,"Seitenwand, verschraubbar, Stahlblech",VX.8106245,Rittal,8106245
2,+UC1-U3,,1,"Seitenwand, verschraubbar, Stahlblech",VX.8106245,Rittal,8106245
3,+UC1-U8,,0.699 m,Kabelkanal,TS.8800753,Rittal,8800753
4,+UC1-U9,,0.699 m,Kabelkanal,TS.8800753,Rittal,8800753
...,...,...,...,...,...,...,...
167,=KEC+BB1.RA1-WGA1,,10 m,Anschluss- und Steuerleitungen,ÖLFLEX® CLASSIC 110,Lapp Group,1119025
168,=PHA+UC1-PHA1,,1,SIMATIC HMI TP900 COMFORT,6AV2124-0JC01-0AX0,Siemens,6AV2124-0JC01-0AX0
169,=PHA+UC1-PHA1-X1,,1,RJ45-Steckverbinder,CUC-IND-C1ZNI-S/R4IE8,Phoenix Contact,1421607
170,=PHA+UC1-WGB1,,2.135 m,TP Flexibles Kabel,6XV1870-2B,Siemens,6XV1870-2B


# Then I want to create some Internal Representation

In [None]:
import uuid
import json as js
import re
import pandas as pd
from typing import List, Dict, Optional, Any, Union, Tuple
from abc import ABC
from configs import LevelConfig, AspectsConfig
from tag import Tag
from dataclasses import dataclass


@dataclass
class Attribute:
    """A named value attached to an object or to a connection.

    The fields are declared at class level so that ``@dataclass`` actually
    sees them: the generated ``__init__`` accepts ``name`` and ``value``
    (positionally or by keyword) and the generated ``__eq__`` compares both
    fields. Without declared fields every pair of instances compared equal.
    """

    # Attribute name, e.g. a source column name or "link".
    name: str
    # Attribute payload.
    value: Union[str, dict[str, str], int]

    def __repr__(self) -> str:
        # Hand-written repr: name and value are rendered without quotes.
        return f"Attribute(name={self.name}, value={self.value})"


# class Prefix(Attribute):
#     def __init__(self, name: str, value: str) -> None:
#         super().__init__(name, value)

# class Special(Attribute):
#     def __init__(self, name: str, value: str) -> None:
#         super().__init__(name, value)


# class Aspect:
#     def __init__(self, prefix: str, priority: int, role: str) -> None:
#         self.prefix: str = prefix
#         self.priority: int = priority
#         self.role: str = role


class XTarget:
    """An item identified by a tag string that carries a list of attributes."""

    def __init__(
        self, tag_str: str = "", attributes: Optional[List[Attribute]] = None
    ) -> None:
        self.tag: Tag = Tag(tag_str)
        # A falsy argument (None or an empty list) is replaced by a fresh list.
        self.attributes: List[Attribute] = attributes if attributes else []

    def add_attribute(self, attribute: Attribute) -> None:
        """Append ``attribute`` to this item's attribute list."""
        self.attributes.append(attribute)

    def get_attribute(self, attr_type: type, name: str) -> Optional[Attribute]:
        """Return the first attribute of type ``attr_type`` named ``name``, else None."""
        matching = (
            candidate
            for candidate in self.attributes
            if isinstance(candidate, attr_type) and candidate.name == name
        )
        return next(matching, None)

    def remove_attribute(self, attr: Attribute) -> None:
        """Drop ``attr`` (matched by identity) by rebuilding the attribute list."""
        kept: List[Attribute] = []
        for candidate in self.attributes:
            if candidate is not attr:
                kept.append(candidate)
        self.attributes = kept

    def get_name(self, configs: AspectsConfig) -> str:
        """Build the name from the tag parts, in the separator order of ``configs``.

        Returns an empty string when the tag yields no parts.
        """
        parts = self.tag.get_tag_parts(configs)
        if parts:
            # Only separators that actually occur in this tag contribute.
            return "".join(
                f"{sep}{parts[sep]}" for sep in configs.separators() if sep in parts
            )
        return ""

    def get_unique_id(self) -> str:
        """Return the MD5 hex digest of the tag string.

        The digest depends only on the tag string, so re-processing the same
        PDF produces the same ID for the same tag.
        """
        from hashlib import md5

        digest = md5(self.tag.tag_str.encode())
        return digest.hexdigest()

    def __repr__(self) -> str:
        return "Object(tag={}, attributes={})".format(self.tag, self.attributes)


class Connection(XTarget):
    """A tagged item that additionally links two endpoints, ``von`` and ``bis``."""

    def __init__(
        self,
        von: Optional["XTarget"] = None,
        bis: Optional["XTarget"] = None,
        tag_str="",
        attributes: Optional[List[Attribute]] = None,
    ) -> None:
        # Tag and attributes are handled by XTarget; only the endpoints are new.
        super().__init__(tag_str, attributes)
        self.von: Optional["XTarget"] = von
        self.bis: Optional["XTarget"] = bis

    def __repr__(self) -> str:
        von_repr = self.von.__repr__()
        bis_repr = self.bis.__repr__()
        return (
            f"Connection(von={von_repr}, bis={bis_repr}, "
            f"tag={self.tag.tag_str}, attributes={self.attributes})"
        )

    def get_name(self, configs: AspectsConfig) -> str:
        """Return a one-line description using the endpoints' names.

        A missing endpoint is rendered as the text "None".
        """
        if self.von:
            von_name = self.von.get_name(configs)
        else:
            von_name = "None"
        if self.bis:
            bis_name = self.bis.get_name(configs)
        else:
            bis_name = "None"
        return (
            f'Connection(von="{von_name}", bis="{bis_name}", '
            f'tag="{self.tag.tag_str}", attributes={self.attributes})'
        )


class Manager:
    """Registry of tagged objects and of the connections between them.

    Objects are kept in insertion order (``objects``) and in a lookup table
    (``obj_map``) keyed by the name ``XTarget.get_name`` builds for the
    current configs. ``empty_object`` is a shared placeholder that is used as
    the endpoint of a connection whose source or target cell is blank.
    """

    def __init__(self, configs: AspectsConfig) -> None:
        self.empty_object: XTarget = XTarget("")
        self.objects: List[XTarget] = [self.empty_object]
        self.connections: List[Connection] = []
        self.configs: AspectsConfig = configs
        self.obj_map: Dict[str, XTarget] = {}
        self.setup_configs()

    def setup_configs(self) -> None:
        """Recompute the name -> object lookup table from ``objects``."""
        if self.objects is not None:
            self.obj_map = {obj.get_name(self.configs): obj for obj in self.objects}

    def get_tag_list(self) -> List[str]:
        """Return all known object names, sorted."""
        return sorted(self.obj_map.keys())

    def build_tree(self) -> Dict[str, Any]:
        """Group the objects into a nested dict, one aspect per level.

        For every configured separator present in an object's tag, the tree
        descends through the separator key and then through the tag part for
        that separator. The object itself is appended to the ``"_objects"``
        list of the node reached last.
        """
        tree: Dict[str, Any] = {}

        def insert_node(tree: Dict[str, Any], obj: XTarget) -> None:
            node: Dict[str, Any] = tree
            tag_parts = obj.tag.get_tag_parts(self.configs)

            for separator in self.configs.separators():
                if separator in tag_parts:
                    node = node.setdefault(separator, {})
                    node = node.setdefault(tag_parts[separator], {})
            node.setdefault("_objects", []).append(obj)

        for obj in self.objects:
            insert_node(tree, obj)
        return tree

    def object_from_tag(self, tag_str: str) -> XTarget:
        """Return the registered object for ``tag_str``, creating it if needed.

        NOTE(review): a tag whose name comes out as "" presumably resolves to
        ``empty_object`` (it is registered in ``obj_map`` at construction), so
        attributes of unparseable tags would land on the shared placeholder --
        confirm whether that is intended.
        """
        # Build the candidate once; it is both the source of the lookup key
        # and the object that gets stored when the key is new.
        candidate: XTarget = XTarget(tag_str)
        obj_name = candidate.get_name(self.configs)
        if obj_name in self.obj_map:
            return self.obj_map[obj_name]

        # Not known yet: add to the lookup table and to the ordered list.
        self.obj_map[obj_name] = candidate
        self.objects.append(candidate)
        return candidate

    @staticmethod
    def _cell_text(value: Any) -> Optional[str]:
        """Return a table cell as text, or None when the cell is missing (NA)."""
        return str(value) if pd.notna(value) else None

    def read_objects(self, df: pd.DataFrame, tag_column: Optional[str] = None) -> None:
        """Register one object per row of ``df`` and attach the other cells.

        Args:
            df: Table with one object per row.
            tag_column: Column holding the tag; defaults to the first column.

        Rows with a missing or blank tag are skipped. Every other non-blank
        cell becomes an ``Attribute`` named after its column; multi-line cells
        are flattened by joining their non-blank lines with " <br/> ".
        """
        # pick the column
        if tag_column is None:
            tag_column = df.columns[0]

        for _, row in df.iterrows():
            tag_str: str = str(row[tag_column]) if pd.notna(row[tag_column]) else ""
            if not tag_str.strip():
                continue

            obj: XTarget = self.object_from_tag(tag_str)

            # Add all other columns as regular Attributes
            for col in df.columns:
                if col == tag_column:
                    continue
                value: Any = row[col]
                if pd.notna(value) and str(value).strip() != "":
                    # Flatten attribute if contains '\n'
                    values: List[str] = str(value).split("\n")
                    joined_value: str = " <br/> ".join(
                        v.strip() for v in values if v.strip() != ""
                    )
                    if joined_value:
                        obj.add_attribute(Attribute(col, joined_value))

    def read_connections(
        self,
        df: pd.DataFrame,
        von_column: Optional[str] = None,
        bis_column: Optional[str] = None,
        this_column: Optional[str] = None,
    ) -> None:
        """Build one ``Connection`` per row of ``df``.

        Args:
            df: Table with one connection per row.
            von_column: Column with the source tag; defaults to the 2nd column.
            bis_column: Column with the target tag; defaults to the 3rd column.
            this_column: Column with the connection's own tag; defaults to the
                1st column.

        Endpoints are registered through ``object_from_tag`` and receive a
        "link" attribute describing the peer; a blank endpoint cell maps to
        ``empty_object``. All remaining non-blank cells become attributes of
        the connection. ``self.connections`` is replaced, not extended.
        """
        connections: List[Connection] = []

        # Positional defaults: column 0 = connection tag, 1 = source, 2 = target.
        if von_column is None:
            von_column = df.columns[1]
        if bis_column is None:
            bis_column = df.columns[2]
        if this_column is None:
            this_column = df.columns[0]

        # The set of extra (attribute) columns does not change from row to row.
        extra_columns = [
            col for col in df.columns if col not in (von_column, bis_column, this_column)
        ]

        for _, row in df.iterrows():
            von: Optional[XTarget] = None
            bis: Optional[XTarget] = None

            # Convert to text up front so that non-string cells (e.g. numbers)
            # do not break the ``.strip()`` checks below.
            von_name: Optional[str] = self._cell_text(row[von_column])
            bis_name: Optional[str] = self._cell_text(row[bis_column])
            this_name: Optional[str] = self._cell_text(row[this_column])

            if von_name and von_name.strip():
                von = self.object_from_tag(von_name)
                von.add_attribute(
                    Attribute("link", f"to {bis_name} through {this_name}")
                )
            else:
                von = self.empty_object

            if bis_name and bis_name.strip():
                bis = self.object_from_tag(bis_name)
                bis.add_attribute(
                    Attribute("link", f"to {von_name} through {this_name}")
                )
            else:
                bis = self.empty_object

            # Collect any additional attributes from other columns
            attributes: List[Attribute] = []
            for col in extra_columns:
                val: Any = row[col]
                if pd.notna(val) and str(val).strip() != "":
                    attributes.append(Attribute(col, str(val)))

            conn: Connection = Connection(
                von=von, bis=bis, tag_str=this_name or "", attributes=attributes
            )
            connections.append(conn)

        self.connections = connections

    def print_stats(self) -> None:
        """Print the object and connection counts followed by every item."""
        print(f"Has {len(self.objects)} objects")

        for o in self.objects:
            print(f"Object: {o}")

        print(f"Has {len(self.connections)} connections")

        for c in self.connections:
            print(f"Connection: {c.get_name(self.configs)}")

In [None]:
# Create configs using the new architecture
# OrderedDict is instantiated here, so it comes from ``collections``;
# ``typing.OrderedDict`` is only a (deprecated) alias meant for annotations.
from collections import OrderedDict

from configs import AspectsConfig, LevelConfig

# One level per separator character, listed in ascending ``Order``.
configs = AspectsConfig(
    OrderedDict(
        {
            "=": LevelConfig(Order=1, Separator="=", Aspect="function"),
            "+": LevelConfig(Order=2, Separator="+", Aspect="location"),
            "-": LevelConfig(Order=3, Separator="-", Aspect="div"),
        }
    )
)

print("Configs separators:", list(configs.separators()))
print("Configs aspects:", configs.aspects())

Configs separators: ['=', '+', '-']
Configs aspects: ['function', 'location', 'div']


In [69]:
# Update to use Manager for reading objects
mngr = Manager(configs)
# No tag_column is passed, so read_objects takes the tags from df's first column.
mngr.read_objects(df)

# The count includes the manager's placeholder empty object.
print(f"Extracted {len(mngr.objects)} objects")

for o in mngr.objects:
    name = o.get_name(configs)
    if name:  # Skip empty objects
        print(name)

No valid separators found in tag string: *


Failed to parse tag string: * with config: AspectsConfig(levels=OrderedDict({'=': LevelConfig(Order=1, Separator='=', Aspect='function'), '+': LevelConfig(Order=2, Separator='+', Aspect='location'), '-': LevelConfig(Order=3, Separator='-', Aspect='div')}))
No valid separators found in tag string: *
Failed to parse tag string: * with config: AspectsConfig(levels=OrderedDict({'=': LevelConfig(Order=1, Separator='=', Aspect='function'), '+': LevelConfig(Order=2, Separator='+', Aspect='location'), '-': LevelConfig(Order=3, Separator='-', Aspect='div')}))
No valid separators found in tag string: *
Failed to parse tag string: * with config: AspectsConfig(levels=OrderedDict({'=': LevelConfig(Order=1, Separator='=', Aspect='function'), '+': LevelConfig(Order=2, Separator='+', Aspect='location'), '-': LevelConfig(Order=3, Separator='-', Aspect='div')}))
No valid separators found in tag string: *
Failed to parse tag string: * with config: AspectsConfig(levels=OrderedDict({'=': LevelConfig(Order=

Extracted 1 objects


In [70]:
# Update to use Manager for reading connections
mngr = Manager(configs)
# 11-page slice (0-based indices 29..39) of the document opened at the top.
df = extract(doc[29:40], TableType.CONNECTION_LIST)

display(df)
# Column defaults apply: column 0 = connection tag, 1 = source, 2 = target.
mngr.read_connections(df)

print(f"Extracted {len(mngr.connections)} connections")

for c in mngr.connections:
    print(c.get_name(configs))

Extracting 'TableType.CONNECTION_LIST' from 11 pages...
Extracting 'TableType.CONNECTION_LIST' from page #30
Extracting 'TableType.CONNECTION_LIST' from page #31
Extracting 'TableType.CONNECTION_LIST' from page #32
Extracting 'TableType.CONNECTION_LIST' from page #33
Extracting 'TableType.CONNECTION_LIST' from page #34
Extracting 'TableType.CONNECTION_LIST' from page #35
Extracting 'TableType.CONNECTION_LIST' from page #36
Extracting 'TableType.CONNECTION_LIST' from page #37
Extracting 'TableType.CONNECTION_LIST' from page #38
Extracting 'TableType.CONNECTION_LIST' from page #39
Extracting 'TableType.CONNECTION_LIST' from page #40


Unnamed: 0,Connection,Target 1,Target 2,Color,Cross-section
0,,+A1-XD1:1,=GAA+A1-FC1:1,BK,6
1,,=GAA+A1-FC1:1,=EA+A1-FB1:1,OG,6
2,,=EA+A1-FB1:1,=EA+A1-FB3:1,OG,6
3,,=GAA+A1-FC1:2,=GAA+A1-FC2:1,BK,4
4,,=GAA+A1-FC2:1,=GAB1+A1-FC1:1/L1,BK,4
...,...,...,...,...,...
594,,+B3.X1-XZ1:9,=K1+B3.X1-KE1:3:2.1,GY,075
595,,+B3.X1-XZ1:10,=K1+B3.X1-KE1:3:2.2,GY,075
596,,+B3.X1-XZ1:11,=K1+B3.X1-KE1:3:2.3,GY,075
597,,+C2-XD1:1,=S2+C2-PH1:X1:X1.1,GY,15


Extracted 599 connections
Connection(von="+A1-XD1:1", bis="=GAA+A1-FC1:1", tag="", attributes=[Attribute(name=Color, value=BK), Attribute(name=Cross-section, value=6)])
Connection(von="=GAA+A1-FC1:1", bis="=EA+A1-FB1:1", tag="", attributes=[Attribute(name=Color, value=OG), Attribute(name=Cross-section, value=6)])
Connection(von="=EA+A1-FB1:1", bis="=EA+A1-FB3:1", tag="", attributes=[Attribute(name=Color, value=OG), Attribute(name=Cross-section, value=6)])
Connection(von="=GAA+A1-FC1:2", bis="=GAA+A1-FC2:1", tag="", attributes=[Attribute(name=Color, value=BK), Attribute(name=Cross-section, value=4)])
Connection(von="=GAA+A1-FC2:1", bis="=GAB1+A1-FC1:1/L1", tag="", attributes=[Attribute(name=Color, value=BK), Attribute(name=Cross-section, value=4)])
Connection(von="=GAB1+A1-FC1:1/L1", bis="=GAB2+A1-FC1:1/L1", tag="", attributes=[Attribute(name=Color, value=BK), Attribute(name=Cross-section, value=4)])
Connection(von="=GAB2+A1-FC1:1/L1", bis="=GL1+A1-FC1:1/L1", tag="", attributes=[Attribu

In [71]:
# Fresh manager fed from both tables: connection list and device tag list.
mngr = Manager(configs)
df_connections = extract(doc[29:40], TableType.CONNECTION_LIST)
df_objects = extract(doc[52:55], TableType.DEVICE_TAG_LIST)

display(df_connections)
# Connections first: this registers every endpoint as an object.
mngr.read_connections(df_connections)
# Then the device tags: attributes go onto already-registered objects with the
# same name, and unknown tags are added as new objects.
mngr.read_objects(df_objects)
mngr.print_stats()

Extracting 'TableType.CONNECTION_LIST' from 11 pages...
Extracting 'TableType.CONNECTION_LIST' from page #30
Extracting 'TableType.CONNECTION_LIST' from page #31
Extracting 'TableType.CONNECTION_LIST' from page #32
Extracting 'TableType.CONNECTION_LIST' from page #33
Extracting 'TableType.CONNECTION_LIST' from page #34
Extracting 'TableType.CONNECTION_LIST' from page #35
Extracting 'TableType.CONNECTION_LIST' from page #36
Extracting 'TableType.CONNECTION_LIST' from page #37
Extracting 'TableType.CONNECTION_LIST' from page #38
Extracting 'TableType.CONNECTION_LIST' from page #39
Extracting 'TableType.CONNECTION_LIST' from page #40
Extracting 'TableType.DEVICE_TAG_LIST' from 3 pages...
Extracting 'TableType.DEVICE_TAG_LIST' from page #53
Extracting 'TableType.DEVICE_TAG_LIST' from page #54
Extracting 'TableType.DEVICE_TAG_LIST' from page #55


Unnamed: 0,Connection,Target 1,Target 2,Color,Cross-section
0,,+A1-XD1:1,=GAA+A1-FC1:1,BK,6
1,,=GAA+A1-FC1:1,=EA+A1-FB1:1,OG,6
2,,=EA+A1-FB1:1,=EA+A1-FB3:1,OG,6
3,,=GAA+A1-FC1:2,=GAA+A1-FC2:1,BK,4
4,,=GAA+A1-FC2:1,=GAB1+A1-FC1:1/L1,BK,4
...,...,...,...,...,...
594,,+B3.X1-XZ1:9,=K1+B3.X1-KE1:3:2.1,GY,075
595,,+B3.X1-XZ1:10,=K1+B3.X1-KE1:3:2.2,GY,075
596,,+B3.X1-XZ1:11,=K1+B3.X1-KE1:3:2.3,GY,075
597,,+C2-XD1:1,=S2+C2-PH1:X1:X1.1,GY,15


Has 1022 objects
Object: Object(tag=Tag(tag_str='', attributes=[])
Object: Object(tag=Tag(tag_str='+A1-XD1:1', attributes=[Attribute(name=link, value=to =GAA+A1-FC1:1 through )])
Object: Object(tag=Tag(tag_str='=GAA+A1-FC1:1', attributes=[Attribute(name=link, value=to +A1-XD1:1 through ), Attribute(name=link, value=to =EA+A1-FB1:1 through )])
Object: Object(tag=Tag(tag_str='=EA+A1-FB1:1', attributes=[Attribute(name=link, value=to =GAA+A1-FC1:1 through ), Attribute(name=link, value=to =EA+A1-FB3:1 through )])
Object: Object(tag=Tag(tag_str='=EA+A1-FB3:1', attributes=[Attribute(name=link, value=to =EA+A1-FB1:1 through )])
Object: Object(tag=Tag(tag_str='=GAA+A1-FC1:2', attributes=[Attribute(name=link, value=to =GAA+A1-FC2:1 through )])
Object: Object(tag=Tag(tag_str='=GAA+A1-FC2:1', attributes=[Attribute(name=link, value=to =GAA+A1-FC1:2 through ), Attribute(name=link, value=to =GAB1+A1-FC1:1/L1 through )])
Object: Object(tag=Tag(tag_str='=GAB1+A1-FC1:1/L1', attributes=[Attribute(name=li

In [72]:
def print_tree(tree, indent=0, configs=None):
    """
    Recursively print the hierarchical tree of objects.

    Args:
        tree: Nested dict; the key "_objects" maps to a list of objects,
            every other key maps to a sub-tree.
        indent: Current nesting depth; each level is indented by four spaces.
        configs: Configs passed to each object's ``get_name``. When omitted,
            the notebook-level ``mngr.configs`` is used, as before.
    """
    prefix = "    " * indent
    for key, value in tree.items():
        if key == "_objects":
            for obj in value:
                # Resolve the global fallback lazily, only when an object
                # actually has to be named.
                active_configs = configs if configs is not None else mngr.configs
                name = obj.get_name(active_configs)
                if name:  # Skip empty names
                    print(f"{prefix}<> {name}")
        else:
            print(f"{prefix}{key}:")
            print_tree(value, indent + 1, configs)


# Print the aspect tree of the current manager's objects.
print_tree(mngr.build_tree())

+:
    A1:
        -:
            XD1:1:
                <> +A1-XD1:1
            XD1:2:
                <> +A1-XD1:2
            XD1:3:
                <> +A1-XD1:3
            XD5:6:
                <> +A1-XD5:6
            XD5:5:
                <> +A1-XD5:5
            XD5:4:
                <> +A1-XD5:4
            XD5:3:
                <> +A1-XD5:3
            XD5:2:
                <> +A1-XD5:2
            XD5:1:
                <> +A1-XD5:1
            WE2:1:
                <> +A1-WE2:1
            XD1:7:
                <> +A1-XD1:7
            WE1:1:
                <> +A1-WE1:1
            XD1:8:
                <> +A1-XD1:8
            WE2:9:
                <> +A1-WE2:9
            FC2:1:
                <> +A1-FC2:1
            XD1:4:
                <> +A1-XD1:4
            FC2:2:
                <> +A1-FC2:2
            XD1:5:
                <> +A1-XD1:5
            FC2:3:
                <> +A1-FC2:3
            XD1:6:
                <> +A1-XD1:6
            WE2:2:

In [73]:
import plotly.graph_objects as go
import networkx as nx


def build_graph(objects, connections, configs):
    """
    Turn objects and connections into a directed networkx graph.

    Every object with a non-empty name becomes a node carrying ``label`` and
    ``description``. Every connection whose two endpoint names are non-empty
    and different becomes an edge carrying ``label`` and ``description``.
    """
    graph = nx.DiGraph()

    def describe(attributes, fallback):
        # One "name: value" line per attribute, or the fallback text if none.
        lines = [f"{attr.name}: {attr.value}" for attr in attributes]
        return "<br>".join(lines) if lines else fallback

    # Nodes: one per named object.
    for obj in objects:
        node_id = obj.get_name(configs)
        if node_id:
            graph.add_node(
                node_id,
                label=node_id,
                description=describe(obj.attributes, "No attributes"),
            )

    # Edges: one per connection with two distinct, named endpoints.
    for conn in connections:
        source = conn.von.get_name(configs) if conn.von else None
        target = conn.bis.get_name(configs) if conn.bis else None
        edge_label = conn.tag.tag_str or "connection"

        if not (source and target) or source == target:
            continue

        graph.add_edge(
            source,
            target,
            label=edge_label,
            description=describe(conn.attributes, edge_label),
        )

    return graph


def plot_graph(G):
    """
    Plot the graph interactively with Plotly.

    Nodes are drawn as labelled markers whose hover text is the node's label
    and description. Edges are drawn as plain line segments.

    Args:
        G: networkx graph; nodes may carry ``label`` and ``description``.

    Returns:
        The Plotly figure, or None (after printing a message) when the graph
        has no nodes.
    """
    if len(G.nodes()) == 0:
        print("No nodes to display in graph")
        return None

    # Fixed seed keeps the layout reproducible between runs.
    pos = nx.spring_layout(G, seed=42)

    # Edge segments: each edge contributes (start, end, None); the None entry
    # separates one segment from the next within the single line trace.
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # NOTE(review): no text is supplied to this trace, so the edge
    # "description" stored on G is not shown on hover. Rendering it would need
    # a text list aligned with edge_x/edge_y (or a separate midpoint trace).
    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=1, color="#888"),
        hoverinfo="text",
        mode="lines",
    )

    # Node markers and their hover text.
    node_x = []
    node_y = []
    node_text = []
    for node, data in G.nodes(data=True):
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        # Nodes created implicitly by add_edge carry no attributes; fall back
        # to the node id and a neutral description instead of raising KeyError.
        label = data.get("label", node)
        description = data.get("description", "No attributes")
        node_text.append(f"{label}<br>{description}")

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode="markers+text",
        text=[n for n in G.nodes()],
        textposition="top center",
        hoverinfo="text",
        hovertext=node_text,
        marker=dict(showscale=False, color="skyblue", size=20, line_width=2),
    )

    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title=dict(
                text="Interactive Object Tree with Connections", font=dict(size=16)
            ),
            showlegend=False,
            hovermode="closest",
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        ),
    )

    return fig


# Build + plot
G = build_graph(mngr.objects, mngr.connections, mngr.configs)
fig = plot_graph(G)
# plot_graph returns None when the graph has no nodes.
if fig:
    fig.show()

In [74]:
# Re-extract both tables: device tags (3 pages) and connections (11 pages).
df_objects = extract(doc[52:55], TableType.DEVICE_TAG_LIST)
df_connections = extract(doc[29:40], TableType.CONNECTION_LIST)

# Fresh manager; connections are read first so their endpoints are registered
# before the device-tag attributes are attached.
mngr = Manager(configs)
mngr.read_connections(df_connections)
mngr.read_objects(df_objects)
mngr.print_stats()

Extracting 'TableType.DEVICE_TAG_LIST' from 3 pages...
Extracting 'TableType.DEVICE_TAG_LIST' from page #53
Extracting 'TableType.DEVICE_TAG_LIST' from page #54
Extracting 'TableType.DEVICE_TAG_LIST' from page #55
Extracting 'TableType.CONNECTION_LIST' from 11 pages...
Extracting 'TableType.CONNECTION_LIST' from page #30
Extracting 'TableType.CONNECTION_LIST' from page #31
Extracting 'TableType.CONNECTION_LIST' from page #32
Extracting 'TableType.CONNECTION_LIST' from page #33
Extracting 'TableType.CONNECTION_LIST' from page #34
Extracting 'TableType.CONNECTION_LIST' from page #35
Extracting 'TableType.CONNECTION_LIST' from page #36
Extracting 'TableType.CONNECTION_LIST' from page #37
Extracting 'TableType.CONNECTION_LIST' from page #38
Extracting 'TableType.CONNECTION_LIST' from page #39
Extracting 'TableType.CONNECTION_LIST' from page #40
Has 1022 objects
Object: Object(tag=Tag(tag_str='', attributes=[])
Object: Object(tag=Tag(tag_str='+A1-XD1:1', attributes=[Attribute(name=link, val