# Imports

In [1]:
import sys
import os
import pathlib

In [2]:
# for graph building
from entity_graph.graph_extractor.entities_graph_extractor import EntitiesGraphExtractor
# for partial results/debug:
import pandas as pd
from entity_graph.graph_extractor.processing.extraction_table import get_all_matching_tables
from entity_graph.graph_extractor.processing.models import TableFromHeaderExtracionConfig
from entity_graph.models.entity_graph.identifier import Identifier
from entity_graph.models.entity_graph.table import Table

# Data load

In [3]:
# Main extractor: the single EntitiesGraphExtractor instance that the later
# cells use to load tables, build the graph and export it.
extractor = EntitiesGraphExtractor()

In [4]:
# Helper reference to the entities dict held by the extractor's graph manager.
# It is bound here, before any table is loaded, and inspected later through
# `entities.items()`.
# NOTE(review): this relies on the manager mutating the same object rather than
# rebinding `entities_graph_manager.entities` — confirm.
entities = extractor.entities_graph_manager.entities

In [5]:
# Directory with the example source files, given as a relative path.
# The trailing slash is required: later cells build file paths by plain
# string concatenation (test_data_path + "<file name>").
test_data_path = "../data/example1/"

In [6]:
# Test data peek: show the first rows of the accessories CSV.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests the file was saved together with its index — consider
# `index_col=0` if that column is not wanted here.
pd.read_csv(test_data_path+"accessories_flat.csv").head()

Unnamed: 0,Accessory Name,Compatible Machine ID,Accessory Specification
0,Milk Frothing Pitcher 350ml,EX001,Stainless steel jug for manual milk frothing
1,Water Filter AquaClean,EX002,Reduces limescale; improves taste; fits AquaCl...
2,Coffee Bean Container Extension,EX003,Extra 250g capacity for coffee beans
3,LatteCrema Milk Container,EX001,Removable container for milk frothing; dishwas...
4,Stainless Steel Tamper 58mm,EX010,Heavy-duty tamper for professional espresso ex...


In [7]:
# Extraction straight from the PDF; the tables to keep are selected by
# matching the header row listed below.
config = {
    "extraction_type": "table_from_header",
    "filename": test_data_path + "coffee_machines.pdf",
    # Expected column names of the machines table, in order.
    "header": [
        'Manufacturer',
        'Coffee Machine Name',
        'Machine ID',
        'Production Year',
        'Machine Type',
        'Power (W)',
        'Pressure (bar)',
        'Water Tank Capacity (L)',
        'Additional Features',
    ],
}

In [8]:
# PDF extractor debug: run the table matching on its own, outside the graph
# pipeline, to inspect the raw rows it finds for `config`.
# The config dict is unpacked into a TableFromHeaderExtracionConfig first.
# NOTE(review): the full nested list is displayed, so this cell's output is
# very long — consider slicing the result before sharing the notebook.
get_all_matching_tables(TableFromHeaderExtracionConfig(**config), debug=True)

extract 1


[[['Manufacturer',
   'Coffee Machine Name',
   'Machine ID',
   'Production Year',
   'Machine Type',
   'Power (W)',
   'Pressure (bar)',
   'Water Tank Capacity (L)',
   'Additional Features'],
  ["De'Longhi",
   'Magnifica S ECAM 22.110.B',
   'EX001',
   '2022',
   'Automatic',
   '1450',
   '15',
   '1.8',
   'Steel grinder, cappuccino system'],
  ['Philips',
   'LatteGo EP3246/70',
   'EX002',
   '2023',
   'Automatic',
   '1500',
   '15',
   '1.8',
   'LatteGo, touch screen'],
  ['Jura',
   'E8 Piano Black',
   'EX003',
   '2022',
   'Automatic',
   '1450',
   '15',
   '1.9',
   'P.E.P., automatic cleaning'],
  ['Saeco',
   'Xelsis SM7685/00',
   'EX004',
   '2021',
   'Automatic',
   '1500',
   '15',
   '1.7',
   '15 user profiles'],
  ['Krups',
   'Evidence EA893C',
   'EX005',
   '2022',
   'Automatic',
   '1450',
   '15',
   '2.3',
   'Bluetooth, milk frothing'],
  ['Melitta',
   'Barista TS Smart F850-102',
   'EX006',
   '2023',
   'Automatic',
   '1450',
   '15',
   '1.8

In [9]:
# Load the coffee machines table (extracted from the PDF) into the graph.
# The first argument is the extraction config dict defined above; "Machines"
# is the table name that the graph-building steps refer to later.
# NOTE(review): "coffee_machines.pdf" and "instances" are presumably the
# source/artifact name and the table kind — confirm against the signature of
# load_table_from_file.
extractor.load_table_from_file(
    config,
    "coffee_machines.pdf", 
    "Machines", 
    "instances",
)

Extraction! From {'extraction_type': 'table_from_header', 'filename': '../data/example1/coffee_machines.pdf', 'header': ['Manufacturer', 'Coffee Machine Name', 'Machine ID', 'Production Year', 'Machine Type', 'Power (W)', 'Pressure (bar)', 'Water Tank Capacity (L)', 'Additional Features']}
doing config: data_type=<OutputDataType.DATAFRAME: 'dataframe'> filename='../data/example1/coffee_machines.pdf' extraction_type=<ExtractionType.TABLE_FROM_HEADER: 'table_from_header'> header=['Manufacturer', 'Coffee Machine Name', 'Machine ID', 'Production Year', 'Machine Type', 'Power (W)', 'Pressure (bar)', 'Water Tank Capacity (L)', 'Additional Features'] pages=None columns_to_split=None prefixes=None
extract 1


[32m2025-08-26 11:06:51.958[0m | [36mDEBUG   [0m | [36mentity_graph.graph_manager[0m:[36mrefresh_name_map[0m:[36m43[0m - [36mCreating collection name map, from 1 objects created 1 unique keys[0m


In [10]:
# Load the accessories table into the graph.
# Unlike the PDF case, the first argument here is a plain CSV file path
# instead of an extraction config dict; "Accessories" is the table name that
# the graph-building steps refer to later.
# NOTE(review): "accessories.csv" and "instances" are presumably the
# source/artifact name and the table kind — confirm against the signature of
# load_table_from_file.
extractor.load_table_from_file(
    test_data_path+"accessories_flat.csv",
    "accessories.csv", 
    "Accessories", 
    "instances",
)

[32m2025-08-26 11:06:51.982[0m | [36mDEBUG   [0m | [36mentity_graph.graph_manager[0m:[36mrefresh_name_map[0m:[36m43[0m - [36mCreating collection name map, from 33 objects created 33 unique keys[0m


# Graph building

In [11]:
# Define the graph-building steps. Each key holds a list of tuples; the
# per-key comments below describe what each tuple contains.
steps = {
    "identifiers": [
        # Helper identifier objects: identifier name, table name, columns,
        # and a flag that optionally makes it read all id values.
        ("MachineID", "Machines", ["Machine ID"], False),
    ],
    "identifiers_links": [
        # Attaches an existing identifier object to another table: identifier
        # name, table name, columns, and the same optional "read all id values" flag.
        ("MachineID", "Accessories", ["Compatible Machine ID"], False),
    ],
    "instances_creation": [
        # Sources of the objects to put in the graph: identifier name and table;
        # optionally creates a hierarchy or uses columns other than the identifier's.
        ("MachineID", "Machines", False, None),
    ],
    "enrichments": [
        # Enriches existing objects with data from another source: new data source
        # table name, attribute name on the object, matching attribute in the new
        # source, and the attribute(s) used to build the ids of the new nodes.
        ("Accessories", "Machine ID", "Compatible Machine ID", ["Accessory Name"]),
    ],
}

In [12]:
# Run the graph-building steps defined in `steps`.
# The call returns an EntitiesGraphExtractor, whose repr is what this cell
# displays as its result.
extractor.extract_entities_graph(steps)

[32m2025-08-26 11:06:52.025[0m | [36mDEBUG   [0m | [36mentity_graph.graph_manager[0m:[36mrefresh_name_map[0m:[36m43[0m - [36mCreating collection name map, from 50 objects created 50 unique keys[0m
[32m2025-08-26 11:06:52.030[0m | [36mDEBUG   [0m | [36mentity_graph.graph_manager[0m:[36mrefresh_name_map[0m:[36m43[0m - [36mCreating collection name map, from 80 objects created 80 unique keys[0m
[32m2025-08-26 11:06:52.033[0m | [36mDEBUG   [0m | [36mentity_graph.graph_manager[0m:[36mrefresh_name_map[0m:[36m43[0m - [36mCreating collection name map, from 80 objects created 80 unique keys[0m
[32m2025-08-26 11:06:52.036[0m | [36mDEBUG   [0m | [36mentity_graph.graph_manager[0m:[36mrefresh_name_map[0m:[36m43[0m - [36mCreating collection name map, from 95 objects created 95 unique keys[0m
[32m2025-08-26 11:06:52.039[0m | [36mDEBUG   [0m | [36mentity_graph.graph_manager[0m:[36mrefresh_name_map[0m:[36m43[0m - [36mCreating collection name map

Name map has 95 entries in collection 'default'
Created 0 parent-child relationships in collection 'default'


<entity_graph.graph_extractor.entities_graph_extractor.EntitiesGraphExtractor at 0x7fef9cb8bbc0>

# Graph checks

In [13]:
# Check identifiers: collect every Identifier object currently held in the
# entities dict. Only the values matter here, so they are iterated directly
# instead of unpacking (and discarding) the keys.
identifiers = [
    entity for entity in entities.values() if isinstance(entity, Identifier)
]
identifiers

[<entity_graph.models.entity_graph.identifier.Identifier at 0x7fef95a56b10>]

In [14]:
# Inspect the data stored on the first identifier found by the previous cell.
identifiers[0].data

{'values': {'EX001',
  'EX002',
  'EX003',
  'EX004',
  'EX005',
  'EX006',
  'EX007',
  'EX008',
  'EX009',
  'EX010',
  'EX011',
  'EX012',
  'EX013',
  'EX014',
  'EX015',
  'EX016',
  'EX017',
  'EX018',
  'EX019',
  'EX020',
  'EX021',
  'EX022',
  'EX023',
  'EX024',
  'EX025',
  'EX026',
  'EX027',
  'EX028',
  'EX029',
  'EX030'}}

In [15]:
# Export the graph. The result is indexed positionally by the following cells:
#   [0] node ids, [1] edges, [2] nodes data.
graph_export = extractor.export_graph()
# Node ids: machine ids plus enrichment nodes, whose ids are displayed as
# "<machine id>|<accessory name>".
graph_export[0]

['EX001',
 'EX002',
 'EX003',
 'EX004',
 'EX005',
 'EX006',
 'EX007',
 'EX008',
 'EX009',
 'EX010',
 'EX011',
 'EX012',
 'EX013',
 'EX014',
 'EX015',
 'EX016',
 'EX017',
 'EX018',
 'EX019',
 'EX020',
 'EX021',
 'EX022',
 'EX023',
 'EX024',
 'EX025',
 'EX026',
 'EX027',
 'EX028',
 'EX029',
 'EX030',
 'EX010|Stainless Steel Tamper 58mm',
 'EX010|Professional Knock Box',
 'EX010|Barista Digital Scale',
 'EX004|Ceramic Burr Grinder Replacement',
 'EX008|WiFi Module Upgrade',
 'EX005|Coffee Grounds Container Large',
 'EX002|Water Filter AquaClean',
 'EX002|Travel Mug Stainless Steel 350ml',
 'EX001|Milk Frothing Pitcher 350ml',
 'EX001|LatteCrema Milk Container',
 'EX001|Descaling Kit Universal',
 'EX001|Cappuccino Art Stencil Set',
 'EX003|Coffee Bean Container Extension',
 'EX003|Cold Brew Adapter Kit',
 'EX007|Dual Cup Espresso Glass Set']

In [16]:
# Edges: displayed as (machine id, accessory node id) pairs.
graph_export[1]

[('EX001', 'EX001|Milk Frothing Pitcher 350ml'),
 ('EX001', 'EX001|LatteCrema Milk Container'),
 ('EX001', 'EX001|Descaling Kit Universal'),
 ('EX001', 'EX001|Cappuccino Art Stencil Set'),
 ('EX002', 'EX002|Water Filter AquaClean'),
 ('EX002', 'EX002|Travel Mug Stainless Steel 350ml'),
 ('EX003', 'EX003|Coffee Bean Container Extension'),
 ('EX003', 'EX003|Cold Brew Adapter Kit'),
 ('EX004', 'EX004|Ceramic Burr Grinder Replacement'),
 ('EX005', 'EX005|Coffee Grounds Container Large'),
 ('EX007', 'EX007|Dual Cup Espresso Glass Set'),
 ('EX008', 'EX008|WiFi Module Upgrade'),
 ('EX010', 'EX010|Stainless Steel Tamper 58mm'),
 ('EX010', 'EX010|Professional Knock Box'),
 ('EX010', 'EX010|Barista Digital Scale')]

In [17]:
# Nodes data: a dict keyed by node id, each value being that node's attribute dict.
# NOTE(review): this displays the data of every node — consider showing only
# a few entries to keep the output short.
graph_export[2]

{'EX001': {'Manufacturer': "De'Longhi",
  'Coffee Machine Name': 'Magnifica S ECAM 22.110.B',
  'Machine ID': 'EX001',
  'Production Year': '2022',
  'Machine Type': 'Automatic',
  'Power (W)': '1450',
  'Pressure (bar)': '15',
  'Water Tank Capacity (L)': '1.8',
  'Additional Features': 'Steel grinder, cappuccino system',
  'collection': 'default'},
 'EX002': {'Manufacturer': 'Philips',
  'Coffee Machine Name': 'LatteGo EP3246/70',
  'Machine ID': 'EX002',
  'Production Year': '2023',
  'Machine Type': 'Automatic',
  'Power (W)': '1500',
  'Pressure (bar)': '15',
  'Water Tank Capacity (L)': '1.8',
  'Additional Features': 'LatteGo, touch screen',
  'collection': 'default'},
 'EX003': {'Manufacturer': 'Jura',
  'Coffee Machine Name': 'E8 Piano Black',
  'Machine ID': 'EX003',
  'Production Year': '2022',
  'Machine Type': 'Automatic',
  'Power (W)': '1450',
  'Pressure (bar)': '15',
  'Water Tank Capacity (L)': '1.9',
  'Additional Features': 'P.E.P., automatic cleaning',
  'collectio

In [18]:
# See the data of a single enrichment (accessory) node, looked up by its
# "<machine id>|<accessory name>" id.
graph_export[2]['EX001|Milk Frothing Pitcher 350ml']

{'Accessory Name': 'Milk Frothing Pitcher 350ml',
 'Compatible Machine ID': 'EX001',
 'Accessory Specification': 'Stainless steel jug for manual milk frothing',
 'collection': 'default'}

In [19]:
# See the relations stored in the manager class for the entity "EX001".
# Each relation is displayed as a 3-tuple: relation type, what looks like the
# id of the related entity, and an optional metadata dict (or None).
extractor.entities_graph_manager.find_entity("EX001").relations

[('from_artifact', '184b1078-0e41-4df9-a6f8-e85ce23debd8', None),
 ('parent_of',
  '0e1ba85b-450d-439a-8a38-a113f1992314',
  {'confidence': 'medium'}),
 ('parent_of',
  'b327799f-a772-4fd8-9ac8-3244f13ece81',
  {'confidence': 'medium'}),
 ('parent_of',
  'c9cf8d4d-fcb2-402d-a005-a3a4246ba064',
  {'confidence': 'medium'}),
 ('parent_of',
  '9d2dd902-c4ad-4ea6-81cc-a10b39aea4b7',
  {'confidence': 'medium'})]

In [20]:
# Same inspection for the entity named "Machines_0": it shows an "in_table"
# and a "from_artifact" relation.
extractor.entities_graph_manager.find_entity("Machines_0").relations

[('in_table', 'e53fa410-b64c-4773-a666-d4ce2d8d4b95', None),
 ('from_artifact', '414611d6-ad9f-45e3-ab0f-425c0409c1bd', None)]

In [21]:
# Resolve the "from_artifact" relations of "EX001" to entity objects rather
# than the raw ids shown by `.relations`.
extractor.entities_graph_manager.get_named_entity_relations("EX001", "from_artifact")

[<entity_graph.models.entity_graph.artifact.Artifact at 0x7fefe0162330>]

# Tests

In [22]:
# Provenance check: the entity "EX001" must be linked, through a
# "from_artifact" relation, to an entity named "Machines_0".
# The message makes a failure self-explanatory when the notebook is re-run.
assert "Machines_0" in [
    related.name
    for related in extractor.entities_graph_manager.get_named_entity_relations(
        "EX001", "from_artifact"
    )
], "EX001 has no 'from_artifact' relation to an entity named 'Machines_0'"

In [23]:
# Table membership check: the entity "Machines_0" must be linked, through an
# "in_table" relation, to an entity named "Machines".
# The message makes a failure self-explanatory when the notebook is re-run.
assert "Machines" in [
    related.name
    for related in extractor.entities_graph_manager.get_named_entity_relations(
        "Machines_0", "in_table"
    )
], "Machines_0 has no 'in_table' relation to an entity named 'Machines'"