In [1]:
from pathlib import Path

import pandas as pd
import serde.csv
from rdflib import RDFS
from sm.dataset import Dataset
from sm.evaluation.cpa_cta_metrics import PrecisionRecallF1, cpa, cta
from sm.namespaces.utils import KGName
from tum.actors.entry import *
from tum.config import CRITICAL_MAAS_DIR
from tum.namespace import MNDRNamespace
from tum.sm.dsl.main import save_training_data
from sm.evaluation.sm_metrics import precision_recall_f1
import tum.sm.dsl.main as dsl_main

In [2]:
test_exs = Dataset(
    CRITICAL_MAAS_DIR / "ta2-table-understanding/data/training_set/minmod.zip"
).load()
kgns = MNDRNamespace.create()

version = 112
output = []
id_props = set([str(RDFS.label)])

In [3]:
def predict(test_ex):
    train_ds = f"dsl-exclude-{test_ex.id}"
    train_exs = [ex for ex in test_exs if ex.id != test_ex.id]

    save_training_data(train_ds, train_exs, kgns)

    minmod_actor = G.create_actor(
        MinmodTableTransformationActor,
        [
            DBActorArgs(
                kgdbs=[
                    KGDBArgs(
                        name=KGName.Generic,
                        version="20231130",
                        datadir=CRITICAL_MAAS_DIR / "data/minmod/databases",
                    )
                ]
            ),
            DataActorArgs(skip_unk_ont_ent=True, skip_no_sm=True),
            MinmodGraphGenerationActorArgs(
                # model="logistic-regression",
                model="random-forest-100",
                train_dsquery=train_ds,
                top_n_stypes=2,
            ),
            MinmodGraphInferenceActorArgs(),
        ],
    )
    graphinf_actor = minmod_actor.graphinfer_actor

    sm = minmod_actor.graphinfer_actor(test_ex.table)
    assert len(test_ex.sms) == 1

    evalres = precision_recall_f1(test_ex.sms[0], sm)

    print("Performance of", test_ex.id)
    print(evalres)

    return (minmod_actor, sm, train_exs, evalres)

In [4]:
out = [predict(test_ex) for test_ex in test_exs]

[32m2024-09-10 14:04:33.537[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m274[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-09-10 14:04:33.537[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m276[0m - [34m[1mInitializing argument parser...[0m
[32m2024-09-10 14:04:33.538[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m296[0m - [34m[1mConstructing the actor...[0m
[32m2024-09-10 14:04:33.581[0m | [34m[1mDEBUG   [0m | [36mMinmodGraphGenerationActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /Users/rook/workspace/darpa-criticalmaas/data/ream/MinmodGraphGenerationActor/v113/000[0m
[32m2024-09-10 14:04:33.714[0m | [1mINFO    [0m | [36msm.misc.ray_helper[0m:[36mray_init[0m:[36m102[0m - [1mInitialize ray with args: {'log_to_driver': False}[0m
2024-09-10 14:04:34,674	INFO worker.py:1781 -- St

Compute similarity matrix:   0%|          | 0/61 [00:00<?, ?it/s]

[32m2024-09-10 14:04:40.722[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m135[0m - [34m[1mPerformance:[0m
[32m2024-09-10 14:04:40.746[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m149[0m - [34m[1mSave model...[0m
[32m2024-09-10 14:04:40.893[0m | [1mINFO    [0m | [36mtum.sm.dsl.main[0m:[36mgen_can_graph[0m:[36m133[0m - [1mCandidate Graph with: 13 nodes and 23 edges[0m


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1320
           1       1.00      0.95      0.97        60

    accuracy                           1.00      1380
   macro avg       1.00      0.97      0.99      1380
weighted avg       1.00      1.00      1.00      1380

Performance of CMMI
SmPrecisionRecallF1Output(precision=0.6666666666666666, recall=0.6666666666666666, f1=0.6666666666666666, bijection=<sm.evaluation.sm_metrics.PartialBijection object at 0x332367f40>, n_corrects=8.0, n_examples=12, n_predictions=12, gold_triples={NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/row_index', target_id=2), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/commodity', target_id=11), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/resource_tonnage', target_id=10), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/mineral_inventory', 

[32m2024-09-10 14:04:40.942[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m274[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-09-10 14:04:40.943[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m276[0m - [34m[1mInitializing argument parser...[0m
[32m2024-09-10 14:04:40.943[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m296[0m - [34m[1mConstructing the actor...[0m
[32m2024-09-10 14:04:40.985[0m | [34m[1mDEBUG   [0m | [36mMinmodGraphGenerationActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /Users/rook/workspace/darpa-criticalmaas/data/ream/MinmodGraphGenerationActor/v113/001[0m


Compute similarity matrix:   0%|          | 0/64 [00:00<?, ?it/s]

[32m2024-09-10 14:04:42.510[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m135[0m - [34m[1mPerformance:[0m
[32m2024-09-10 14:04:42.532[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m149[0m - [34m[1mSave model...[0m


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1386
           1       0.97      0.97      0.97        63

    accuracy                           1.00      1449
   macro avg       0.98      0.98      0.98      1449
weighted avg       1.00      1.00      1.00      1449



[32m2024-09-10 14:04:42.965[0m | [1mINFO    [0m | [36mtum.sm.dsl.main[0m:[36mgen_can_graph[0m:[36m133[0m - [1mCandidate Graph with: 10 nodes and 17 edges[0m
[32m2024-09-10 14:04:42.994[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m274[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-09-10 14:04:42.994[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m276[0m - [34m[1mInitializing argument parser...[0m
[32m2024-09-10 14:04:42.995[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m296[0m - [34m[1mConstructing the actor...[0m
[32m2024-09-10 14:04:43.034[0m | [34m[1mDEBUG   [0m | [36mMinmodGraphGenerationActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /Users/rook/workspace/darpa-criticalmaas/data/ream/MinmodGraphGenerationActor/v113/002[0m


Performance of Mudd-and-Jowitt-2017-Zinc
SmPrecisionRecallF1Output(precision=1.0, recall=1.0, f1=1.0, bijection=<sm.evaluation.sm_metrics.PartialBijection object at 0x3b57de1a0>, n_corrects=9.0, n_examples=9, n_predictions=9, gold_triples={NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/resource_tonnage', target_id=7), NodeTriple(source_id=0, link_label='http://www.w3.org/2000/01/rdf-schema#label', target_id=3), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/deposit_type', target_id=6), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/mineral_inventory', target_id=1), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/deposit_type', target_id=5), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/country', target_id=4), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/grade', target_id=9), NodeTriple(source_id=0, link_label='https://

Compute similarity matrix:   0%|          | 0/60 [00:00<?, ?it/s]

[32m2024-09-10 14:04:44.374[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m135[0m - [34m[1mPerformance:[0m
[32m2024-09-10 14:04:44.393[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m149[0m - [34m[1mSave model...[0m


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1180
           1       1.00      0.97      0.98        59

    accuracy                           1.00      1239
   macro avg       1.00      0.98      0.99      1239
weighted avg       1.00      1.00      1.00      1239



[32m2024-09-10 14:04:45.989[0m | [1mINFO    [0m | [36mtum.sm.dsl.main[0m:[36mgen_can_graph[0m:[36m133[0m - [1mCandidate Graph with: 14 nodes and 25 edges[0m
[32m2024-09-10 14:04:46.032[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m274[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-09-10 14:04:46.033[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m276[0m - [34m[1mInitializing argument parser...[0m
[32m2024-09-10 14:04:46.033[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m296[0m - [34m[1mConstructing the actor...[0m
[32m2024-09-10 14:04:46.072[0m | [34m[1mDEBUG   [0m | [36mMinmodGraphGenerationActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /Users/rook/workspace/darpa-criticalmaas/data/ream/MinmodGraphGenerationActor/v113/003[0m


Performance of Mudd-and-Jowitt-2018-Copper
SmPrecisionRecallF1Output(precision=0.5384615384615384, recall=0.5384615384615384, f1=0.5384615384615384, bijection=<sm.evaluation.sm_metrics.PartialBijection object at 0x3b5ca5f60>, n_corrects=7.0, n_examples=13, n_predictions=13, gold_triples={NodeTriple(source_id=0, link_label='http://www.w3.org/2000/01/rdf-schema#label', target_id=3), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/deposit_type', target_id=6), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/resource_commodity', target_id=10), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/reserve_tonnage', target_id=9), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/mineral_inventory', target_id=1), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/deposit_type', target_id=5), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/countr

Compute similarity matrix:   0%|          | 0/61 [00:00<?, ?it/s]

[32m2024-09-10 14:04:47.652[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m135[0m - [34m[1mPerformance:[0m
[32m2024-09-10 14:04:47.671[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m149[0m - [34m[1mSave model...[0m


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1320
           1       0.95      0.95      0.95        60

    accuracy                           1.00      1380
   macro avg       0.97      0.97      0.97      1380
weighted avg       1.00      1.00      1.00      1380



[32m2024-09-10 14:04:48.950[0m | [1mINFO    [0m | [36mtum.sm.dsl.main[0m:[36mgen_can_graph[0m:[36m133[0m - [1mCandidate Graph with: 17 nodes and 31 edges[0m
[32m2024-09-10 14:04:49.009[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m274[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-09-10 14:04:49.009[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m276[0m - [34m[1mInitializing argument parser...[0m
[32m2024-09-10 14:04:49.009[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m296[0m - [34m[1mConstructing the actor...[0m
[32m2024-09-10 14:04:49.057[0m | [34m[1mDEBUG   [0m | [36mMinmodGraphGenerationActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /Users/rook/workspace/darpa-criticalmaas/data/ream/MinmodGraphGenerationActor/v113/004[0m


Performance of Mudd-and-Jowitt-2022-Nickel
SmPrecisionRecallF1Output(precision=0.5625, recall=0.75, f1=0.6428571428571429, bijection=<sm.evaluation.sm_metrics.PartialBijection object at 0x3b63983a0>, n_corrects=9.0, n_examples=12, n_predictions=16, gold_triples={NodeTriple(source_id=0, link_label='http://www.w3.org/2000/01/rdf-schema#label', target_id=3), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/grade', target_id=16), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/deposit_type', target_id=12), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/commodity', target_id=14), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/mineral_inventory', target_id=1), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/latitude', target_id=4), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/tonnage', target_id=15), NodeTriple(source_id=0, link

Compute similarity matrix:   0%|          | 0/64 [00:00<?, ?it/s]

[32m2024-09-10 14:04:50.471[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m135[0m - [34m[1mPerformance:[0m
[32m2024-09-10 14:04:50.491[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m149[0m - [34m[1mSave model...[0m
[32m2024-09-10 14:04:50.610[0m | [1mINFO    [0m | [36mtum.sm.dsl.main[0m:[36mgen_can_graph[0m:[36m133[0m - [1mCandidate Graph with: 10 nodes and 17 edges[0m
[32m2024-09-10 14:04:50.648[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m274[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-09-10 14:04:50.648[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m276[0m - [34m[1mInitializing argument parser...[0m
[32m2024-09-10 14:04:50.649[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m296[0m - [34m[1mConstructing the actor...[0m


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1386
           1       1.00      0.98      0.99        63

    accuracy                           1.00      1449
   macro avg       1.00      0.99      1.00      1449
weighted avg       1.00      1.00      1.00      1449

Performance of Nevada_Lithium_Resources_and_Reserves_March_2024_update
SmPrecisionRecallF1Output(precision=0.7777777777777778, recall=0.7777777777777778, f1=0.7777777777777778, bijection=<sm.evaluation.sm_metrics.PartialBijection object at 0x3b601bd00>, n_corrects=7.0, n_examples=9, n_predictions=9, gold_triples={NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/reserve_tonnage', target_id=6), NodeTriple(source_id=0, link_label='http://www.w3.org/2000/01/rdf-schema#label', target_id=2), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/mineral_inventory', target_id=1), NodeTriple(source_id=1, link_label='https://min

[32m2024-09-10 14:04:50.692[0m | [34m[1mDEBUG   [0m | [36mMinmodGraphGenerationActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /Users/rook/workspace/darpa-criticalmaas/data/ream/MinmodGraphGenerationActor/v113/005[0m


Compute similarity matrix:   0%|          | 0/64 [00:00<?, ?it/s]

[32m2024-09-10 14:04:52.328[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m135[0m - [34m[1mPerformance:[0m
[32m2024-09-10 14:04:52.348[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m149[0m - [34m[1mSave model...[0m


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1386
           1       1.00      0.95      0.98        63

    accuracy                           1.00      1449
   macro avg       1.00      0.98      0.99      1449
weighted avg       1.00      1.00      1.00      1449



[32m2024-09-10 14:04:52.564[0m | [1mINFO    [0m | [36mtum.sm.dsl.main[0m:[36mgen_can_graph[0m:[36m133[0m - [1mCandidate Graph with: 10 nodes and 17 edges[0m
[32m2024-09-10 14:04:52.608[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m274[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-09-10 14:04:52.609[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m276[0m - [34m[1mInitializing argument parser...[0m
[32m2024-09-10 14:04:52.609[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m296[0m - [34m[1mConstructing the actor...[0m
[32m2024-09-10 14:04:52.649[0m | [34m[1mDEBUG   [0m | [36mMinmodGraphGenerationActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /Users/rook/workspace/darpa-criticalmaas/data/ream/MinmodGraphGenerationActor/v113/006[0m


Performance of Weng_et_al_2015_REE
SmPrecisionRecallF1Output(precision=0.8888888888888888, recall=0.8888888888888888, f1=0.8888888888888888, bijection=<sm.evaluation.sm_metrics.PartialBijection object at 0x3b6218b80>, n_corrects=8.0, n_examples=9, n_predictions=9, gold_triples={NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/row_index', target_id=2), NodeTriple(source_id=0, link_label='http://www.w3.org/2000/01/rdf-schema#label', target_id=3), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/resource_tonnage', target_id=7), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/deposit_type', target_id=6), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/mineral_inventory', target_id=1), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/grade', target_id=9), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/state_or_province', target_id=4)

Compute similarity matrix:   0%|          | 0/58 [00:00<?, ?it/s]

[32m2024-09-10 14:04:53.954[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m135[0m - [34m[1mPerformance:[0m
[32m2024-09-10 14:04:53.972[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m149[0m - [34m[1mSave model...[0m


              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1083
           1       0.89      0.96      0.92        57

    accuracy                           0.99      1140
   macro avg       0.94      0.98      0.96      1140
weighted avg       0.99      0.99      0.99      1140



[32m2024-09-10 14:04:54.903[0m | [1mINFO    [0m | [36mtum.sm.dsl.main[0m:[36mgen_can_graph[0m:[36m133[0m - [1mCandidate Graph with: 16 nodes and 29 edges[0m


Performance of World_Tin_Tungsten_Deposit
SmPrecisionRecallF1Output(precision=0.6, recall=0.6, f1=0.6, bijection=<sm.evaluation.sm_metrics.PartialBijection object at 0x3b6398970>, n_corrects=9.0, n_examples=15, n_predictions=15, gold_triples={NodeTriple(source_id=0, link_label='http://www.w3.org/2000/01/rdf-schema#label', target_id=3), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/cutoff_grade', target_id=13), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/record_id', target_id=2), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/category', target_id=9), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/mineral_inventory', target_id=1), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/country', target_id=4), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/date', target_id=15), NodeTriple(source_id=1, link_label='https://minmod.

In [6]:
print(PrecisionRecallF1.avg([x[-1] for x in out]))

PrecisionRecallF1(precision=0.7191849816849816, recall=0.7459706959706959, f1=0.7306645735217163)


In [10]:
from sm.evaluation.sm_metrics import NodeTriple

In [25]:
ytrue = []
ypred = []

for item in out:
    x = item[-1]

    pred = {
        NodeTriple(
            x.bijection.prime2x[triple.source_id],
            triple.link_label,
            x.bijection.prime2x[triple.target_id],
        )
        for triple in x.pred_triples
    }
    gold = x.gold_triples

    pred = {(t.source_id, t.target_id): t.link_label for t in pred}
    gold = {(t.source_id, t.target_id): t.link_label for t in gold}

    for t, l in gold.items():
        if t in pred:
            ytrue.append(l)
            ypred.append(pred[t])
    for t, l in pred.items():
        if t not in gold:
            ytrue.append("")
            ypred.append(l)

In [22]:
from sklearn.metrics import precision_recall_fscore_support, classification_report

In [31]:
{triple.link_label for x in out for triple in x[-1].gold_triples}

{'http://www.w3.org/2000/01/rdf-schema#label',
 'https://minmod.isi.edu/ontology-simple/category',
 'https://minmod.isi.edu/ontology-simple/commodity',
 'https://minmod.isi.edu/ontology-simple/country',
 'https://minmod.isi.edu/ontology-simple/cutoff_grade',
 'https://minmod.isi.edu/ontology-simple/cutoff_grade_unit',
 'https://minmod.isi.edu/ontology-simple/date',
 'https://minmod.isi.edu/ontology-simple/deposit_type',
 'https://minmod.isi.edu/ontology-simple/grade',
 'https://minmod.isi.edu/ontology-simple/grade_unit',
 'https://minmod.isi.edu/ontology-simple/latitude',
 'https://minmod.isi.edu/ontology-simple/longitude',
 'https://minmod.isi.edu/ontology-simple/mineral_inventory',
 'https://minmod.isi.edu/ontology-simple/record_id',
 'https://minmod.isi.edu/ontology-simple/reserve_commodity',
 'https://minmod.isi.edu/ontology-simple/reserve_grade',
 'https://minmod.isi.edu/ontology-simple/reserve_tonnage',
 'https://minmod.isi.edu/ontology-simple/resource_commodity',
 'https://minmo

In [29]:
[x for x in ytrue if x.endswith("record_id")]

[]

In [27]:
print(classification_report(ytrue, ypred, digits=4))

                                                           precision    recall  f1-score   support

                                                              0.0000    0.0000    0.0000        14
               http://www.w3.org/2000/01/rdf-schema#label     0.7000    1.0000    0.8235         7
          https://minmod.isi.edu/ontology-simple/category     1.0000    1.0000    1.0000         2
         https://minmod.isi.edu/ontology-simple/commodity     0.5000    1.0000    0.6667         5
           https://minmod.isi.edu/ontology-simple/country     0.8571    1.0000    0.9231         6
      https://minmod.isi.edu/ontology-simple/cutoff_grade     0.6667    1.0000    0.8000         2
 https://minmod.isi.edu/ontology-simple/cutoff_grade_unit     0.0000    0.0000    0.0000         0
              https://minmod.isi.edu/ontology-simple/date     1.0000    0.5000    0.6667         2
      https://minmod.isi.edu/ontology-simple/deposit_type     1.0000    0.9091    0.9524        11
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
out[0][-1].pred_triples

{NodeTriple(source_id=0, link_label='http://www.w3.org/2000/01/rdf-schema#label', target_id=4),
 NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/country', target_id=5),
 NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/deposit_type', target_id=7),
 NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/mineral_inventory', target_id=1),
 NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/row_index', target_id=2),
 NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/commodity', target_id=11),
 NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/commodity', target_id=3),
 NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/commodity', target_id=6),
 NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/grade', target_id=12),
 NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/grade

In [4]:
test_ex = test_exs[-1]

In [5]:
actor, sm, train_exs = predict(test_ex)

[32m2024-09-10 12:17:49.557[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m274[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-09-10 12:17:49.558[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m276[0m - [34m[1mInitializing argument parser...[0m
[32m2024-09-10 12:17:49.558[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m296[0m - [34m[1mConstructing the actor...[0m
[32m2024-09-10 12:17:49.604[0m | [34m[1mDEBUG   [0m | [36mMinmodGraphGenerationActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /Users/rook/workspace/darpa-criticalmaas/data/ream/MinmodGraphGenerationActor/v112/013[0m
[32m2024-09-10 12:17:49.734[0m | [1mINFO    [0m | [36msm.misc.ray_helper[0m:[36mray_init[0m:[36m102[0m - [1mInitialize ray with args: {'log_to_driver': False}[0m
2024-09-10 12:17:50,791	INFO worker.py:1781 -- St

Compute similarity matrix:   0%|          | 0/58 [00:00<?, ?it/s]

[32m2024-09-10 12:17:56.795[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m135[0m - [34m[1mPerformance:[0m
[32m2024-09-10 12:17:56.817[0m | [34m[1mDEBUG   [0m | [36mdsl.dsl[0m:[36mtrain_model[0m:[36m149[0m - [34m[1mSave model...[0m


              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1083
           1       0.89      0.96      0.92        57

    accuracy                           0.99      1140
   macro avg       0.94      0.98      0.96      1140
weighted avg       0.99      0.99      0.99      1140



[32m2024-09-10 12:17:57.759[0m | [1mINFO    [0m | [36mtum.sm.dsl.main[0m:[36mgen_can_graph[0m:[36m133[0m - [1mCandidate Graph with: 16 nodes and 29 edges[0m


Performance of World_Tin_Tungsten_Deposit
SmPrecisionRecallF1Output(precision=0.6, recall=0.6, f1=0.6, bijection=<sm.evaluation.sm_metrics.PartialBijection object at 0x336d739d0>, n_corrects=9.0, n_examples=15, n_predictions=15, gold_triples={NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/category', target_id=9), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/latitude', target_id=6), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/commodity', target_id=10), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/cutoff_grade_unit', target_id=14), NodeTriple(source_id=0, link_label='https://minmod.isi.edu/ontology-simple/longitude', target_id=5), NodeTriple(source_id=1, link_label='https://minmod.isi.edu/ontology-simple/cutoff_grade', target_id=13), NodeTriple(source_id=0, link_label='http://www.w3.org/2000/01/rdf-schema#label', target_id=3), NodeTriple(source_id=1, link_label='https://

In [7]:
sm.print(env="notebook")

HTML(value='<pre>\n00.\t<span style="background: #b7eb8f; color: black; padding: 2px; border-radius: 3px;">[0]…

In [8]:
dsl = actor.graphinfer_actor.cangraph_actor.get_dsl()

In [9]:
dsl_main.get_semantic_types(dsl, test_ex.table.table, top_n=2)

[[SemanticTypePrediction(stype=SType(mos:MineralInventory--mos:resource_grade), score=0.14926494223508927, col_index=0, col_name='deposit_no'),
  SemanticTypePrediction(stype=SType(mos:MineralInventory--mos:reserve_grade), score=0.07508409202491795, col_index=0, col_name='deposit_no')],
 [SemanticTypePrediction(stype=SType(mos:MineralSite--rdfs:label), score=0.8681363636363638, col_index=1, col_name='deposit_name'),
  SemanticTypePrediction(stype=SType(mos:MineralInventory--mos:category), score=0.16024576682619857, col_index=1, col_name='deposit_name')],
 [SemanticTypePrediction(stype=SType(mos:MineralSite--mos:country), score=0.89, col_index=2, col_name='country'),
  SemanticTypePrediction(stype=SType(mos:MineralSite--mos:deposit_type), score=0.16024576682619857, col_index=2, col_name='country')],
 [SemanticTypePrediction(stype=SType(mos:MineralSite--mos:longitude), score=0.16024576682619857, col_index=3, col_name='longdd'),
  SemanticTypePrediction(stype=SType(mos:MineralInventory--m

In [10]:
from sm.dataset import Example

ex = Example(
    id=test_ex.id,
    table=dsl_main.DSLTable.from_column_based_table(test_ex.table.table),
    sms=[],
)

stypes = dsl([ex], top_n=1)[0]

In [11]:
from dsl.dsl import DSLConfig, sample_table_data

In [12]:
cfg = DSLConfig.get_instance()
columns = ex.table.columns
simmatrix = dsl.stype_db.get_similarity_matrix(columns, verbose=True)

Compute similarity matrix:   0%|          | 0/14 [00:00<?, ?it/s]

In [13]:
ex.table.columns[6]

DSLColumn(id='World_Tin_Tungsten_Deposit:6:tonnage_unit', table_id='World_Tin_Tungsten_Deposit', col_index=6, col_name='tonnage_unit', type=<ColumnType.STRING: 'string'>, type_stats={<ColumnType.NUMBER: 'number'>: 0.0, <ColumnType.STRING: 'string'>: 0.7048237940514871, <ColumnType.DATETIME: 'datetime'>: 0.0, <ColumnType.NULL: 'null'>: 0.29517620594851285}, size=4001, num_array=[], num_idx_array=[], str_array=['metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'million metric tons', 'metric tons', 'metric tons', 'million metric tons', 'million metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'million metric tons', 'million metric tons', 'million metric tons', 'metric tons', 'metric tons', 'million metri

In [15]:
i = 10
print(ex.table.columns[i].col_name)
dsl.pred_type(i, ex.table.columns[i].col_name, 20, simmatrix)

grade_unit


[DSLPrediction(semantic_type=SType(mos:MineralInventory--mos:commodity), score=0.36309720373599447),
 DSLPrediction(semantic_type=SType(mos:MineralSite--mos:deposit_type), score=0.07631666664191615),
 DSLPrediction(semantic_type=SType(mos:MineralSite--mos:record_id), score=0.07631666664191615),
 DSLPrediction(semantic_type=SType(mos:MineralSite--mos:state_or_province), score=0.07631666664191615),
 DSLPrediction(semantic_type=SType(mos:MineralInventory--mos:category), score=0.07631666664191615),
 DSLPrediction(semantic_type=SType(mos:MineralInventory--mos:reserve_commodity), score=0.07631666664191615),
 DSLPrediction(semantic_type=SType(mos:MineralSite--rdfs:label), score=0.07631666664191615),
 DSLPrediction(semantic_type=SType(mos:MineralSite--mos:country), score=0.07631666664191615),
 DSLPrediction(semantic_type=SType(mos:MineralInventory--mos:resource_commodity), score=0.07631666664191615),
 DSLPrediction(semantic_type=SType(mos:MineralInventory--mos:grade), score=0.01),
 DSLPredicti

In [32]:
def find_train_cols(cls, pred):
    out = []
    for i in range(len(dsl.stype_db.train_column_stypes)):
        stype = dsl.stype_db.train_column_stypes[i].type
        if stype.class_rel_uri == cls and stype.predicate_rel_uri == pred:
            out.append(
                {
                    "col": i,
                    "name": [c.col_name for c in dsl.stype_db.train_columns[i].cols],
                }
            )
    return pd.DataFrame(out)

In [40]:
pd.set_option("display.max_colwidth", None)

In [41]:
traincols = find_train_cols("mos:MineralSite", "mos:deposit_type")
traincols = find_train_cols("mos:MineralInventory", "mos:resource_tonnage")
traincols

Unnamed: 0,col,name
0,2,"[Total Mineral Resource Mt (Tonnes of Million), Mt Ore, Mt ore, Mt Min. ResC, Resource Tonnage (Mt)]"


In [15]:
traincols = find_train_cols("mos:MineralSite", "mos:record_id")
traincols

Unnamed: 0,col,name
0,4,[DEPOSIT_UID]


In [44]:
dsl.stype_db.train_columns[2].cols[0]

DSLColumn(id='::Weng_et_al_2015_REE::--5:0:Total Mineral Resource Mt (Tonnes of Million)', table_id='::Weng_et_al_2015_REE::--5', col_index=0, col_name='Total Mineral Resource Mt (Tonnes of Million)', type=<ColumnType.STRING: 'string'>, type_stats={<ColumnType.NUMBER: 'number'>: 0.0, <ColumnType.STRING: 'string'>: 1.0, <ColumnType.DATETIME: 'datetime'>: 0.0, <ColumnType.NULL: 'null'>: 0.0}, size=3, num_array=[], num_idx_array=[], str_array=['::Mudd-and-Jowitt-2018-Copper::\nMt Min. ResC\n\n2203.556\n0.485\n2.95\n64.28571429\n3073.0\n64.2\n1160.0\n1.0\n13.41\n6.0\n4.34\n6.154\n0.515\n12.92\n1.49\n1.484422\n1786.376364\n44.5\n3.813\n1910.0\n201.81\n8.763\n29.4\n11.87\n5.0\n4.006\n1257.4\n1.569036874\n15.2\n965.0\n0.658\n16.4\n10.0\n2.855\n76.4\n5446.143\n1333.333333\n0.523849\n54.57\n25.4\n1160.0\n0.05\n3.091\n375.0\n1786.376364\n81.5\n202.138\n42.0\n637.0\n29.19\n1.075\n23.26\n2.57\n0.215534\n76.4\n448.0\n48.038\n11.6\n1256.992\n175.7\n352.6\n0.57\n0.03\n19.390243902438996\n1008.0\n0.39

In [37]:
ex.table.columns[6]

DSLColumn(id='World_Tin_Tungsten_Deposit:6:tonnage_unit', table_id='World_Tin_Tungsten_Deposit', col_index=6, col_name='tonnage_unit', type=<ColumnType.STRING: 'string'>, type_stats={<ColumnType.NUMBER: 'number'>: 0.0, <ColumnType.STRING: 'string'>: 0.7048237940514871, <ColumnType.DATETIME: 'datetime'>: 0.0, <ColumnType.NULL: 'null'>: 0.29517620594851285}, size=4001, num_array=[], num_idx_array=[], str_array=['metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'million metric tons', 'metric tons', 'metric tons', 'million metric tons', 'million metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'metric tons', 'million metric tons', 'million metric tons', 'million metric tons', 'metric tons', 'metric tons', 'million metri

In [39]:
traincols.col.to_list()

[2]

In [36]:
simmatrix[6][traincols.col.to_list()]

array([[0.25      , 0.16666667, 0.        , 0.        , 0.        ,
        0.        , 0.01903504, 1.        , 0.70482379]])

In [17]:
simmatrix[0][traincols.col.to_list()]

array([[0.33333333, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        ]])

In [18]:
dsl.get_model().predict_proba(simmatrix[1][traincols.col.to_list()])

array([[0.97865656, 0.02134344]])

In [19]:
sm.print()


00.	[0] mos:MineralSite
01.	├── ─[2: rdfs:label]→ [3] deposit_name (column 1)
02.	├── ─[3: mos:country]→ [4] country (column 2)
03.	├── ─[0: mos:mineral_inventory]→ [1] mos:MineralInventory
04.	│   ├── ─[8: mos:category]→ [9] category (column 7)
05.	│   ├── ─[9: mos:commodity]→ [10] commodity (column 8)
06.	│   ├── ─[12: mos:cutoff_grade]→ [13] cutoff_grade (column 11)
07.	│   ├── ─[11: mos:grade]→ [12] grade_unit (column 10)
08.	│   ├── ─[13: mos:grade]→ [14] cutoff_grade_unit (column 12)
09.	│   ├── ─[6: mos:reserve_tonnage]→ [7] tonnage (column 5)
10.	│   ├── ─[10: mos:resource_grade]→ [11] grade (column 9)
11.	│   ├── ─[7: mos:resource_tonnage]→ [8] tonnage_unit (column 6)
12.	├── ─[1: mos:record_id]→ [2] deposit_no (column 0)
13.	├── ─[4: mos:record_id]→ [5] longdd (column 3)
14.	├── ─[5: mos:record_id]→ [6] latdd (column 4)
15.	├── ─[14: mos:record_id]→ [15] year (column 13)



In [20]:
test_ex.sms[0].print()


00.	[0] Mineral Site
01.	├── ─[2: label]→ [3] deposit_name (column 1)
02.	├── ─[3: country]→ [4] country (column 2)
03.	├── ─[5: latitude]→ [6] latdd (column 4)
04.	├── ─[4: longitude]→ [5] longdd (column 3)
05.	├── ─[0: mineral inventory]→ [1] Mineral Inventory
06.	│   ├── ─[8: category]→ [9] category (column 7)
07.	│   ├── ─[9: commodity]→ [10] commodity (column 8)
08.	│   ├── ─[12: cutoff grade]→ [13] cutoff_grade (column 11)
09.	│   ├── ─[13: cutoff grade unit]→ [14] cutoff_grade_unit (column 12)
10.	│   ├── ─[14: date]→ [15] year (column 13)
11.	│   ├── ─[10: grade]→ [11] grade (column 9)
12.	│   ├── ─[11: grade unit]→ [12] grade_unit (column 10)
13.	│   ├── ─[6: tonnage]→ [7] tonnage (column 5)
14.	│   ├── ─[7: tonnage unit]→ [8] tonnage_unit (column 6)
15.	├── ─[1: record id]→ [2] deposit_no (column 0)



In [21]:
def pred_type(
    self,
    target_col_index: int,
    target_col_id: str,
    top_n: int,
    similarity_matrix,
):
    X = []
    refcols = [
        refcol for refcol in self.stype_db.train_columns if refcol.id != target_col_id
    ]
    for refcol in refcols:
        iref = self.stype_db.col2idx[refcol.id]
        X.append(similarity_matrix[target_col_index, iref])

    result = self.get_model().predict_proba(X)[:, 1]
    result = sorted(
        zip(result, (rc.id for rc in refcols)),
        key=lambda x: x[0],
        reverse=True,
    )

    print(result)

In [22]:
pred_type(dsl, 1, ex.table.columns[1].col_name, 20, simmatrix)

AttributeError: 'DSLColumnGroup' object has no attribute 'id'

In [None]:
ex.table.columns[1]

In [None]:
dsl.stype_db.train_columns[5]

In [None]:
dsl.stype_db.col2idx[
    "::Nevada_Lithium_Resources_and_Reserves_March_2024_update::--10:0:Type"
]

In [None]:
dsl.stype_db.col2idx[
    "::Nevada_Lithium_Resources_and_Reserves_March_2024_update::--2:0:Reserve Grade (ppm)"
]

In [None]:
train_exs[4].table.table.columns[5].values

In [None]:
[c.name for c in train_exs[4].table.table.columns]