In [1]:
import os, sys
# Put the parent directory (relative to the current working directory) first on
# the import path so the `src.*` imports in the next cell can be resolved.
sys.path.insert(0, './../')

In [2]:
import pandas as pd
from src.parsers.t5parser import T5Parser
from src.utils.ted_utils import StatementTree, TSS
# NOTE(review): the commented-out import below is truncated and unused in this
# notebook — candidate for removal.
# from utils.tree_utils import T/
# Parser used below to turn markdown-table predictions into DataFrames
# (via `convert_markdown_to_dataframe`).
# NOTE(review): 'ud2sd_table' presumably selects the task/format — confirm against T5Parser.
parser = T5Parser('ud2sd_table')

# Example Statement Trees

In [3]:
# Two example predictions, each one a markdown table with the columns
# subject / property / property_value / subject_value.
# Every literal is split per table row for readability; adjacent string
# literals are concatenated, so the resulting strings are unchanged.
example_prediction = [
    (
        '| subject | property | property_value | subject_value | \n'
        ' |-----------|-------------------|------------------|-----------------| \n'
        ' | | scope 1 emissions | 5,558 | | \n'
        ' | | unit | mt | | \n'
        ' | | other | 2021 | | '
    ),
    (
        ' | subject | property | property_value | subject_value | \n'
        ' |-----------|-------------------|------------------|-----------------| \n'
        ' | | scope 2 emissions | 41,757 | | \n'
        ' | | unit | mt | | \n'
        ' | | other | 2021 | |'
    ),
]

In [4]:
# Let's look at one of the statements: parse the first prediction's markdown
# table into a DataFrame and display it.
df = parser.convert_markdown_to_dataframe(example_prediction[0])
df

Unnamed: 0,subject,property,property_value,subject_value
0,,scope 1 emissions,5558,
1,,unit,mt,
2,,other,2021,


In [5]:
# Parse every example prediction into a DataFrame, build a single statement
# tree from all of them, and print it.
dfs = list(map(parser.convert_markdown_to_dataframe, example_prediction))
s = StatementTree(dfs)
s.print_tree()

Node('/root', type='root', value=None)
├── Node('/root/s0', type='statement', value=None)
│   ├── Node('/root/s0/p0', type='predicate', value=None)
│   │   ├── Node('/root/s0/p0/subject', type='subject', value=nan)
│   │   ├── Node('/root/s0/p0/property', type='property', value='scope 1 emissions')
│   │   ├── Node('/root/s0/p0/property_value', type='property_value', value='5,558')
│   │   └── Node('/root/s0/p0/subject_value', type='subject_value', value=nan)
│   ├── Node('/root/s0/p1', type='predicate', value=None)
│   │   ├── Node('/root/s0/p1/subject', type='subject', value=nan)
│   │   ├── Node('/root/s0/p1/property', type='property', value='unit')
│   │   ├── Node('/root/s0/p1/property_value', type='property_value', value='mt')
│   │   └── Node('/root/s0/p1/subject_value', type='subject_value', value=nan)
│   └── Node('/root/s0/p2', type='predicate', value=None)
│       ├── Node('/root/s0/p2/subject', type='subject', value=nan)
│       ├── Node('/root/s0/p2/property', type='proper

# Example Tree Similarity Score

In [6]:
# Let's calculate the TSS of the tree `s` (built above) with itself; we expect it to be 1.
# To do so, we will pass the root node of the tree as both t1 and t2.
# The root node is:
s.root

Node('/root', type='root', value=None)

In [7]:
# Score the tree against itself: the same root node is used for both sides.
tss = TSS(
    node_t1=s.root,
    node_t2=s.root,
    include_subjects=True,
)
tss.get_tree_similarity()

1

In [8]:
# Consider the children of the root node (one statement node per parsed prediction):
s.root.children

(Node('/root/s0', type='statement', value=None),
 Node('/root/s1', type='statement', value=None))

In [9]:
# Let's compute the TSS between these two child trees. To do so, we pass the
# two child nodes as the starting points (roots) of the two trees to compare.
tss = TSS(
    node_t1=s.root.children[0],
    node_t2=s.root.children[1],
    include_subjects=True,
)
tss.get_tree_similarity()

0.6372549019607843

In [10]:
# The two trees are about 64% similar (TSS ≈ 0.637). Let's try to understand this number.
# get_tree_edit_distance() returns the distance together with a summary of the edits.
ted, edits = tss.get_tree_edit_distance()
print(
    f"The edit distance is : {ted}",
    f"The edits are: {edits}",
    sep="\n",
)

The edit distance is : 0.7254901960784313
The edits are: {'delete': 0, 'insert': 0, 'rename': 2}


In [11]:
# First, let's call the explain_tree_edit_distance method to see which edits
# make up the Tree Edit Distance and how much each one contributes.
tss.explain_tree_edit_distance(verbose=True)

Step: 4: === Rename. Contribution to distance: 0.058823529411764705
(Node('/root/s0/p0/property', type='property', value='scope 1 emissions'), Node('/root/s1/p0/property', type='property', value='scope 2 emissions'))
Step: 5: === Rename. Contribution to distance: 0.6666666666666666
(Node('/root/s0/p0/property_value', type='property_value', value='5,558'), Node('/root/s1/p0/property_value', type='property_value', value='41,757'))
 ===== 
Missing steps may be due to: (1) nodes with none values or (2) rename of node with same type and value but different location in tree. To see all steps anyway, use 'explain_tree_edit_distance' and set vv to True.
  ===== 



{'delete': 0, 'insert': 0, 'rename': 2}

In [12]:
# The two edits reported above are renames:
#   'scope 1 emissions' -> 'scope 2 emissions'
#   '5,558'             -> '41,757'
# Each of these string edits contributes its normalized Levenshtein edit
# distance: the number of character edits divided by the string length.

# Property: the numerator is 1 because only one character was changed.
contribution_1 = 1 / len('scope 2 emissions')

# Property value: the numerator is 4 because four character edits are needed.
contribution_2 = 4 / len('41,757')

print(
    f"Edit distance for property: {contribution_1}",
    f"Edit distance for property value : {contribution_2}",
    sep="\n",
)

Edit distance for property: 0.058823529411764705
Edit distance for property value : 0.6666666666666666


In [13]:
# Reproduce the library's numbers by hand from the two rename contributions.
# Distinct names are used on purpose: `tss` still holds the TSS object and
# `ted` the library's edit distance from the earlier cells. Rebinding `tss`
# to a float would make those cells fail (AttributeError) when re-run.
manual_ted = contribution_1 + contribution_2
normalized_ted = manual_ted / 2  # total edits = 2 (the two renames)
manual_tss = 1 - normalized_ted
# Expected to match the library values: TED ≈ 0.7255 and TSS ≈ 0.6373.
print(manual_ted, normalized_ted, manual_tss)

0.7254901960784313 0.3627450980392157 0.6372549019607843
