# ASTrED playground

In [22]:
from detectors.syntax_sim import astredRunner, astredAndre

  from .autonotebook import tqdm as notebook_tqdm


## Metrics

### Metric codes

#### Sacr Cross score

In [25]:
def sacr_cross_score(astred_obj, round_bool=False, round_num=2):
    ''''''
    # The final SACr value is the number of crossing alignment links between 
    # the source and target SACr groups, normalised by the number of these alignments. ~Vanroy et al.
    score = len(astred_obj.no_null_word_pairs) / astred_obj.src.sacr_cross
    if round_bool:
        return round(score, round_num)
    else:
        return score

#### Label Change Score

In [26]:
def label_changes_score(astred_obj, round_bool=False, round_num=2, verbose=False):
    ''''''
    # We look at each source word and compare its label 
    # to the labels of the words that it is aligned to.
    # These label changes are then normalised by 
    # the total number of alignments ~Vanroy et al.
    change_list = []
    for src, tgt in astred_obj.no_null_word_pairs:
        if src.deprel == tgt.deprel:
            change = 0
        else:
            change = 1
        change_list.append(change)
        if verbose:
            print(f'\'{src.text}\'({src.deprel}) | \'{tgt.text}\'({tgt.deprel}) | {change}')

    score = sum(change_list) / len(astred_obj.no_null_word_pairs)

    if round_bool:
        score = round(score, round_num)

    if verbose:
        print(f'Total: {sum(change_list)} (normalised: {sum(change_list)} out of {len(astred_obj.no_null_word_pairs)} = {score})')
    
    return score

#### ASTrED Score

In [27]:
from statistics import mean


def astred_score(astred_obj, en_sent, nl_sent, round_bool=False, round_num=2, verbose=False):
    ''''''
    # Use dependency trees with UD labels on grouped source-target tokens to retrieve 
    # the amount of steps necessary for both source and target trees to become the same as the other.
    # Normalised by taking the total scores of all trees by the average of source and target words.
    src_astred_score = []
    tgt_astred_score = []
    for src, tgt in astred_obj.no_null_word_pairs:
        src_astred_score.append(src.tree.astred_cost)
        tgt_astred_score.append(tgt.tree.astred_cost)
        if verbose:
            print(src.text, src.tree.astred_op, tgt.text, tgt.tree.astred_op)

    score = (
        (sum(src_astred_score) + sum(tgt_astred_score)) 
         / mean([len(en_sent.no_null_words), len(nl_sent.no_null_words)])
    )

    if round_bool:
        score = round(score, round_num)

    if verbose:
        print('')

    return score

### Example inputs

In [28]:
sent_dict = {
    'name': 'test_1',
    'en': 'Sometimes she asks me why I used to call her father Harold .',
    'nl': 'Soms vraagt ze waarom ik haar vader Harold noemde .',
    'aligns': '0-0 1-2 2-1 4-3 5-4 8-8 9-5 10-6 11-7 12-9',
}

astred_sacr = astredAndre(
    en=sent_dict['en'],
    nl=sent_dict['nl'],
    aligns=sent_dict['aligns'],
    name=sent_dict['name'],
)

In [30]:
sent_dict = {
    'name': 'test_2',
    'en': 'I saw him .',
    'nl': 'Hij werd door mij gezien .',
    'aligns': '0-2 0-3 1-1 1-4 2-0',
}

astred_label = astredAndre(
    en=sent_dict['en'],
    nl=sent_dict['nl'],
    aligns=sent_dict['aligns'],
    name=sent_dict['name'],
)

In [51]:
sent_dict = {
    'name': 'test_3',
    'en': 'Does he believe in love ?',
    'nl': 'Gelooft hij in de liefde ?',
    'aligns': '0-0 1-1 2-0 3-2 4-3 4-4 5-5',
}

astred_astred_score = astredAndre(
    en=sent_dict['en'],
    nl=sent_dict['nl'],
    aligns=sent_dict['aligns'],
    name=sent_dict['name'],
)

### Example metric outputs

In [81]:
sacr_cross_score(astred_sacr.aligned, round_bool=True)

3.33

In [83]:
label_changes_score(astred_label.aligned, verbose=True)

'I'(nsubj) | 'door'(case) | 1
'I'(nsubj) | 'mij'(obl) | 1
'saw'(root) | 'werd'(aux) | 1
'saw'(root) | 'gezien'(root) | 0
'him'(obj) | 'Hij'(nsubj) | 1
Total: 4 (normalised: 4 out of 5 = 0.8)


0.8

In [86]:
astred_score(astred_astred_score.aligned, astred_astred_score.sent_en, astred_astred_score.sent_nl)

0.3333333333333333

## General testing

In [3]:
import pandas as pd

# for src, tgt in astred.aligned.no_null_word_pairs:
#     print(src.text, tgt.text)

df = pd.DataFrame.from_dict({src.text: [sent_dict['name'], tgt.text, src.deprel, src.cross, src.sacr_group.cross, src.num_changes(), src.tree.astred_op]
                                 for src, tgt in astred.aligned.no_null_word_pairs},
        orient="index",
        columns=["sent_name", "aligned_tgt", "deprel", "cross", "sacr_cross", "dep_changes", "astred_op"])
# df = pd.DataFrame.from_dict({src.text: [tgt.text, 
#                                         src.deprel, tgt.deprel, 
#                                         src.cross, tgt.cross, 
#                                         src.sacr_group.cross, tgt.sacr_group.cross, 
#                                         src.num_changes(), tgt.num_changes(),
#                                         src.tree.astred_op, tgt.tree.astred_op]
#                                  for src, tgt in astred.aligned.no_null_word_pairs},
#         orient="index",
#         columns=[
#             "aligned_tgt",
#             "deprel_src", "deprel_tgt", 
#             "cross_src", "cross_tgt",
#             "sacr_cross_src", "sacr_cross_tgt",
#             "dep_changes_src", "dep_changes_tgt", 
#             "astred_op_src", "astred_op_tgt"
#         ])
df.index.name = 'aligned_src'

display(df)

Unnamed: 0_level_0,sent_name,aligned_tgt,deprel,cross,sacr_cross,dep_changes,astred_op
aligned_src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
The,test_1,de,det,3,3,1,match
show,test_1,voorstelling,nsubj,4,4,1,deletion
is,test_1,is,aux,2,2,1,match
billed,test_1,is,root,2,2,1,rename
as,test_1,is,case,2,2,1,rename
the,test_1,het,det,3,3,0,match
museum,test_1,museum,nmod,3,3,1,match
's,test_1,in,case,4,3,0,deletion
largest,test_1,duurste,obl,4,3,1,deletion
ever,test_1,ooit,advmod,3,2,0,deletion


In [26]:
df_grp = df.reset_index().groupby(['sent_name']).agg(list)

In [27]:
display(df_grp)

Unnamed: 0_level_0,aligned_src,aligned_tgt,deprel,cross,sacr_cross,dep_changes,astred_op
sent_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
test_1,"[The, show, is, billed, as, the, museum, 's, l...","[de, voorstelling, is, is, is, het, museum, in...","[det, nsubj, aux, root, case, det, nmod, case,...","[3, 4, 2, 2, 2, 3, 3, 4, 4, 3, 0]","[3, 4, 2, 2, 2, 3, 3, 3, 3, 2, 0]","[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0]","[match, deletion, match, rename, rename, match..."


In [5]:
df_src, df_tgt = astred.data_frame()
display(df_src)
display(df_tgt)


 data_frame 



Unnamed: 0,deprel,cross,sacr_cross,dep_changes,astred_op
The,det,3,3,1,match
show,nsubj,4,4,1,deletion
is,aux,2,2,1,match
billed,root,2,2,1,rename
as,case,2,2,1,rename
the,det,3,3,0,match
museum,nmod,3,3,1,match
's,case,4,3,0,deletion
largest,obl,4,3,1,deletion
ever,advmod,3,2,0,deletion


Unnamed: 0,deprel,cross,sacr_cross,dep_changes,astred_op
Dit,nsubj,0,0,1,match
is,cop,6,6,3,match
de,det,3,3,0,rename
duurste,amod,4,3,1,deletion
voorstelling,root,4,4,1,rename
ooit,advmod,3,2,0,deletion
in,case,4,3,0,deletion
het,det,3,3,0,match
museum,obl,3,3,1,match
.,punct,0,0,0,match


In [6]:
astred.simple_analysis()
astred.is_changed()
astred.span_root()


 simple_analysis 

Dit The nsubj det
is is cop aux
is billed cop root
is as cop case
de The det det
duurste largest amod obl
voorstelling show root nsubj
ooit ever advmod advmod
in 's case case
het the det det
museum museum obl nmod
. . punct punct

 is_changed 

Dutch: is AUX
Aligned: is AUX False
Aligned: billed VERB True
Aligned: as ADP True

 span_root 

The The
show show
is is
billed billed
as as
the museum museum
's 's
largest largest
ever ever
. .


# Samples new_run

In [9]:
from pathlib import Path
import pandas as pd

new_run_lfa = sorted(Path('../data/2_new_run/').glob('*/**/*.lfa'))
new_run_wa = sorted(Path('../data/2_new_run/').glob('*/**/*.wa'))
with open(new_run_lfa[0], 'r', encoding='utf-8') as f:
    new_run_lfa_f = f.read().splitlines()

new_run_lfa_f = [i.split(' ||| ') for i in new_run_lfa_f if '' not in i.split(' ||| ')]

with open(new_run_wa[0], 'r', encoding='utf-8') as f:
    new_run_wa_f = f.read().splitlines()

new_run_wa_f = [i for i in new_run_wa_f if i != '']



In [16]:
# print(new_run_lfa_f[0], '\n', new_run_wa_f[0])
for i in new_run_wa_f[0].split(' '):
    print(
        new_run_lfa_f[0][0].split(' ')[int(i.split('-')[0])],
        new_run_lfa_f[0][1].split(' ')[int(i.split('-')[1])]
    )
    # print(i)

The De
Inquisition Inquisitie
has heeft
delivered overgedragen
Spain Spanje
to aan
the de
Templars Tempeliers
. .


In [20]:
sent_en = new_run_lfa_f[0][0]
sent_nl = new_run_lfa_f[0][1]
aligns = new_run_wa_f[0]

In [23]:
ac_test = astredAndre(sent_en, sent_nl, aligns, new_run_lfa[0].as_posix())

In [28]:
sacr_cross_score(ac_test.aligned)

9.0

In [31]:
label_changes_score(ac_test.aligned, verbose=True)

'The'(det) | 'De'(fixed) | 1
'Inquisition'(nsubj) | 'Inquisitie'(fixed) | 1
'has'(aux) | 'heeft'(aux) | 0
'delivered'(root) | 'overgedragen'(parataxis) | 1
'Spain'(obj) | 'Spanje'(obj) | 0
'to'(case) | 'aan'(case) | 0
'the'(det) | 'de'(det) | 0
'Templars'(obl) | 'Tempeliers'(obl) | 0
'.'(punct) | '.'(punct) | 0
Total: 3 (normalised: 3 out of 9 = 0.3333333333333333)


0.3333333333333333

In [30]:
astred_score(ac_test.aligned, ac_test.sent_en, ac_test.sent_nl)

0.22857142857142856