In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

import vjp.data as data

## Data distribution

In [None]:
# Load the second-instance samples through the project's data module.
second_instance_samples = data.load_second_instance()

Samples are parsed as Python XML element trees. Queries can be done per-element through [the XPath syntax](https://docs.python.org/3/library/xml.etree.elementtree.html#supported-xpath-syntax). `vjp.data.findall` permits querying an entire list. All results from all queries are added to an output list.

In [None]:
print('Number of instances', len(second_instance_samples))
second_instance_requests = data.findall(second_instance_samples,
                                        ".//partreq[@G='2']/req")[0]
print('Number of second instance requests', len(second_instance_requests))

# `findall` returns the matches together with a mapping; the mapping is
# indexed by decision further down to get back to the originating sample.
decisions, decision_mapping = data.findall(second_instance_samples,
                                           ".//courtdec[@G='2']/dec")
upheld, _ = data.findall(decisions, ".[@E='1']")
rejected, _ = data.findall(decisions, ".[@E='0']")

# One (caption, count) pair per printed line, in display order.
decision_counts = (
    ('Number of second instance decisions', len(decisions)),
    ('Of which upheld', len(upheld)),
    ('Of which rejected', len(rejected)),
    ('Other outcomes', len(decisions) - len(upheld) - len(rejected)),
)
for caption, count in decision_counts:
    print(caption, count)

Exploring request IDs, claims and arguments to get a full picture of their interconnections shows that some values can be `None` and tag names are not always consistent. Malformed statements could perhaps be repaired manually, but they are simply ignored for now.

In [None]:
# Distinct identifiers carried by second-instance requests
reqs, _ = data.findall(second_instance_samples, ".//partreq[@G='2']/req")
print('Request IDs', {element.get('ID') for element in reqs}, '\n')

# Distinct PRO attributes carried by second-instance claims
claims, _ = data.findall(second_instance_samples, ".//partreq[@G='2']/claim")
print('Claim PROs', {element.get('PRO') for element in claims}, '\n')

# Distinct PRO attributes carried by second-instance arguments
args, _ = data.findall(second_instance_samples, ".//partreq[@G='2']/arg")
print('Arg PROs', {element.get('PRO') for element in args}, '\n')


Some decisions reference multiple requests. A decision that implies the result of multiple requests may later be split.

Some decisions reference claims. This shall be investigated (do they imply a label for some given requests?). The number is relatively small, they could be ignored. 

In [None]:
# The 'O' attribute of each decision holds the element(s) it refers to
objects = tuple(decision.get('O') for decision in decisions)
print(set(objects))

# Anything that does not start with 'Req' is counted as a claim object
print('Number of claim objects: ',
      sum(1 for target in objects if not target.startswith('Req')))

Given the very low number of outcomes other than upheld or rejected, the problem will be treated as binary classification. Labels are not perfectly balanced.

In [None]:
# Distribution of the raw 'E' (outcome) attribute over all decisions
plt.hist(tuple(decision.get('E') for decision in decisions))

Other outcomes can be dropped.

In [None]:
# Keep only the decisions with a binary outcome.
decisions = upheld + rejected

# Keep each sample that owns at least one retained decision exactly once.
# `dict.fromkeys` de-duplicates like a set but preserves first-seen order:
# a plain `set` would order the samples by their id-based hash, which can
# change between kernel runs and would silently change what positional
# indexing into the derived lists (e.g. `graphs[2]`) refers to.
second_instance_samples = list(dict.fromkeys(
    decision_mapping[decision] for decision in decisions))

Data will need some processing and handling of special cases. Some values can be `None` (empty tags?).

In [None]:
# Show the text of the first ten decisions, each followed by a blank line
decision_preview = decisions[:10]
for decision in decision_preview:
    print(decision.text, '\n')

Some elements have multiple links. It is particularly interesting when it happens in decision tags, as each decision-request pair can form a new data sample. Iterative filtering and extraction of multiple links makes it possible to explore the whole "tree" of connections between nodes in the document, starting from decision tags and ending wherever it is decided.
A minimal set of features would be: request, arguments and claim tags, labeled by the corresponding decision tag, as defined by [Galli et. al, 2022].

In [None]:
# A '|' inside the 'O' attribute marks a decision linked to several elements
multiple_link_decisions = tuple(
    decision for decision in decisions if '|' in decision.get('O')
)
print('Number of multilinked decisions', len(multiple_link_decisions))

## Graph tag representation

To facilitate the task of composing different feature sets based on how they are linked, each document is flattened into a set of triples, then used to build a graph.

In [None]:
# One dataframe of tag triples per sample, in sample order
triples_dfs = list(map(data.build_tag_triples, second_instance_samples))

In [None]:
# Inspect the triples built for the first sample
first_triples = triples_dfs[0]
print(first_triples.shape)
first_triples.head()

A rapid check for null values (the builder function comes without warranty)

In [None]:
# One flag per dataframe: True when it contains at least one null value
check_null_list = []
for triples_frame in triples_dfs:
    check_null_list.append(triples_frame.isnull().values.any())

print(any(check_null_list))

In [None]:
# Build one directed graph per triples dataframe; the 'edge' column is kept
# as an edge attribute.
graphs = []
for triples in triples_dfs:
    directed_graph = nx.from_pandas_edgelist(triples, edge_attr='edge',
                                             create_using=nx.DiGraph())
    graphs.append(directed_graph)
len(graphs)

In [None]:
# Work on a copy so that the graph stored in `graphs` is left untouched
graph = graphs[2].copy()

# Components of two nodes or fewer could additionally be pruned here as
# uninformative; that step is currently disabled.

# Drop "find" nodes for a cleaner representation
find_nodes = [node for node in graph.nodes
              if node.lower().startswith('find')]
graph.remove_nodes_from(find_nodes)

remaining_components = tuple(nx.connected_components(graph.to_undirected()))
print('Connected components', remaining_components)

# Draw labels only (zero-size nodes) on a planar embedding
graph_pos = nx.planar_layout(graph)
nx.draw(graph, pos=graph_pos, with_labels=True, node_size=0, font_size=10,
        arrowsize=7)

In [None]:
# Tag-name prefixes whose texts become feature columns, in column order.
PREFIX = 'req', 'arg', 'claim'
REQ_PREFIX_INDEX = PREFIX.index('req')
df_list = []

for graph, document in zip(graphs, second_instance_samples):
    # The fact section belongs to the whole document, so it is looked up once
    # per document rather than once per component. A missing or empty tag
    # yields '' so that the column never mixes strings and None.
    fact_element = document.find('.//fact')
    fact = ''
    if fact_element is not None and fact_element.text is not None:
        fact = fact_element.text

    for component in nx.connected_components(graph.to_undirected()):
        # Components are sets of node names: sorting them makes both the
        # decision that is picked and the order of concatenated texts
        # reproducible from one run to the next.
        nodes = sorted(component)

        # Look for the first decision node carrying a usable binary outcome.
        take = False
        label = -1
        for node in nodes:
            if not node.lower().startswith('dec'):
                continue

            dec = document.find(f".//*[@ID='{node}']")
            if dec is None:
                # Node name without a matching element in the document
                continue

            try:
                # Missing attributes make `get` return None (TypeError),
                # malformed ones raise ValueError: both are skipped.
                outcome = int(dec.get('E'))
                grade = int(dec.get('G'))
            except (TypeError, ValueError):
                continue
            if outcome not in (0, 1):
                continue

            label = outcome
            # Only components labeled by a second-instance decision are kept
            take = grade == 2
            break

        if not take:
            continue

        # Collect the texts of the component's nodes, one list per prefix.
        concat_lists = [[] for _ in PREFIX]
        for node in nodes:
            for i, prefix in enumerate(PREFIX):
                if not node.lower().startswith(prefix):
                    continue
                node_element = document.find(f".//*[@ID='{node}']")
                if node_element is not None and node_element.text is not None:
                    concat_lists[i].append(node_element.text)

        # One row per request: the request column holds that single request,
        # the other columns hold every text of the component joined by spaces.
        joined_texts = [' '.join(texts) for texts in concat_lists]
        for req_text in concat_lists[REQ_PREFIX_INDEX]:
            row = [fact, *joined_texts, label]
            row[1 + REQ_PREFIX_INDEX] = req_text
            df_list.append(row)

df = pd.DataFrame(df_list, columns=['fact', *PREFIX, 'label'])
print(df.shape)

In [None]:
# Preview the first rows of the assembled dataset.
df.head()

# References


[Galli et. al, 2022]: Galli, F., Grundler, G., Fidelangeli, A., Galassi, A., Lagioia, F., Palmieri, E., Ruggeri, F., Sartor, G., & Torroni, P. (2022). Predicting Outcomes of Italian VAT Decisions. *Frontiers in Artificial Intelligence and Applications*. https://doi.org/10.3233/faia220465   