# Testing REBEL-large on the AI Act

In [14]:
from transformers import pipeline
from rebel_re_model import extract_triplets

In [15]:
# Build the REBEL-large relation-extraction pipeline. The model weights and
# the tokenizer are both loaded from the same checkpoint name.
triplet_extractor = pipeline(
    'text2text-generation',
    model='Babelscape/rebel-large',
    tokenizer='Babelscape/rebel-large',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# Load the sample AI Act excerpt used by the first experiments.
# The encoding is passed explicitly so the text is decoded the same way on
# every platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the file is saved as UTF-8 — adjust if it is not.
with open('sample_knowledge_graph_section.txt', 'r', encoding='utf-8') as file:
    ai_act_string = file.read()

# Echo the loaded text as the cell output.
ai_act_string

'A variety of AI systems can generate large quantities of synthetic content that becomes increasingly hard for humans to distinguish from human-generated and authentic content. The wide availability and increasing capabilities of those systems have a significant impact on the integrity and trust in the information ecosystem, raising new risks of misinformation and manipulation at scale, fraud, impersonation and consumer deception. In the light of those impacts, the fast technological pace and the need for new methods and techniques to trace origin of information, it is appropriate to require providers of those systems to embed technical solutions that enable marking in a machine readable format and detection that the output has been generated or manipulated by an AI system and not a human. Such techniques and methods should be sufficiently reliable, interoperable, effective and robust as far as this is technically feasible, taking into account available techniques or a combination of s

## Testing on the sample AI Act paragraph

In [17]:
# We need the special tokens (<triplet>, <subj>, <obj>) in the output, so ask
# the pipeline for raw token ids and decode them with the tokenizer manually.
generation = triplet_extractor(ai_act_string, return_tensors=True, return_text=False)
token_ids = generation[0]["generated_token_ids"]
extracted_text = triplet_extractor.tokenizer.batch_decode([token_ids])

# Show the single decoded sequence.
extracted_text[0]

'<s><triplet> impersonation <subj> fraud <obj> subclass of</s>'

## Testing on a single sentence from the AI Act

In [20]:
# A much shorter input: one sentence of the AI Act.
ai_act_one_sentence = "A variety of AI systems can generate large quantities of synthetic content that becomes increasingly hard for humans to distinguish from human-generated and authentic content."

# Generate token ids and decode them manually so the special tokens are kept.
generation = triplet_extractor(ai_act_one_sentence, return_tensors=True, return_text=False)
token_ids = generation[0]["generated_token_ids"]
extracted_text = triplet_extractor.tokenizer.batch_decode([token_ids])
extracted_text[0]

'<s><triplet> synthetic content <subj> authentic content <obj> opposite of <triplet> authentic content <subj> synthetic content <obj> opposite of</s>'

## Testing on two sentences from the AI Act

In [23]:
# A medium-length input taken from the start of the sample paragraph.
# NOTE: despite the variable name, this string contains two sentences.
ai_act_three_sentences = "A variety of AI systems can generate large quantities of synthetic content that becomes increasingly hard for humans to distinguish from human-generated and authentic content. The wide availability and increasing capabilities of those systems have a significant impact on the integrity and trust in the information ecosystem, raising new risks of misinformation and manipulation at scale, fraud, impersonation and consumer deception."

# Generate token ids and decode them manually so the special tokens are kept.
generation = triplet_extractor(ai_act_three_sentences, return_tensors=True, return_text=False)
token_ids = generation[0]["generated_token_ids"]
extracted_text = triplet_extractor.tokenizer.batch_decode([token_ids])
extracted_text[0]

'<s><triplet> impersonation <subj> fraud <obj> subclass of</s>'

# First Impressions

The model seems to only output one or two relations, regardless of the input length.

## Passing in the sample paragraph one sentence at a time

In [26]:
# Naive sentence segmentation: cut the sample paragraph at every full stop.
sentences = ai_act_string.split(sep=".")

# The last piece (index 7) is an empty string. It is kept on purpose so we can
# see what the model generates when it is given empty input.
sentences[7]

''

In [24]:
# Run the extractor on each piece separately and print the raw decoded
# sequence (special tokens included), one line per input.
for sentence in sentences:
    generation = triplet_extractor(sentence, return_tensors=True, return_text=False)
    token_ids = generation[0]["generated_token_ids"]
    extracted_text = triplet_extractor.tokenizer.batch_decode([token_ids])
    print(extracted_text[0])

<s><triplet> synthetic content <subj> authentic content <obj> opposite of <triplet> authentic content <subj> synthetic content <obj> opposite of</s>
<s><triplet> impersonation <subj> fraud <obj> subclass of</s>
<s><triplet> AI <subj> AI system <obj> studies <triplet> AI system <subj> AI <obj> studied by</s>
<s><triplet> watermark <subj> metadata <obj> subclass of</s>
<s><triplet> state-of-the-art <subj> technological <obj> instance of</s>
<s><triplet> AI model <subj> model <obj> subclass of</s>
<s><triplet> assistive function <subj> AI systems <obj> subclass of</s>
<s><triplet> World War I <subj> World War II <obj> followed by <triplet> World War II <subj> World War I <obj> follows</s>


# Hallucinations
The model hallucinates relations when it is passed an empty string. For example, for the empty last element of the split above, the model output the triplet (World War I, followed by, World War II).

# Passing in longer sections (approx 10 pages) of the AI Act 

In [27]:
# Load the longer AI Act excerpt (roughly ten pages) and flatten it to a
# single line: every newline becomes a space so the text can be split into
# sentences afterwards.
# The encoding is passed explicitly so the text is decoded the same way on
# every platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the file is saved as UTF-8 — adjust if it is not.
with open("ai-act-10-pages.txt", "r", encoding="utf-8") as file:
    long_ai_act_string = file.read().replace("\n", " ")

# Echo the flattened text as the cell output.
long_ai_act_string

'1. High-risk AI systems shall comply with the requirements established in this Chapter, taking into account its intended purpose as well as the generally acknowledged state of the art on AI and AI related technologies. The risk management system referred to in Article 9 shall be taken into account when ensuring compliance with those requirements. 2a. Where a product contains an artificial intelligence system, to which the requirements of this Regulation as well as requirements of the Union harmonisation legislation listed in Annex II, Section A apply, providers shall be responsible for ensuring that their product is fully compliant with all applicable requirements required under the Union harmonisation legislation. In ensuring the compliance of high-risk AI systems referred in paragraph 1 with the requirements set out in Chapter 2 of this Title, and in order to ensure consistency, avoid duplications and minimise additional burdens, providers shall have a choice to integrate, as approp

In [33]:
# Rough sentence segmentation: break the text wherever a full stop is followed
# by a space. Paragraph numbering such as "1" or "2a" ends up as its own
# fragment.
preprocessed_sentences = long_ai_act_string.split(sep=". ")
preprocessed_sentences

['1',
 'High-risk AI systems shall comply with the requirements established in this Chapter, taking into account its intended purpose as well as the generally acknowledged state of the art on AI and AI related technologies',
 'The risk management system referred to in Article 9 shall be taken into account when ensuring compliance with those requirements',
 '2a',
 'Where a product contains an artificial intelligence system, to which the requirements of this Regulation as well as requirements of the Union harmonisation legislation listed in Annex II, Section A apply, providers shall be responsible for ensuring that their product is fully compliant with all applicable requirements required under the Union harmonisation legislation',
 'In ensuring the compliance of high-risk AI systems referred in paragraph 1 with the requirements set out in Chapter 2 of this Title, and in order to ensure consistency, avoid duplications and minimise additional burdens, providers shall have a choice to inte

In [34]:
# Drop fragments of 7 characters or fewer (e.g. stray paragraph numbers) and
# keep everything longer. This rebinds `sentences` to the filtered fragments
# of the long excerpt.
sentences = list(filter(lambda fragment: len(fragment) > 7, preprocessed_sentences))
sentences

['High-risk AI systems shall comply with the requirements established in this Chapter, taking into account its intended purpose as well as the generally acknowledged state of the art on AI and AI related technologies',
 'The risk management system referred to in Article 9 shall be taken into account when ensuring compliance with those requirements',
 'Where a product contains an artificial intelligence system, to which the requirements of this Regulation as well as requirements of the Union harmonisation legislation listed in Annex II, Section A apply, providers shall be responsible for ensuring that their product is fully compliant with all applicable requirements required under the Union harmonisation legislation',
 'In ensuring the compliance of high-risk AI systems referred in paragraph 1 with the requirements set out in Chapter 2 of this Title, and in order to ensure consistency, avoid duplications and minimise additional burdens, providers shall have a choice to integrate, as app

In [51]:
# Takes about 2 mins to run
# Extract relations from the long excerpt one sentence at a time.
# `extract_triplets` is already imported in the notebook's first cell, so it
# is not imported again here.
triplets = []

for sentence in sentences:
    # Ask for raw token ids and decode them manually so the special tokens
    # survive in the decoded text handed to extract_triplets.
    generation = triplet_extractor(sentence, return_tensors=True, return_text=False)
    token_ids = generation[0]["generated_token_ids"]
    extracted_text = triplet_extractor.tokenizer.batch_decode([token_ids])
    # One entry per sentence; judging by the output, each entry is a list of
    # {'head', 'type', 'tail'} dicts.
    triplets.append(extract_triplets(extracted_text[0]))

triplets

[[{'head': 'AI', 'type': 'studied by', 'tail': 'AI related technologies'},
  {'head': 'AI related technologies', 'type': 'studies', 'tail': 'AI'}],
 [{'head': 'risk management', 'type': 'subclass of', 'tail': 'system'}],
 [{'head': 'Union harmonisation legislation',
   'type': 'has part',
   'tail': 'Annex II, Section A'},
  {'head': 'Annex II, Section A',
   'type': 'part of',
   'tail': 'Union harmonisation legislation'},
  {'head': 'Annex II, Section A',
   'type': 'part of',
   'tail': 'Union harmonisation legislation'},
  {'head': 'Union harmonisation legislation',
   'type': 'has part',
   'tail': 'Annex II, Section A'}],
 [{'head': 'Union harmonisation legislation',
   'type': 'has part',
   'tail': 'Annex II, Section A'},
  {'head': 'Annex II, Section A',
   'type': 'part of',
   'tail': 'Union harmonisation legislation'}],
 [{'head': 'Risk management system',
   'type': 'use',
   'tail': 'Risk management'}],
 [{'head': 'AI systems', 'type': 'use', 'tail': 'risk management'}],


In [57]:
# `triplets` is a list of lists of dictionaries: one inner list per sentence.
# Keep only the first dictionary of each inner list. Sentences for which no
# relation was extracted (empty inner list) are skipped instead of raising
# IndexError.
triplets2 = [sentence_triplets[0] for sentence_triplets in triplets if sentence_triplets]
triplets2

[{'head': 'AI', 'type': 'studied by', 'tail': 'AI related technologies'},
 {'head': 'risk management', 'type': 'subclass of', 'tail': 'system'},
 {'head': 'Union harmonisation legislation',
  'type': 'has part',
  'tail': 'Annex II, Section A'},
 {'head': 'Union harmonisation legislation',
  'type': 'has part',
  'tail': 'Annex II, Section A'},
 {'head': 'Risk management system', 'type': 'use', 'tail': 'Risk management'},
 {'head': 'AI systems', 'type': 'use', 'tail': 'risk management'},
 {'head': 'continuous iterative process',
  'type': 'facet of',
  'tail': 'risk management system'},
 {'head': 'health, safety or fundamental rights',
  'type': 'has part',
  'tail': 'Article 61'},
 {'head': 'design', 'type': 'part of', 'tail': 'development'},
 {'head': 'minimising risks',
  'type': 'subclass of',
  'tail': 'risk management'},
 {'head': 'hazard', 'type': 'studied by', 'tail': 'risk management'},
 {'head': 'risk management', 'type': 'studies', 'tail': 'risk'},
 {'head': 'training', 'typ

# Visualising the extracted relations

In [59]:
import networkx as nx
from pyvis.network import Network


# Create a directed graph: every triplet becomes an edge head -> tail that
# carries the relation type as its label.
# A DiGraph stores a single edge per (head, tail) pair, so if the same pair
# occurs again the label added last is the one that is kept.
G = nx.DiGraph()

for item in triplets2:
    G.add_edge(item["head"], item["tail"], label=item["type"])

# Initialize PyVis network. directed=True makes PyVis draw arrowheads;
# without it the edges of the DiGraph are rendered as undirected lines and
# the direction of each relation is lost.
net = Network(notebook=False, height="750px", width="100%", directed=True)
net.from_nx(G)

# Customize the visualization: expose the physics controls and enable physics.
net.show_buttons(filter_=['physics'])
net.toggle_physics(True)

# Generate and display the interactive graph
net.show("knowledge_graph.html")
