In [1]:
from parse_EU_act import parse_EU_act

# Parse the AI Act draft PDF with the project helper; the result is handed to
# filter_text() in a later cell.
# NOTE(review): the path is relative to the notebook's working directory.
ai_act = parse_EU_act("ai_act/ai-act-draft.pdf")

In [2]:
# Display the parser's return value to eyeball the extraction.
ai_act



In [3]:
import re
from auto_label_function import set_to_lower, filter_text, paragraph_to_labeled_sentences, better_studio

set_to_lower()

# Fragments with this many characters or fewer (after stripping) are treated
# as splitting debris and discarded.
MIN_SENTENCE_CHARS = 7

ai_act = filter_text(ai_act)
ai_act = ai_act.replace("\n", " ")
# Collapse every run of spaces to a single space. A one-shot
# replace("  ", " ") only halves each run, so three or more consecutive
# spaces would still leave a double space behind.
ai_act = re.sub(r" {2,}", " ", ai_act)

# NOTE(review): splitting on every "." and ";" also cuts at abbreviations and
# numbering; kept as-is because the downstream cells expect this segmentation.
# No further per-sentence clean-up is needed: newlines were replaced above and
# strip() already removes leading/trailing whitespace.
sentences = [
    fragment.strip()
    for fragment in re.split(r"[.;]", ai_act)
    if len(fragment.strip()) > MIN_SENTENCE_CHARS
]
sentences[:5]

['INTRODUCTION The Commission adopted the proposal for a Regulation laying down harmonised rules on artificial intelligence (Artificial Intelligence Act, hereinafter: the AI Act) on 21 April 2021',
 'The Council unanimously adopted its General Approach on the proposal on 6 December 2022, while the European Parliament (hereinafter: the EP) confirmed its position in a plenary vote on 14 June 2023',
 'On 14 June 2023, 18 July 2023, 2-3 October 2023 and 24 October 2023 the first four political trilogues were held, during which some of the less controversial parts of the proposal were agreed and compromise was also found on the provisions concerning measures in support of innovation, as well as and on the mechanism for classification of AI systems as high-risk',
 'Moreover, during those initial trilogues the co-legislators explored potential landing zones with regard to the remaining issues, in particular the regulation of general purpose AI models and systems, governance, as well the prohi

In [4]:
# Pick the longest sentence by character count (max() returns the first one
# on ties) and print it for inspection.
long_sent = max(sentences, key=len)
print(long_sent)

_______________________ ANNEX 2021/0106 (COD) Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL LAYING DOWN HARMONISED RULES ON ARTIFICIAL INTELLIGENCE (ARTIFICIAL INTELLIGENCE ACT) AND AMENDING CERTAIN UNION LEGISLATIVE ACTS THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION, Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof, Having regard to the proposal from the European Commission, After transmission of the draft legislative act to the national parliaments, Having regard to the opinion of the European Economic and Social Committee1, Having regard to the opinion of the European Central Bank2, Having regard to the joint opinion of the European Data Protection Board and the European Data Protection Supervisor, Having regard to the opinion of the Committee of the Regions3, Acting in accordance with the ordinary legislative procedure, Whereas: (1) The purpose of this Regulation is to impro

In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer and masked-LM weights are loaded from the same checkpoint id.
ROBERTA_CHECKPOINT = "FacebookAI/roberta-large"

tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(ROBERTA_CHECKPOINT)

In [6]:
# Sliding Window
def sliding_window(sent_array, window_size, stride):
    """Cut a sequence into (possibly overlapping) windows.

    Windows start at offsets 0, stride, 2 * stride, ... for as long as the
    offset lies inside the sequence. Each window holds up to ``window_size``
    items; windows near the end are shorter because slicing stops at the end
    of the sequence.

    Args:
        sent_array: Any sliceable sequence (list, str, ...).
        window_size: Maximum number of items per window; must be >= 1.
        stride: Distance between consecutive window starts; must be >= 1.

    Returns:
        A list of slices of ``sent_array``; empty when the input is empty.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is smaller than 1. A
            non-positive stride would otherwise never advance and loop
            forever.
    """
    # TODO: Add in tokenizer check
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    # Slicing clamps at the end of the sequence, so no explicit min() is needed.
    return [
        sent_array[start:start + window_size]
        for start in range(0, len(sent_array), stride)
    ]

In [7]:
# Tokenize the longest sentence and show how many tokens it produces.
long_sent_tokenized = tokenizer.tokenize(long_sent)
len(long_sent_tokenized)

330

In [8]:
# Split sentences into paragraphs of max 512 tokens
MAX_PARAGRAPH_TOKENS = 512


def pack_sentences_into_paragraphs(sentences, count_tokens, max_tokens=MAX_PARAGRAPH_TOKENS):
    """Greedily join consecutive sentences into paragraphs under a token budget.

    Every sentence gets a trailing "." and is joined to the paragraph being
    built with a single space. A sentence is accepted while the whole
    paragraph still measures strictly fewer than ``max_tokens`` tokens;
    otherwise the paragraph is closed and the sentence opens the next one.

    Args:
        sentences: Iterable of sentence strings without their final period.
        count_tokens: Callable returning the token count of a string.
        max_tokens: Exclusive upper bound on a paragraph's token count.

    Returns:
        List of paragraph strings, including the last, partially filled one.
        A sentence that reaches the bound on its own is emitted as a
        single-sentence paragraph instead of stalling the loop, so such a
        paragraph can exceed the bound.
    """
    packed = []
    current = ""
    for sentence in sentences:
        piece = sentence + "."
        candidate = f"{current} {piece}" if current else piece
        if count_tokens(candidate) < max_tokens:
            current = candidate
            continue
        # Budget reached: close the paragraph built so far (if any) and let
        # this sentence start the next one.
        if current:
            packed.append(current)
        current = piece
    # Flush the trailing paragraph, which is never closed by an overflow.
    if current:
        packed.append(current)
    return packed


# NOTE(review): the count comes from tokenizer.tokenize(), which presumably
# excludes special tokens - confirm whether the downstream model needs
# headroom below 512.
paragraphs = pack_sentences_into_paragraphs(
    sentences, lambda text: len(tokenizer.tokenize(text))
)
paragraphs

Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors


['INTRODUCTION The Commission adopted the proposal for a Regulation laying down harmonised rules on artificial intelligence (Artificial Intelligence Act, hereinafter: the AI Act) on 21 April 2021. The Council unanimously adopted its General Approach on the proposal on 6 December 2022, while the European Parliament (hereinafter: the EP) confirmed its position in a plenary vote on 14 June 2023. On 14 June 2023, 18 July 2023, 2-3 October 2023 and 24 October 2023 the first four political trilogues were held, during which some of the less controversial parts of the proposal were agreed and compromise was also found on the provisions concerning measures in support of innovation, as well as and on the mechanism for classification of AI systems as high-risk. Moreover, during those initial trilogues the co-legislators explored potential landing zones with regard to the remaining issues, in particular the regulation of general purpose AI models and systems, governance, as well the prohibitions a

In [9]:
# Token count of every paragraph; the maximum shows how close the largest
# paragraph comes to the limit.
length_in_tokens = [len(tokens) for tokens in map(tokenizer.tokenize, paragraphs)]
max(length_in_tokens)

511

In [10]:
# Run the auto-labelling helpers on each paragraph: the return value of
# paragraph_to_labeled_sentences() is unpacked as the positional arguments of
# better_studio().
auto_ner_output = []
for paragraph in paragraphs:
    labeled_parts = paragraph_to_labeled_sentences(paragraph)
    auto_ner_output.append(better_studio(*labeled_parts))
auto_ner_output[0]

[{'id': 2,
  'data': {'text': 'INTRODUCTION The Commission adopted the proposal for a Regulation laying down harmonised rules on artificial intelligence (Artificial Intelligence Act, hereinafter: the AI Act) on 21 April 2021'},
  'predictions': [{'result': [{'value': {'start': 13,
       'end': 27,
       'text': 'The Commission',
       'labels': ['ORG']},
      'id': 'oxcjtskrewddaiyp',
      'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'origin': 'auto'},
     {'value': {'start': 55,
       'end': 65,
       'text': 'Regulation',
       'labels': ['DOC']},
      'id': 'ncymtktgxpjqoybc',
      'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'origin': 'auto'},
     {'value': {'start': 98,
       'end': 121,
       'text': 'artificial intelligence',
       'labels': ['ALG']},
      'id': 'wloanlvnbqubdlxj',
      'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'origin': 'auto'},
     {'value': {'s

In [11]:
# Filter the dataset for empty entities
# Each element of auto_ner_output is a list of task dicts. Every task and
# every prediction in it is cleaned, not only the first of each, and empty
# lists are simply skipped instead of raising IndexError.
for paragraph in auto_ner_output:
    for task in paragraph:
        for prediction in task["predictions"]:
            # Slice-assign so the existing list object is updated in place and
            # any other reference to it sees the filtered entities too.
            prediction["result"][:] = [
                entity
                for entity in prediction["result"]
                if entity["value"]["text"].strip()
            ]

In [12]:
# Spot-check one labelled paragraph after the empty-entity filter.
auto_ner_output[66]

[{'id': 2,
  'data': {'text': 'Within this framework, providers of very large online platforms and very large search engines are obliged to assess potential systemic risks stemming from the design, functioning and use of their services, including how the design of algorithmic systems used in the service may contribute to such risks, as well as systemic risks stemming from potential misuses'},
  'predictions': [{'result': [{'value': {'start': 23,
       'end': 32,
       'text': 'providers',
       'labels': ['ORG']},
      'id': 'ifsaiyvqpcbpxias',
      'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'origin': 'auto'},
     {'value': {'start': 135, 'end': 140, 'text': 'risks', 'labels': ['ETH']},
      'id': 'ewazlkzbabtdywhx',
      'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'origin': 'auto'},
     {'value': {'start': 246,
       'end': 253,
       'text': 'systems',
       'labels': ['SYS']},
      'id': 'beoiqszhaqbmvhh

In [13]:
from auto_ner_to_docred import auto_ner_to_docred

# Convert each auto-labelled paragraph with auto_ner_to_docred() and keep the
# first item of every conversion result.
docred_format = []
for labeled_paragraph in auto_ner_output:
    converted = auto_ner_to_docred(labeled_paragraph)
    docred_format.append(converted[0])
docred_format

[{'vertexSet': [[{'pos': [1, 3],
     'type': 'ORG',
     'sent_id': 0,
     'name': 'The Commission'}],
   [{'pos': [8, 9], 'type': 'DOC', 'sent_id': 0, 'name': 'Regulation'}],
   [{'pos': [14, 16],
     'type': 'ALG',
     'sent_id': 0,
     'name': 'artificial intelligence'}],
   [{'pos': [17, 19],
     'type': 'ALG',
     'sent_id': 0,
     'name': 'Artificial Intelligence'}],
   [{'pos': [24, 25], 'type': 'ALG', 'sent_id': 0, 'name': 'AI'}]],
  'sents': [['INTRODUCTION',
    'The',
    'Commission',
    'adopted',
    'the',
    'proposal',
    'for',
    'a',
    'Regulation',
    'laying',
    'down',
    'harmonised',
    'rules',
    'on',
    'artificial',
    'intelligence',
    '(',
    'Artificial',
    'Intelligence',
    'Act',
    ',',
    'hereinafter',
    ':',
    'the',
    'AI',
    'Act',
    ')',
    'on',
    '21',
    'April',
    '2021'],
   ['The',
    'Council',
    'unanimously',
    'adopted',
    'its',
    'General',
    'Approach',
    'on',
    'the',


In [14]:
def _passes_vertex_filter(document):
    """Return True when "vertexSet" has more than one entry and its first entry has more than one item."""
    vertex_set = document["vertexSet"]
    return len(vertex_set) > 1 and len(vertex_set[0]) > 1


docred_format_filtered = [doc for doc in docred_format if _passes_vertex_filter(doc)]

In [15]:
# Inspect one document that survived the vertexSet filter.
docred_format_filtered[4]

{'vertexSet': [[{'pos': [8, 9],
    'type': 'ORG',
    'sent_id': 0,
    'name': 'operator'},
   {'pos': [19, 20], 'type': 'ORG', 'sent_id': 0, 'name': 'operator'},
   {'pos': [10, 11], 'type': 'ORG', 'sent_id': 1, 'name': 'operator'},
   {'pos': [31, 32], 'type': 'ORG', 'sent_id': 1, 'name': 'operator'}],
  [{'pos': [12, 13], 'type': 'ORG', 'sent_id': 0, 'name': 'Union'},
   {'pos': [23, 24], 'type': 'ORG', 'sent_id': 0, 'name': 'Union'},
   {'pos': [13, 14], 'type': 'ORG', 'sent_id': 1, 'name': 'Union'},
   {'pos': [24, 25], 'type': 'ORG', 'sent_id': 1, 'name': 'Union'},
   {'pos': [34, 35], 'type': 'ORG', 'sent_id': 1, 'name': 'Union'},
   {'pos': [63, 64], 'type': 'ORG', 'sent_id': 1, 'name': 'Union'},
   {'pos': [19, 20], 'type': 'ORG', 'sent_id': 2, 'name': 'Union'},
   {'pos': [57, 58], 'type': 'ORG', 'sent_id': 2, 'name': 'Union'},
   {'pos': [64, 65], 'type': 'ORG', 'sent_id': 3, 'name': 'Union'},
   {'pos': [20, 21], 'type': 'ORG', 'sent_id': 5, 'name': 'Union'},
   {'pos': [

In [16]:
# First pass: drop the "labels" entry from every document (a no-op where the
# key is absent).
for document in docred_format_filtered:
    document.pop("labels", None)

# Second pass: append each document's position in the filtered list to its
# title. NOTE: the dicts are modified in place, so re-running this cell
# appends the position a second time.
for position, document in enumerate(docred_format_filtered):
    document["title"] += f", {position}"

docred_format_filtered[4]


{'vertexSet': [[{'pos': [8, 9],
    'type': 'ORG',
    'sent_id': 0,
    'name': 'operator'},
   {'pos': [19, 20], 'type': 'ORG', 'sent_id': 0, 'name': 'operator'},
   {'pos': [10, 11], 'type': 'ORG', 'sent_id': 1, 'name': 'operator'},
   {'pos': [31, 32], 'type': 'ORG', 'sent_id': 1, 'name': 'operator'}],
  [{'pos': [12, 13], 'type': 'ORG', 'sent_id': 0, 'name': 'Union'},
   {'pos': [23, 24], 'type': 'ORG', 'sent_id': 0, 'name': 'Union'},
   {'pos': [13, 14], 'type': 'ORG', 'sent_id': 1, 'name': 'Union'},
   {'pos': [24, 25], 'type': 'ORG', 'sent_id': 1, 'name': 'Union'},
   {'pos': [34, 35], 'type': 'ORG', 'sent_id': 1, 'name': 'Union'},
   {'pos': [63, 64], 'type': 'ORG', 'sent_id': 1, 'name': 'Union'},
   {'pos': [19, 20], 'type': 'ORG', 'sent_id': 2, 'name': 'Union'},
   {'pos': [57, 58], 'type': 'ORG', 'sent_id': 2, 'name': 'Union'},
   {'pos': [64, 65], 'type': 'ORG', 'sent_id': 3, 'name': 'Union'},
   {'pos': [20, 21], 'type': 'ORG', 'sent_id': 5, 'name': 'Union'},
   {'pos': [

In [17]:
import json

# Write the complete filtered dataset as 4-space-indented JSON.
with open("ai_act_docred_format.json", mode="w") as json_file:
    json.dump(docred_format_filtered, json_file, indent=4)

In [18]:
# Write only the first ten documents as a small sample file.
first_ten_documents = docred_format_filtered[:10]
with open("short_ai_act_docred_format.json", mode="w") as json_file:
    json.dump(first_ten_documents, json_file, indent=4)

In [19]:
# Number of documents left after filtering.
len(docred_format_filtered)

101

In [20]:
# Inspect the document at index 77.
docred_format_filtered[77]

{'vertexSet': [[{'pos': [1, 3],
    'type': 'ORG',
    'sent_id': 0,
    'name': 'scientific panel'},
   {'pos': [76, 78], 'type': 'ORG', 'sent_id': 0, 'name': 'scientific panel'}],
  [{'pos': [9, 10], 'type': 'ALG', 'sent_id': 0, 'name': 'AI'},
   {'pos': [36, 37], 'type': 'ALG', 'sent_id': 0, 'name': 'AI'},
   {'pos': [61, 62], 'type': 'ALG', 'sent_id': 0, 'name': 'AI'},
   {'pos': [17, 18], 'type': 'ALG', 'sent_id': 1, 'name': 'AI'},
   {'pos': [11, 12], 'type': 'ALG', 'sent_id': 2, 'name': 'AI'},
   {'pos': [12, 13], 'type': 'ALG', 'sent_id': 3, 'name': 'AI'}],
  [{'pos': [28, 29], 'type': 'ACT', 'sent_id': 0, 'name': 'enforcement'}],
  [{'pos': [31, 32], 'type': 'DOC', 'sent_id': 0, 'name': 'Regulation'}],
  [{'pos': [37, 38], 'type': 'SYS', 'sent_id': 0, 'name': 'models'},
   {'pos': [62, 63], 'type': 'SYS', 'sent_id': 0, 'name': 'models'},
   {'pos': [18, 19], 'type': 'SYS', 'sent_id': 1, 'name': 'models'},
   {'pos': [12, 13], 'type': 'SYS', 'sent_id': 2, 'name': 'models'},
   

In [23]:
# Save a single document to its own file.
# NOTE(review): this writes index 76, while the preceding display cell shows
# index 77 - confirm which document is the failing one.
failing_document = docred_format_filtered[76]
with open("failing.json", mode="w") as json_file:
    json.dump(failing_document, json_file, indent=4)