In [1]:
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [7]:
# Read the AI Act
with open("ai_act/ai-act.txt", "r") as file:
    ai_act = file.read()

# Split the AI Act into sentences
ai_act = ai_act.replace("\n", " ")
sentences = re.split(r'[.;:]', ai_act)
sentences = [s.strip() for s in sentences if len(s.strip()) > 7]
sentences[:5]

['INTRODUCTION  The  Commission  adopted  the  proposal  for  a  Regulation  laying  down  harmonised  rules  on  artificial intelligence (Artificial Intelligence Act, hereinafter',
 'the AI Act) on 21 April 2021',
 'The  Council  unanimously  adopted  its  General  Approach  on  the  proposal  on  6  December  2022, while the European Parliament (hereinafter',
 'the EP) confirmed its position in a plenary  vote on 14 June 2023',
 'On 14 June 2023, 18 July 2023, 2-3 October 2023 and 24 October 2023 the first four political  trilogues  were  held,  during  which  some  of  the  less  controversial  parts  of  the  proposal  were  agreed and compromise was also found on the provisions concerning measures in support of  innovation,  as  well  as  and  on  the  mechanism  for  classification  of  AI  systems  as  high-risk']

`sentences` is a list of strings

In [2]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
examples = ["My name is Wolfgang and I live in Berlin",
            "My name is Dylan and I live in Dublin"]


ner_results = nlp(examples)
ner_results

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[{'entity': 'B-PER',
   'score': 0.9971501,
   'index': 4,
   'word': 'Wolfgang',
   'start': 11,
   'end': 19},
  {'entity': 'B-LOC',
   'score': 0.9986046,
   'index': 9,
   'word': 'Berlin',
   'start': 34,
   'end': 40}],
 [{'entity': 'B-PER',
   'score': 0.99836797,
   'index': 4,
   'word': 'Dylan',
   'start': 11,
   'end': 16},
  {'entity': 'B-LOC',
   'score': 0.9990356,
   'index': 9,
   'word': 'Dublin',
   'start': 31,
   'end': 37}]]

In [3]:
tokens = [sentences.split() for sentences in examples]
tokens

[['My', 'name', 'is', 'Wolfgang', 'and', 'I', 'live', 'in', 'Berlin'],
 ['My', 'name', 'is', 'Dylan', 'and', 'I', 'live', 'in', 'Dublin']]

`ner_results` has an outer list (containing each sentence). Each sentence is itself a list of entities. Each entity is represented as a dictionary.

In [15]:
from datasets import load_dataset

docred = load_dataset("docred")

In [26]:
docred["train_annotated"][0]

{'title': 'AirAsia Zest',
 'sents': [['Zest',
   'Airways',
   ',',
   'Inc.',
   'operated',
   'as',
   'AirAsia',
   'Zest',
   '(',
   'formerly',
   'Asian',
   'Spirit',
   'and',
   'Zest',
   'Air',
   ')',
   ',',
   'was',
   'a',
   'low',
   '-',
   'cost',
   'airline',
   'based',
   'at',
   'the',
   'Ninoy',
   'Aquino',
   'International',
   'Airport',
   'in',
   'Pasay',
   'City',
   ',',
   'Metro',
   'Manila',
   'in',
   'the',
   'Philippines',
   '.'],
  ['It',
   'operated',
   'scheduled',
   'domestic',
   'and',
   'international',
   'tourist',
   'services',
   ',',
   'mainly',
   'feeder',
   'services',
   'linking',
   'Manila',
   'and',
   'Cebu',
   'with',
   '24',
   'domestic',
   'destinations',
   'in',
   'support',
   'of',
   'the',
   'trunk',
   'route',
   'operations',
   'of',
   'other',
   'airlines',
   '.'],
  ['In',
   '2013',
   ',',
   'the',
   'airline',
   'became',
   'an',
   'affiliate',
   'of',
   'Philippines',
   'A

# DocRED Format
Each element is a dictionary with the following keys:
- `title`: A short (few word) summary of the passage
- `sents`: A list of sentences (list of lists of words, where a sentence is a list of words)
- `vertexSet`: A list of entities, each entity has a list of its appearances in the passage (represented as a dictionary)

In [11]:
# TODO: Fix multi-word entities

def fix_labels(my_str):
    if my_str == "I-LOC" or my_str == "B-LOC":
        return "LOC"
    if my_str == "I-ORG" or my_str == "B-ORG":
        return "ORG"
    if my_str == "I-PER" or my_str == "B-PER":
        return "PER"
    return "MISC"


ner_json_format = [
    {"title": "Legal text about AI", 
     "sents": [tokens[i]], 
     "vertexSet": [[{
         "name": entity["word"], 
         "sent_id": 0, 
         "pos": [entity["start"], entity["start"] + len(entity["word"].split())], 
         "type": fix_labels(entity["entity"])}] for entity in ner_results[i]],
     "labels": {"head": [], "tail": [], "relation_id": [], "relation_text": [], "evidence": []}} for i in range(len(examples))]
ner_json_format

[{'title': 'Legal text about AI',
  'sents': [['My',
    'name',
    'is',
    'Wolfgang',
    'and',
    'I',
    'live',
    'in',
    'Berlin']],
  'vertexSet': [[{'name': 'Wolfgang',
     'sent_id': 0,
     'pos': [11, 12],
     'type': 'PER'}],
   [{'name': 'Berlin', 'sent_id': 0, 'pos': [34, 35], 'type': 'LOC'}]],
  'labels': {'head': [],
   'tail': [],
   'relation_id': [],
   'relation_text': [],
   'evidence': []}},
 {'title': 'Legal text about AI',
  'sents': [['My', 'name', 'is', 'Dylan', 'and', 'I', 'live', 'in', 'Dublin']],
  'vertexSet': [[{'name': 'Dylan',
     'sent_id': 0,
     'pos': [11, 12],
     'type': 'PER'}],
   [{'name': 'Dublin', 'sent_id': 0, 'pos': [31, 32], 'type': 'LOC'}]],
  'labels': {'head': [],
   'tail': [],
   'relation_id': [],
   'relation_text': [],
   'evidence': []}}]

In [41]:
import json

with open("ner_output.json", 'w') as file:
    json.dump(ner_json_format, file, indent=4)