In [1]:
from datasets import load_dataset

In [2]:
# Load the Few-NERD dataset ("DFKI-SLT/few-nerd") using its "inter" configuration.
# The result is indexed by split name in the cells that follow.
dataset = load_dataset("DFKI-SLT/few-nerd", "inter")

Shape of train, validation and test sets

In [3]:
# Training split; `.shape` reports (number of rows, number of columns).
train = dataset["train"]
train.shape

(130112, 4)

In [4]:
# Validation split; `.shape` reports (number of rows, number of columns).
val = dataset["validation"] 
val.shape

(18817, 4)

In [5]:
# Test split; `.shape` reports (number of rows, number of columns).
test = dataset["test"] 
test.shape

(14007, 4)

NER and fine NER tags

In [6]:
# Names of the coarse-grained labels. The integer ids stored in the "ner_tags"
# column are positions in this list (index 0 is 'O', i.e. no entity).
ner_tags = train.features["ner_tags"].feature.names
ner_tags

['O',
 'art',
 'building',
 'event',
 'location',
 'organization',
 'other',
 'person',
 'product']

In [7]:
# Look up the integer id of a coarse tag from its name:
# the id is simply the position of the name in `ner_tags`.
index_person = ner_tags.index("person")
index_person

7

In [8]:
# Names of the fine-grained labels, written as "<coarse>-<fine>" (e.g. 'art-film').
# The integer ids stored in the "fine_ner_tags" column are positions in this list.
fine_ner_tags = train.features["fine_ner_tags"].feature.names
fine_ner_tags

['O',
 'art-broadcastprogram',
 'art-film',
 'art-music',
 'art-other',
 'art-painting',
 'art-writtenart',
 'building-airport',
 'building-hospital',
 'building-hotel',
 'building-library',
 'building-other',
 'building-restaurant',
 'building-sportsfacility',
 'building-theater',
 'event-attack/battle/war/militaryconflict',
 'event-disaster',
 'event-election',
 'event-other',
 'event-protest',
 'event-sportsevent',
 'location-GPE',
 'location-bodiesofwater',
 'location-island',
 'location-mountain',
 'location-other',
 'location-park',
 'location-road/railway/highway/transit',
 'organization-company',
 'organization-education',
 'organization-government/governmentagency',
 'organization-media/newspaper',
 'organization-other',
 'organization-politicalparty',
 'organization-religion',
 'organization-showorganization',
 'organization-sportsleague',
 'organization-sportsteam',
 'other-astronomything',
 'other-award',
 'other-biologything',
 'other-chemicalthing',
 'other-currency',
 'o

In [9]:
# Same name -> id lookup as for the coarse tags, but against the fine-grained label list.
index_event_election = fine_ner_tags.index("event-election")
index_event_election

17

Closer look at the (training) dataset

In [10]:
# Inspect one complete training record (row 10): id, tokens and both tag sequences.
train[10]

{'id': '10',
 'tokens': ['The',
  'City',
  'of',
  'Bradenton',
  'talked',
  'A',
  "'s",
  'owner',
  'Charlie',
  'Finley',
  'into',
  'staying',
  'at',
  'McKechnie',
  'until',
  '.'],
 'ner_tags': [0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0],
 'fine_ner_tags': [0, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 0]}

In [11]:
# Identifier of the record.
train[10].get("id")

'10'

In [12]:
# The sentence as a list of token strings.
train[10].get("tokens")

['The',
 'City',
 'of',
 'Bradenton',
 'talked',
 'A',
 "'s",
 'owner',
 'Charlie',
 'Finley',
 'into',
 'staying',
 'at',
 'McKechnie',
 'until',
 '.']

In [13]:
# Coarse tag ids, one per token (0 corresponds to 'O', i.e. no entity).
train[10].get("ner_tags")

[0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0]

In [14]:
# Fine-grained tag ids, one per token (0 corresponds to 'O', i.e. no entity).
train[10].get("fine_ner_tags")

[0, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 0]

In [15]:
# Decode the coarse tag ids of record 10 back into label names.
train_10_ner_tags = train[10].get("ner_tags")
print(train_10_ner_tags)

# One label name per line, in token order.
for tag_id in train_10_ner_tags:
    tag_name = ner_tags[tag_id]
    print(tag_name)

[0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0]
O
location
location
location
O
O
O
O
O
O
O
O
O
organization
O
O


In [16]:
# Decode the fine-grained tag ids of record 10 back into label names.
train_10_fine_ner_tags = train[10].get("fine_ner_tags")
print(train_10_fine_ner_tags)

# One label name per line, in token order.
for fine_tag_id in train_10_fine_ner_tags:
    fine_tag_name = fine_ner_tags[fine_tag_id]
    print(fine_tag_name)

[0, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 0]
O
location-GPE
location-GPE
location-GPE
O
O
O
O
O
O
O
O
O
organization-sportsleague
O
O


Keeping only the sentences whose entities all belong to a single coarse category (e.g. to work with the fine-grained NER tags of that category)

In [17]:
# Keep only the sentences whose entity tokens are all tagged "person"
# (non-entity 'O' tokens are ignored). This is the id of the tag to keep.
ner_tag_index = ner_tags.index("person")

def filter_only_person(example):
    """Tell whether a sentence's entities all belong to the target coarse tag.

    The target tag id is read from the module-level ``ner_tag_index``.

    Args:
        example: A dataset record; only ``example["ner_tags"]`` (a sequence of
            integer coarse tag ids, one per token) is read.

    Returns:
        bool: True if the sentence has at least one entity token and every
        entity token carries the target tag; False if any entity token has a
        different tag or if the sentence has no entity tokens at all.
    """
    # Initialise the same flag that is returned below, so that sentences made
    # only of 'O' tokens (or with no tokens) yield False instead of raising
    # UnboundLocalError.
    has_person = False
    for tag in example["ner_tags"]:
        # skip non-entity tokens
        if tag == 0:
            continue
        # any entity of another category disqualifies the sentence
        if tag != ner_tag_index:
            return False
        has_person = True
    return has_person

# Filter the training split: a sentence is kept only when the predicate returns True for it.
filtered_train_person = dataset["train"].filter(filter_only_person)

# Number of sentences that survived the filter.
print("Number of sentences containing only the specified ner tag:", len(filtered_train_person))

Number of sentences containing only the specified ner tag: 9622


In [18]:
# Spot-check one kept sentence: every non-zero tag id should be the person id.
filtered_train_person[4].get("ner_tags")

[0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 0, 0, 7, 7, 0, 0, 7, 7, 0]

In [19]:
# Map tag id 7 back to its label name.
ner_tags[7]

'person'

Keeping the sentences whose entities belong to any of several coarse NER tags

In [20]:
# Ids of the two coarse tags that are accepted when filtering on several categories.
person_index = ner_tags.index("person")
location_index = ner_tags.index("location")

def filter_person_or_location(example):
    """Tell whether a sentence's entities are all persons and/or locations.

    Reads the accepted tag ids from the module-level ``person_index`` and
    ``location_index``.

    Args:
        example: A dataset record; only ``example["ner_tags"]`` is read.

    Returns:
        bool: True if there is at least one entity token and every entity
        token is tagged person or location; False otherwise.
    """
    accepted = (person_index, location_index)
    # Tag id 0 marks non-entity tokens, which never influence the decision.
    entity_tags = [tag for tag in example["ner_tags"] if tag != 0]
    # A sentence without any entity is rejected; otherwise all entities must be accepted.
    return bool(entity_tags) and all(tag in accepted for tag in entity_tags)

# Keep the training sentences whose entity tokens are all tagged person or location.
filtered_train_person_location = dataset["train"].filter(filter_person_or_location)

# Number of sentences that survived the filter.
print("Number of sentences containing only the specified ner tags:", len(filtered_train_person_location))

Number of sentences containing only the specified ner tags: 49134


In [21]:
# Spot-check the first kept sentence: its non-zero tag ids should be person and/or location ids.
filtered_train_person_location[0].get("ner_tags")

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 4, 0, 4, 4, 0, 4, 4, 0, 4, 4, 0]

In [22]:
def contains_both_person_and_location(example):
    """Tell whether a sentence contains a person tag and a location tag.

    Reads the tag ids from the module-level ``person_index`` and
    ``location_index``.

    Args:
        example: A dataset record; only ``example["ner_tags"]`` is read.

    Returns:
        bool: True as soon as both categories have been seen while scanning
        the tags; False if the scan ends with at least one of them missing.
    """
    # Categories not encountered yet; the sentence qualifies once this is empty.
    still_missing = {"person", "location"}
    for current_tag in example["ner_tags"]:
        if current_tag == person_index:
            still_missing.discard("person")
        elif current_tag == location_index:
            still_missing.discard("location")
        # Stop scanning as soon as both categories were found.
        if not still_missing:
            return True
    return False

# Narrow the person/location subset down to the sentences that contain both tags.
filtered_both = filtered_train_person_location.filter(contains_both_person_and_location)

# Number of sentences that survived this second filter.
print("Number of sentences containing both the ner tags:", len(filtered_both))

Filter:   0%|          | 0/49134 [00:00<?, ? examples/s]

Number of sentences containing both the ner tags: 6220


In [23]:
# Spot-check the first kept sentence: both the person id and the location id should appear.
filtered_both[0].get("ner_tags")

[7,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0]