In [1]:
from datasets import load_dataset

In [2]:
# Load Few-NERD in its "inter" configuration.
dataset = load_dataset("DFKI-SLT/few-nerd", "inter") # available configurations: "supervised", "inter", "intra"

# inter: every split shares the same coarse-grained categories (the per-split
#        stats further down show that the fine-grained types differ per split)
# intra: the splits use different coarse-grained categories

Shape of train, validation and test sets

In [3]:
train = dataset["train"]
train.shape

(130112, 4)

In [4]:
val = dataset["validation"] 
val.shape

(18817, 4)

In [5]:
test = dataset["test"] 
test.shape

(14007, 4)

NER and fine NER tags

In [6]:
ner_tags = train.features["ner_tags"].feature.names
ner_tags

['O',
 'art',
 'building',
 'event',
 'location',
 'organization',
 'other',
 'person',
 'product']

In [7]:
# Find the index of a tag given the tag
index_person = ner_tags.index("person")
index_person

7

In [8]:
fine_ner_tags = train.features["fine_ner_tags"].feature.names
fine_ner_tags

['O',
 'art-broadcastprogram',
 'art-film',
 'art-music',
 'art-other',
 'art-painting',
 'art-writtenart',
 'building-airport',
 'building-hospital',
 'building-hotel',
 'building-library',
 'building-other',
 'building-restaurant',
 'building-sportsfacility',
 'building-theater',
 'event-attack/battle/war/militaryconflict',
 'event-disaster',
 'event-election',
 'event-other',
 'event-protest',
 'event-sportsevent',
 'location-GPE',
 'location-bodiesofwater',
 'location-island',
 'location-mountain',
 'location-other',
 'location-park',
 'location-road/railway/highway/transit',
 'organization-company',
 'organization-education',
 'organization-government/governmentagency',
 'organization-media/newspaper',
 'organization-other',
 'organization-politicalparty',
 'organization-religion',
 'organization-showorganization',
 'organization-sportsleague',
 'organization-sportsteam',
 'other-astronomything',
 'other-award',
 'other-biologything',
 'other-chemicalthing',
 'other-currency',
 'o

In [9]:
# Same as before but with fine NER
index_event_election = fine_ner_tags.index("event-election")
index_event_election

17

Closer look at the (training) dataset

In [10]:
train[10]

{'id': '10',
 'tokens': ['The',
  'City',
  'of',
  'Bradenton',
  'talked',
  'A',
  "'s",
  'owner',
  'Charlie',
  'Finley',
  'into',
  'staying',
  'at',
  'McKechnie',
  'until',
  '.'],
 'ner_tags': [0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0],
 'fine_ner_tags': [0, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 0]}

In [11]:
train[10].get("id")

'10'

In [12]:
train[10].get("tokens")

['The',
 'City',
 'of',
 'Bradenton',
 'talked',
 'A',
 "'s",
 'owner',
 'Charlie',
 'Finley',
 'into',
 'staying',
 'at',
 'McKechnie',
 'until',
 '.']

In [13]:
train[10].get("ner_tags")

[0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0]

In [14]:
train[10].get("fine_ner_tags")

[0, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 0]

In [15]:
# Decode the coarse tag ids of sentence 10 into label names, one per line.
train_10_ner_tags = train[10]["ner_tags"]
print(train_10_ner_tags)

for tag_name in (ner_tags[tag_id] for tag_id in train_10_ner_tags):
    print(tag_name)

[0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0]
O
location
location
location
O
O
O
O
O
O
O
O
O
organization
O
O


In [16]:
# Decode the fine tag ids of sentence 10 into label names, one per line.
train_10_fine_ner_tags = train[10]["fine_ner_tags"]
print(train_10_fine_ner_tags)

for tag_name in (fine_ner_tags[tag_id] for tag_id in train_10_fine_ner_tags):
    print(tag_name)

[0, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 0]
O
location-GPE
location-GPE
location-GPE
O
O
O
O
O
O
O
O
O
organization-sportsleague
O
O


If we want to work only with the fine-grained NER tags of a single coarse-grained category

In [17]:
# Only keep the sentences having person as ner tag
ner_tag_index = ner_tags.index("person")

def filter_only_person(example, *, tag_index=None):
    """Tell whether the target tag is the only entity type in a sentence.

    Args:
        example: A dataset row; its "ner_tags" entry is the list of coarse
            tag ids of the sentence, where 0 marks a non-entity token.
        tag_index: Coarse tag id to look for. When omitted, the notebook-level
            ``ner_tag_index`` is used.

    Returns:
        True if at least one token carries the target tag and no token carries
        any other entity tag; False otherwise. A sentence without any entity
        token yields False.
    """
    if tag_index is None:
        # Resolved at call time so the function follows the notebook variable.
        tag_index = ner_tag_index

    # Must start as False: a sentence made only of non-entity tokens never
    # reaches the assignment inside the loop.
    has_person = False
    for tag in example["ner_tags"]:
        # skip non-entity tokens
        if tag == 0:
            continue
        if tag != tag_index:
            # another entity type is present, so the sentence is rejected
            return False
        has_person = True
    return has_person

# Filter the dataset
filtered_train_person = dataset["train"].filter(filter_only_person)

print("Number of sentences containing only the specified ner tag:", len(filtered_train_person))

Number of sentences containing only the specified ner tag: 9622


In [18]:
filtered_train_person[0]

{'id': '15',
 'tokens': ['Sherman',
  'had',
  'suffered',
  'from',
  'several',
  'health',
  'problems',
  ',',
  'including',
  'kidney',
  'ailments',
  ',',
  'and',
  'injuries',
  'from',
  'a',
  'car',
  'accident',
  'several',
  'weeks',
  'before',
  'his',
  'death',
  'may',
  'have',
  'contributed',
  'to',
  'his',
  'declining',
  'health',
  '.'],
 'ner_tags': [7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'fine_ner_tags': [55,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [19]:
filtered_train_person[0].get("tokens")

['Sherman',
 'had',
 'suffered',
 'from',
 'several',
 'health',
 'problems',
 ',',
 'including',
 'kidney',
 'ailments',
 ',',
 'and',
 'injuries',
 'from',
 'a',
 'car',
 'accident',
 'several',
 'weeks',
 'before',
 'his',
 'death',
 'may',
 'have',
 'contributed',
 'to',
 'his',
 'declining',
 'health',
 '.']

In [20]:
filtered_train_person[0].get("ner_tags")

[7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [21]:
ner_tags[7]

'person'

In [22]:
filtered_train_person[0].get("fine_ner_tags")

[55,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [23]:
# NOTE(review): filtered_train_person[0] above carries fine tag 55, whereas 21 is
# the fine tag seen earlier in train[10]; confirm which index was meant here.
fine_ner_tags[21]

'location-GPE'

If we want multiple NER tags (I got this part working with the **inter** configuration, so the second function may not work when a different configuration is loaded)

In [24]:
person_index = ner_tags.index("person")
location_index = ner_tags.index("location")

def filter_person_or_location(example):
    """Accept a sentence whose entity tokens are all PERSON or LOCATION.

    Tokens tagged 0 (non-entity) are ignored. The sentence is kept only when
    at least one entity token exists and every entity token uses
    ``person_index`` or ``location_index``.
    """
    allowed = (person_index, location_index)
    entity_tags = [tag for tag in example["ner_tags"] if tag != 0]
    # an empty entity list means nothing valid was found
    return bool(entity_tags) and all(tag in allowed for tag in entity_tags)

filtered_train_person_location = dataset["train"].filter(filter_person_or_location)

print("Number of sentences containing only the specified ner tags:", len(filtered_train_person_location))

Number of sentences containing only the specified ner tags: 49134


In [25]:
filtered_train_person_location[0].get("ner_tags")

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 4, 0, 4, 4, 0, 4, 4, 0, 4, 4, 0]

Both person and location

In [26]:
def contains_both_person_and_location(example):
    """Return True when the sentence has a PERSON token and a LOCATION token.

    Uses the notebook-level ``person_index`` and ``location_index``.
    """
    present = set(example["ner_tags"])
    return person_index in present and location_index in present

filtered_both = filtered_train_person_location.filter(contains_both_person_and_location)

print("Number of sentences containing both the ner tags:", len(filtered_both))

Number of sentences containing both the ner tags: 6220


In [27]:
filtered_both[0].get("ner_tags")

[7,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0]

Some Stats

Sentences containing at least one coarse-grained label

In [28]:
# Keep the train sentences in which some token carries the coarse PERSON tag.
index_person = ner_tags.index("person")
train_person = train.filter(
    lambda example: any(tag == index_person for tag in example["ner_tags"])
)

print(f"{train_person.num_rows} sentences contain a PERSON token")

24148 sentences contain a PERSON token


In [29]:
# Keep the train sentences in which some token carries the coarse PRODUCT tag.
index_product = ner_tags.index("product")
train_product = train.filter(
    lambda example: any(tag == index_product for tag in example["ner_tags"])
)

print(f"{train_product.num_rows} sentences contain a PRODUCT token")

11359 sentences contain a PRODUCT token


In [30]:
# Keep the train sentences in which some token carries the coarse ORGANIZATION tag.
index_organization = ner_tags.index("organization")
train_organization = train.filter(
    lambda example: any(tag == index_organization for tag in example["ner_tags"])
)

print(f"{train_organization.num_rows} sentences contain a ORGANIZATION token")

46631 sentences contain a ORGANIZATION token


In [31]:
# Share of train sentences per coarse tag, printed as fractions in the order
# product, person, organization (not multiplied by 100).
for subset in (train_product, train_person, train_organization):
    print(len(subset) / len(train))

0.087301709296606
0.18559394982784064
0.35839123216920804


Total number of examples of a specific coarse-grained label

In [32]:
# Token-level total: every train token tagged PRODUCT is counted once.
index_product = ner_tags.index("product")
total_product_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_product), train["ner_tags"])
)

print(f"Total number of PRODUCT labels in train: {total_product_labels}")

Total number of PRODUCT labels in train: 39653


In [33]:
# Token-level total: every train token tagged PERSON is counted once.
index_person = ner_tags.index("person")
total_person_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_person), train["ner_tags"])
)

print(f"Total number of PERSON labels in train: {total_person_labels}")

Total number of PERSON labels in train: 68321


In [34]:
# Token-level total: every train token tagged ORGANIZATION is counted once.
index_organization = ner_tags.index("organization")
total_organization_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_organization), train["ner_tags"])
)

print(f"Total number of ORGANIZATION labels in train: {total_organization_labels}")

Total number of ORGANIZATION labels in train: 183962


Sentences containing at least one fine-grained label within a coarse-grained one

In [35]:
# Per fine-grained PRODUCT type: number of train sentences mentioning it at least once.
index_product = ner_tags.index("product")

product_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("product")]

train_product = train.filter(lambda ex: index_product in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(train_product["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in product_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained product type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained product type:
product-airplane → 2409
product-car → 0
product-food → 1080
product-game → 0
product-other → 4841
product-ship → 1450
product-software → 2123
product-train → 0
product-weapon → 0


In [36]:
# Per fine-grained PERSON type: number of train sentences mentioning it at least once.
index_person = ner_tags.index("person")

person_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("person")]

train_person = train.filter(lambda ex: index_person in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(train_person["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in person_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained person type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained person type:
person-actor → 0
person-artist/author → 10494
person-athlete → 0
person-director → 2403
person-other → 0
person-politician → 9547
person-scholar → 0
person-soldier → 2614


In [37]:
# Per fine-grained ORGANIZATION type: number of train sentences mentioning it at least once.
index_organization = ner_tags.index("organization")

organization_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("organization")]

train_organization = train.filter(lambda ex: index_organization in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(train_organization["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in organization_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained organization type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained organization type:
organization-company → 12890
organization-education → 7472
organization-government/governmentagency → 0
organization-media/newspaper → 4109
organization-other → 15225
organization-politicalparty → 0
organization-religion → 0
organization-showorganization → 0
organization-sportsleague → 3296
organization-sportsteam → 8329


Total number of examples of a specific fine-grained label within a given coarse-grained one

In [38]:
# Per fine-grained PRODUCT type: number of train tokens carrying it
# (one sentence can contribute several tokens).
index_product = ner_tags.index("product")

product_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("product")]

train_product = train.filter(lambda ex: index_product in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(train_product["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in product_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained product type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained product type:
product-airplane → 8306
product-car → 0
product-food → 3863
product-game → 0
product-other → 16198
product-ship → 3756
product-software → 7530
product-train → 0
product-weapon → 0


In [39]:
# Per fine-grained PERSON type: number of train tokens carrying it
# (one sentence can contribute several tokens).
index_person = ner_tags.index("person")

person_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("person")]

train_person = train.filter(lambda ex: index_person in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(train_person["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in person_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained person type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained person type:
person-actor → 0
person-artist/author → 31553
person-athlete → 0
person-director → 5795
person-other → 0
person-politician → 24898
person-scholar → 0
person-soldier → 6075


In [40]:
# Per fine-grained ORGANIZATION type: number of train tokens carrying it
# (one sentence can contribute several tokens).
index_organization = ner_tags.index("organization")

organization_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("organization")]

train_organization = train.filter(lambda ex: index_organization in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(train_organization["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in organization_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained organization type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained organization type:
organization-company → 41167
organization-education → 33839
organization-government/governmentagency → 0
organization-media/newspaper → 11969
organization-other → 61718
organization-politicalparty → 0
organization-religion → 0
organization-showorganization → 0
organization-sportsleague → 10824
organization-sportsteam → 24445


Stats on DEV (VAL)

In [41]:
# Keep the validation sentences in which some token carries the coarse PERSON tag.
index_person = ner_tags.index("person")
val_person = val.filter(
    lambda example: any(tag == index_person for tag in example["ner_tags"])
)

print(f"{val_person.num_rows} sentences contain a PERSON token")

10381 sentences contain a PERSON token


In [42]:
# Keep the validation sentences in which some token carries the coarse PRODUCT tag.
index_product = ner_tags.index("product")
val_product = val.filter(
    lambda example: any(tag == index_product for tag in example["ner_tags"])
)

print(f"{val_product.num_rows} sentences contain a PRODUCT token")

963 sentences contain a PRODUCT token


In [43]:
# Keep the validation sentences in which some token carries the coarse ORGANIZATION tag.
index_organization = ner_tags.index("organization")
val_organization = val.filter(
    lambda example: any(tag == index_organization for tag in example["ner_tags"])
)

print(f"{val_organization.num_rows} sentences contain a ORGANIZATION token")

1570 sentences contain a ORGANIZATION token


In [44]:
# Share of validation sentences per coarse tag, printed as fractions in the
# order product, person, organization (not multiplied by 100).
for subset in (val_product, val_person, val_organization):
    print(len(subset) / len(val))

0.051177127065951004
0.5516819896901738
0.08343519158208003


In [45]:
# Token-level total: every validation token tagged PRODUCT is counted once.
index_product = ner_tags.index("product")
total_product_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_product), val["ner_tags"])
)

print(f"Total number of PRODUCT labels in val: {total_product_labels}")

Total number of PRODUCT labels in val: 3957


In [46]:
# Token-level total: every validation token tagged PERSON is counted once.
index_person = ner_tags.index("person")
total_person_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_person), val["ner_tags"])
)

print(f"Total number of PERSON labels in val: {total_person_labels}")

Total number of PERSON labels in val: 29357


In [47]:
# Token-level total: every validation token tagged ORGANIZATION is counted once.
index_organization = ner_tags.index("organization")
total_organization_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_organization), val["ner_tags"])
)

print(f"Total number of ORGANIZATION labels in val: {total_organization_labels}")

Total number of ORGANIZATION labels in val: 5147


In [48]:
# Per fine-grained PRODUCT type: number of validation sentences mentioning it at least once.
index_product = ner_tags.index("product")

product_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("product")]

val_product = val.filter(lambda ex: index_product in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(val_product["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in product_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained product type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained product type:
product-airplane → 0
product-car → 0
product-food → 0
product-game → 582
product-other → 0
product-ship → 0
product-software → 0
product-train → 381
product-weapon → 0


In [49]:
# Per fine-grained PERSON type: number of validation sentences mentioning it at least once.
index_person = ner_tags.index("person")

person_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("person")]

val_person = val.filter(lambda ex: index_person in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(val_person["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in person_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained person type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained person type:
person-actor → 0
person-artist/author → 0
person-athlete → 0
person-director → 0
person-other → 9628
person-politician → 0
person-scholar → 835
person-soldier → 0


In [50]:
# Per fine-grained ORGANIZATION type: number of validation sentences mentioning it at least once.
index_organization = ner_tags.index("organization")

organization_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("organization")]

val_organization = val.filter(lambda ex: index_organization in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(val_organization["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in organization_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained organization type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained organization type:
organization-company → 0
organization-education → 0
organization-government/governmentagency → 0
organization-media/newspaper → 0
organization-other → 0
organization-politicalparty → 0
organization-religion → 938
organization-showorganization → 638
organization-sportsleague → 0
organization-sportsteam → 0


In [51]:
# Per fine-grained PRODUCT type: number of validation tokens carrying it
# (one sentence can contribute several tokens).
index_product = ner_tags.index("product")

product_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("product")]

val_product = val.filter(lambda ex: index_product in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(val_product["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in product_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained product type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained product type:
product-airplane → 0
product-car → 0
product-food → 0
product-game → 2384
product-other → 0
product-ship → 0
product-software → 0
product-train → 1573
product-weapon → 0


In [52]:
# Per fine-grained PERSON type: number of validation tokens carrying it
# (one sentence can contribute several tokens).
index_person = ner_tags.index("person")

person_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("person")]

val_person = val.filter(lambda ex: index_person in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(val_person["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in person_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained person type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained person type:
person-actor → 0
person-artist/author → 0
person-athlete → 0
person-director → 0
person-other → 27328
person-politician → 0
person-scholar → 2029
person-soldier → 0


In [53]:
# Per fine-grained ORGANIZATION type: number of validation tokens carrying it
# (one sentence can contribute several tokens).
index_organization = ner_tags.index("organization")

organization_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("organization")]

val_organization = val.filter(lambda ex: index_organization in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(val_organization["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in organization_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained organization type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained organization type:
organization-company → 0
organization-education → 0
organization-government/governmentagency → 0
organization-media/newspaper → 0
organization-other → 0
organization-politicalparty → 0
organization-religion → 2903
organization-showorganization → 2244
organization-sportsleague → 0
organization-sportsteam → 0


Stats on TEST

In [54]:
# Keep the test sentences in which some token carries the coarse PERSON tag.
index_person = ner_tags.index("person")
test_person = test.filter(
    lambda example: any(tag == index_person for tag in example["ner_tags"])
)

print(f"{test_person.num_rows} sentences contain a PERSON token")

3761 sentences contain a PERSON token


In [55]:
# Keep the test sentences in which some token carries the coarse PRODUCT tag.
index_product = ner_tags.index("product")
test_product = test.filter(
    lambda example: any(tag == index_product for tag in example["ner_tags"])
)

print(f"{test_product.num_rows} sentences contain a PRODUCT token")

1425 sentences contain a PRODUCT token


In [56]:
# Keep the test sentences in which some token carries the coarse ORGANIZATION tag.
index_organization = ner_tags.index("organization")
test_organization = test.filter(
    lambda example: any(tag == index_organization for tag in example["ner_tags"])
)

print(f"{test_organization.num_rows} sentences contain a ORGANIZATION token")

1992 sentences contain a ORGANIZATION token


In [57]:
# Share of test sentences per coarse tag, printed as fractions in the order
# product, person, organization (not multiplied by 100).
for subset in (test_product, test_person, test_organization):
    print(len(subset) / len(test))

0.10173484686228314
0.2685086028414364
0.14221460698222319


In [58]:
# Token-level total: every test token tagged PRODUCT is counted once.
index_product = ner_tags.index("product")
total_product_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_product), test["ner_tags"])
)

print(f"Total number of PRODUCT labels in test: {total_product_labels}")

Total number of PRODUCT labels in test: 6013


In [59]:
# Token-level total: every test token tagged PERSON is counted once.
index_person = ner_tags.index("person")
total_person_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_person), test["ner_tags"])
)

print(f"Total number of PERSON labels in test: {total_person_labels}")

Total number of PERSON labels in test: 13330


In [60]:
# Token-level total: every test token tagged ORGANIZATION is counted once.
index_organization = ner_tags.index("organization")
total_organization_labels = sum(
    map(lambda sentence_tags: sentence_tags.count(index_organization), test["ner_tags"])
)

print(f"Total number of ORGANIZATION labels in test: {total_organization_labels}")

Total number of ORGANIZATION labels in test: 8815


In [61]:
# Per fine-grained PRODUCT type: number of test sentences mentioning it at least once.
index_product = ner_tags.index("product")

product_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("product")]

test_product = test.filter(lambda ex: index_product in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(test_product["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in product_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained product type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained product type:
product-airplane → 0
product-car → 762
product-food → 0
product-game → 0
product-other → 0
product-ship → 0
product-software → 0
product-train → 0
product-weapon → 671


In [62]:
# Per fine-grained PERSON type: number of test sentences mentioning it at least once.
index_person = ner_tags.index("person")

person_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("person")]

test_person = test.filter(lambda ex: index_person in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(test_person["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in person_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained person type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained person type:
person-actor → 822
person-artist/author → 0
person-athlete → 2945
person-director → 0
person-other → 0
person-politician → 0
person-scholar → 0
person-soldier → 0


In [63]:
# Per fine-grained ORGANIZATION type: number of test sentences mentioning it at least once.
index_organization = ner_tags.index("organization")

organization_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("organization")]

test_organization = test.filter(lambda ex: index_organization in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(test_organization["fine_ner_tags"])

label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in organization_fine_labels}
counts = {
    # count sentences where this fine tag appears at least once
    lbl: sum(1 for tags in fine_tag_rows if idx_lbl in tags)
    for lbl, idx_lbl in label_ids.items()
}

print("Sentence counts per fine‑grained organization type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained organization type:
organization-company → 0
organization-education → 0
organization-government/governmentagency → 1340
organization-media/newspaper → 0
organization-other → 0
organization-politicalparty → 769
organization-religion → 0
organization-showorganization → 0
organization-sportsleague → 0
organization-sportsteam → 0


In [64]:
# Per fine-grained PRODUCT type: number of test tokens carrying it
# (one sentence can contribute several tokens).
index_product = ner_tags.index("product")

product_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("product")]

test_product = test.filter(lambda ex: index_product in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(test_product["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in product_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained product type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained product type:
product-airplane → 0
product-car → 3178
product-food → 0
product-game → 0
product-other → 0
product-ship → 0
product-software → 0
product-train → 0
product-weapon → 2835


In [65]:
# Per fine-grained PERSON type: number of test tokens carrying it
# (one sentence can contribute several tokens).
index_person = ner_tags.index("person")

person_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("person")]

test_person = test.filter(lambda ex: index_person in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(test_person["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in person_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained person type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained person type:
person-actor → 4142
person-artist/author → 0
person-athlete → 9188
person-director → 0
person-other → 0
person-politician → 0
person-scholar → 0
person-soldier → 0


In [66]:
# Per fine-grained ORGANIZATION type: number of test tokens carrying it
# (one sentence can contribute several tokens).
index_organization = ner_tags.index("organization")

organization_fine_labels = [lbl for lbl in fine_ner_tags if lbl.startswith("organization")]

test_organization = test.filter(lambda ex: index_organization in ex["ner_tags"])

# Read the column once up front so the counting below does not re-read it
# from the dataset for every label.
fine_tag_rows = list(test_organization["fine_ner_tags"])

# sum up all occurrences of each fine label
label_ids = {lbl: fine_ner_tags.index(lbl) for lbl in organization_fine_labels}
counts = {
    lbl: sum(tags.count(idx_lbl) for tags in fine_tag_rows)
    for lbl, idx_lbl in label_ids.items()
}

# The figures are token-level totals, so the header names them as such.
print("Token counts per fine‑grained organization type:")
for lbl, cnt in counts.items():
    print(f"{lbl} → {cnt}")

Sentence counts per fine‑grained organization type:
organization-company → 0
organization-education → 0
organization-government/governmentagency → 5464
organization-media/newspaper → 0
organization-other → 0
organization-politicalparty → 3351
organization-religion → 0
organization-showorganization → 0
organization-sportsleague → 0
organization-sportsteam → 0
