In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import csv
import time

# Gemini libraries
from google import genai
from google.genai import types

In [2]:
# To manage environment variables
from google.colab import userdata

In [3]:
# To download the dataset
import tensorflow_datasets as tfds

In [4]:
# Download the CoNLL-2003 dataset. `ds` holds one dataset per requested split and
# `info` carries the dataset metadata (features, label names, ...).
ds, info = tfds.load("conll2003", split=['train', 'dev', 'test'], with_info=True)
# Unpack in the same order as the `split` argument: train, dev (validation), test.
train_ds, val_ds, test_ds = ds
print(info)

tfds.core.DatasetInfo(
    name='conll2003',
    full_name='conll2003/conll2003/1.0.0',
    description="""
    The shared task of CoNLL-2003 concerns language-independent named entity
    recognition and concentrates on four types of named entities: persons,
    locations, organizations and names of miscellaneous entities that do not belong
    to the previous three groups.
    """,
    homepage='https://www.aclweb.org/anthology/W03-0419/',
    data_dir='/root/tensorflow_datasets/conll2003/conll2003/1.0.0',
    file_format=tfrecord,
    download_size=959.94 KiB,
    dataset_size=3.87 MiB,
    features=FeaturesDict({
        'chunks': Sequence(ClassLabel(shape=(), dtype=int64, num_classes=23)),
        'ner': Sequence(ClassLabel(shape=(), dtype=int64, num_classes=9)),
        'pos': Sequence(ClassLabel(shape=(), dtype=int64, num_classes=47)),
        'tokens': Sequence(Text(shape=(), dtype=string)),
    }),
    supervised_keys=None,
    disable_shuffling=False,
    nondeterministic_ord

In [5]:
# Peek at the first three training examples: raw tokens and their integer NER ids
for example in train_ds.take(3):
    token_array = example["tokens"].numpy()
    ner_id_array = example["ner"].numpy()
    print("Tokens: ", token_array)
    print("NER tags:", ner_id_array)

Tokens:  [b'"' b'If' b'they' b"'re" b'saying' b'at' b'least' b'20' b'percent' b','
 b'then' b'their' b'internal' b'forecasts' b'are' b'probably' b'saying'
 b'25' b'or' b'30' b'percent' b',' b'"' b'said' b'one' b'Sydney' b'media'
 b'analyst' b'who' b'declined' b'to' b'be' b'named' b'.']
NER tags: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0]
Tokens:  [b'Lauck' b"'s" b'lawyer' b'vowed' b'he' b'would' b'appeal' b'against'
 b'the' b'court' b"'s" b'decision' b',' b'arguing' b'that' b'his'
 b'client' b'should' b'have' b'been' b'set' b'free' b'because' b'he'
 b'had' b'not' b'committed' b'any' b'offence' b'under' b'German' b'law'
 b'.']
NER tags: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0]
Tokens:  [b'Thailand' b"'s" b'powerful' b'military' b'thinks' b'the' b'government'
 b'is' b'dishonest' b'and' b'Prime' b'Minister' b'Banharn' b'Silpa-archa'
 b"'s" b'resignation' b'might' b'solve' b'the' b'nation' b"'s"
 b'political' b'and' b'economic' b'woes' 

**DATASET**

*NER tags*:

- **ORGANIZATION** such as *Georgia-Pacific Corp.*, *WHO*.
- **PERSON** such as *Eddy Bonte*, *President Obama*.
- **LOCATION** such as *Murray River*, *Mount Everest*.
- **MISCELLANEOUS** - Miscellaneous entities such as *events*, *nationalities*, *products*, or *works of art*.

In [6]:
# Human-readable names of the NER classes; the integer tag ids of an example
# are used as indices into this list.
ner_label_names = info.features["ner"].names
print(ner_label_names)  # ['O', 'B-PER', 'I-PER', 'B-ORG', ...]

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [7]:
# Rebuild the first three training sentences as plain strings:
# the token array is converted to str and joined with single spaces.
sentences = [
    " ".join(example["tokens"].numpy().astype(str))
    for example in train_ds.take(3)
]

In [8]:
# Show the sentences rebuilt from the token arrays
for s in sentences:
  print("Example of a phrase: ", s)

Example of a phrase:  " If they 're saying at least 20 percent , then their internal forecasts are probably saying 25 or 30 percent , " said one Sydney media analyst who declined to be named .
Example of a phrase:  Lauck 's lawyer vowed he would appeal against the court 's decision , arguing that his client should have been set free because he had not committed any offence under German law .
Example of a phrase:  Thailand 's powerful military thinks the government is dishonest and Prime Minister Banharn Silpa-archa 's resignation might solve the nation 's political and economic woes , an opinion poll showed on Thursday .


In [9]:
import random

In [10]:
# Hyperparameter: minimum number of entity-tagged tokens (NER tag other than 'O')
# that an example must contain to be taken into consideration.
min_num_tags = 4

In [11]:
# Build a seeded random sample of 60 entity-rich sentences from the training set

# Collect a (sentence, tag-name list) pair for every example with enough entity tokens
dataset_array = []

for example in train_ds:
  ner_ids = example["ner"].numpy()
  # Tag id 0 is 'O', so the non-zero ids are the entity-tagged tokens
  if np.count_nonzero(ner_ids) < min_num_tags:
    continue
  words = example["tokens"].numpy().astype(str)
  tag_names = [ner_label_names[tag_id] for tag_id in ner_ids]
  dataset_array.append((" ".join(words), tag_names))

# Print an example
print("Example: ", dataset_array[0])

# Seed the generator before shuffling so the shuffle is repeatable
random.seed(77)
random.shuffle(dataset_array)

# Keep only the first 60 examples
dataset_array = dataset_array[:60]

# Split the pairs into two parallel lists: sentences and their true labels
dataset_sentences_array = [pair[0] for pair in dataset_array]
dataset_labels_array = [pair[1] for pair in dataset_array]

# Print the length
print("Len sentence array: ", len(dataset_sentences_array))
print("Len true label array: ", len(dataset_labels_array))

Example:  ("A forensic scientist who examined the supposed skull of 19th century King Hintsa , a chief of President Nelson Mandela 's Xhosa tribe killed in battle by the British , said it was in fact the cranium of a European woman .", ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O'])
Len sentence array:  60
Len true label array:  60


In [12]:
# Keep only the words related to a NER tag

print("A sentence before the operation: ", dataset_sentences_array[1])
print("A true label before the operation: ", dataset_labels_array[1])

# One entry per sentence: the list of (word, tag) pairs whose tag is not 'O'
wordpair_sentence_label = []

# Iterate over sentence/label pairs directly instead of nesting two loops that
# both use `i` as their index (the inner loop used to overwrite the outer one).
for sentence, label in zip(dataset_sentences_array, dataset_labels_array):
  # Split once per sentence; position k of the split is paired with label[k]
  words = sentence.split(" ")
  wordpair_sentence = [
      (words[position], tag)
      for position, tag in enumerate(label)
      if tag != 'O'
  ]
  wordpair_sentence_label.append(wordpair_sentence)

print("A sentence after the operation: ", wordpair_sentence_label[1])

A sentence before the operation:  144 Peter O'Malley ( Australia ) 71 73 , Costantino Rocca ( Italy )
A true label before the operation:  ['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O']
A sentence after the operation:  [('Peter', 'B-PER'), ("O'Malley", 'I-PER'), ('Australia', 'B-LOC'), ('Costantino', 'B-PER'), ('Rocca', 'I-PER'), ('Italy', 'B-LOC')]


**Gemini** for the NER *(Named Entity Recognition)* task

In [13]:
# Import for regular expression
import re

# Analysis of the performance
def metrics (predicted, true):
  """
  Compare the (word, tag) pairs found in a model response with the true pairs.

  This function calculates the value of:
  - true positive (tp): predicted pairs that match a true pair.
  - false positive (fp): predicted pairs with no matching true pair.
  - false negative (fn): true pairs that no prediction matched.

  Matching is one-to-one: a true pair that appears twice needs two identical
  predictions to be fully matched.

  Args:
    predicted: raw text of the model response. Every two quoted strings
      separated by a comma, e.g. ("Albert", "B-PER"), are read as one pair.
    true: list of true (word, tag) tuples of the same sentence.

  Returns: tp, fp, fn
  """

  # Pattern that finds all the tuples inside the response
  raw_pairs = re.findall(r'["\'](.*?)["\']\s*,\s*["\'](.*?)["\']', predicted)

  # Split multi-word predictions into one pair per word
  # -> ex. ("New Year", "B-eve") becomes ("New", "B-eve"), ("Year", "I-eve").
  # A new list is built because removing/appending on the list being iterated
  # makes the loop skip elements.
  predicted_tuple = []
  for word_group, tag in raw_pairs:
    words = word_group.split()
    if len(words) <= 1:
      predicted_tuple.append((word_group, tag))
      continue

    # Get the entity type (the part after the B-/I- prefix), if there is one
    tag_parts = tag.split("-")
    entity = tag_parts[1] if len(tag_parts) > 1 else None

    for position, word in enumerate(words):
      if entity is None:
        # No prefix to rewrite: keep the tag exactly as the model wrote it
        predicted_tuple.append((word, tag))
      elif position == 0:
        # First B-y, other I-y
        predicted_tuple.append((word, "B-" + entity))
      else:
        predicted_tuple.append((word, "I-" + entity))

  print("Predicted vector after normalization: ", predicted_tuple)

  tp = 0
  fp = 0

  # Calculate the true positive (tp) and the false positive (fp); each true
  # pair can be consumed by one prediction only
  unmatched_true = list(true)
  for tup in predicted_tuple:
    if tup in unmatched_true:
      tp += 1
      unmatched_true.remove(tup)
    else:
      fp += 1

  # Calculate the false negative (fn): the true pairs left unmatched
  fn = len(unmatched_true)

  return tp, fp, fn

In [14]:
# Get the API key from the environment of Google Colab (aka Secrets),
# so the key itself is never written in the notebook.

GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
# Client object used for the Gemini requests in the following cells
client = genai.Client(api_key=GOOGLE_API_KEY)

In [20]:
# Running totals over all the evaluated sentences
tp, fp, fn = 0, 0, 0

# Evaluate the first 30 sentences (or all of them if fewer are available)
num_eval_sentences = min(30, len(dataset_sentences_array))
# Upper bound on the API attempts per sentence, so that a persistent failure
# cannot block the notebook forever
max_attempts = 5

for i in range(num_eval_sentences):

  # Prompt for the NER task
  prompt = """Do the Name Entity Recognition (also known as NER) task to a following phrase I will give you.
  The only entity labels that could appear in the phrase are:
    - Location (LOC).
    - Organization (ORG).
    - Person (PER).
    - Miscellaneos (MISC).
  You have to use prefixes B (for begin) or I (for inside) together with a tag in the list ahead (for example use "B-PER" and not just "PER").

  You must keep only the words with a relevant tag (the ones I've listed before).

  The result must be in a list of tuples of word-tag like: ("Albert", "B-PER").

  Answer with less words as possible.

  Your phrase is: """ + dataset_sentences_array[i] + """

  Result:
  """

  print(prompt)

  # Ask the model, retrying the SAME sentence when the request fails.
  # The retry must be an explicit inner loop: re-assigning `i` inside a
  # `for i in range(...)` loop has no effect on the next iteration.
  response_text = None
  for attempt in range(1, max_attempts + 1):
    try:
      response = client.models.generate_content(
          model="gemini-2.0-flash",
          contents=prompt,
      )
      response_text = response.text
      # Guard against a response without text, which cannot be scored
      if response_text is None:
        raise ValueError("the response contains no text")
      break

    # An error occurred (probably Gemini is overloaded), so wait 15 seconds
    except Exception as e:
      print(f"An error occurred for sentence {i} (attempt {attempt}/{max_attempts}): {e}")
      if attempt < max_attempts:
        time.sleep(15)
  else:
    # Reached only when no attempt ended with `break`: report and move on
    print(f"Skipping sentence {i}: no usable response after {max_attempts} attempts")
    continue

  print(response_text)

  # Scoring is kept outside the API retry loop: a parsing problem is not an
  # API failure and calling the model again would not fix it
  try:
    tem_tp, tem_fp, tem_fn = metrics(response_text, wordpair_sentence_label[i])
  except Exception as e:
    print(f"Could not score the response for sentence {i}: {e}")
    continue

  print("\nTure vector [" + str(wordpair_sentence_label[i]) + "]")
  print("tp: ", tem_tp)
  print("fp: ", tem_fp)
  print("fn: ", tem_fn)

  tp += tem_tp
  fp += tem_fp
  fn += tem_fn

  # Sleep to not exceed the API limit
  time.sleep(6)

Do the Name Entity Recognition (also known as NER) task to a following phrase I will give you.
  The only entity labels that could appear in the phrase are:
    - Location (LOC).
    - Organization (ORG).
    - Person (PER).
    - Miscellaneos (MISC).
  You have to use prefixes B (for begin) or I (for inside) together with a tag in the list ahead (for example use "B-PER" and not just "PER").

  You must keep only the words with a relevant tag (the ones I've listed before).

  The result must be in a list of tuples of word-tag like: ("Albert", "B-PER").

  Answer with less words as possible.

  Your phrase is: Scorers : Ewald Brenner ( 5th minute ) , Mario Stieglmair ( 42nd ) , Ronald Brunmayr ( 43rd and 56th ) .

  Result:
  
[("Ewald", "B-PER"), ("Brenner", "I-PER"), ("Mario", "B-PER"), ("Stieglmair", "I-PER"), ("Ronald", "B-PER"), ("Brunmayr", "I-PER")]

Predicted vector after normalization:  [('Ewald', 'B-PER'), ('Brenner', 'I-PER'), ('Mario', 'B-PER'), ('Stieglmair', 'I-PER'), ('Ro

In [21]:
# Precision, recall and F1 from the accumulated tp / fp / fn counts.
# Each score falls back to 0 when its denominator is not positive.
predicted_total = tp + fp
true_total = tp + fn

if predicted_total > 0:
  precision = tp / predicted_total
else:
  precision = 0

if true_total > 0:
  recall = tp / true_total
else:
  recall = 0

score_sum = precision + recall
if score_sum > 0:
  f1 = (2 * precision * recall) / score_sum
else:
  f1 = 0

print("Results: ")
print("Recall: ", recall)
print("Precision: ", precision)
print("F1: ", f1)

Results: 
Recall:  0.8951048951048951
Precision:  0.9411764705882353
F1:  0.9175627240143368


In [22]:
# Auto write prompt for NER task:
# ask the model to describe, as a list, a methodology for doing zero-shot NER.
# The answer (autowrite_response.text) is inserted into the NER prompt in a later cell.

autowrite = """
If you have to do the NER (Name Entity Recognition) task of a given phrase (I give you the phrase) in zero-shot environment (you don't see any
data except the input phrase) and you should describe the best possible metodology that can help you to do such task in order to get the highest
possible accuracy, how do you describe that?

Consider that the dataset is already choosen and you should do only the NER task of a phrase, nothing else, but you have to achive the highest
accuracy possible.

Write down only a bullet point or a numbered list of tasks that can help you to perform the NER task . This list will be the prompt for the NER task, so no other
generated content is allowed because the prompt must be as much clear as possible.

Remember: you should describe a metodology that can help you (gemini 2.0-flash) to perform the task.

Metodology:
"""

# Single request, sent with the same model name used for the NER requests
autowrite_response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=autowrite,
)

# Print the generated methodology so it can be inspected before it is reused
print(autowrite_response.text)


Here's the methodology as a bullet point list, designed to guide a zero-shot NER task for a single phrase with maximum accuracy:

*   **Decompose the Phrase:** Break the input phrase into individual words or sub-phrases.

*   **Entity Type Priming:** Before processing, explicitly define the entity types that are relevant to the target dataset. Example: "Recognize PERSON, ORGANIZATION, LOCATION, DATE, TIME, and MISC entities."

*   **Prompt Engineering with Examples (Illustrative):** Construct prompts that include *hypothetical* examples of NER applied to similar (but distinct) phrases. The examples would demonstrate the desired output format and NER style.

*   **Leverage Contextual Clues:** Focus on identifying contextual cues within the phrase that indicate entity types (e.g., prepositions like "in" often precede LOCATION, titles like "Dr." precede PERSON).

*   **Iterative Refinement:** Start with a general prompt. Analyze the initial output and then refine the prompt to address any

In [25]:
# Try the auto write prompt for NER task

# Running totals over all the evaluated sentences
tp, fp, fn = 0, 0, 0

# Evaluate the first 30 sentences (or all of them if fewer are available)
num_eval_sentences = min(30, len(dataset_sentences_array))
# Upper bound on the API attempts per sentence, so that a persistent failure
# cannot block the notebook forever
max_attempts = 5

for i in range(num_eval_sentences):

  # Prompt for the NER task, with the model-written methodology inserted in it
  prompt = """Do the Name Entity Recognition (also known as NER) task to a following phrase I will give you.
  The only entity labels that could appear in the phrase are:
    - Location (LOC).
    - Organization (ORG).
    - Person (PER).
    - Miscellaneos (MISC).
  You have to use prefixes B (for begin) or I (for inside) together with a tag in the list ahead (for example use "B-PER" and not just "PER").

  """ + autowrite_response.text + """

  You must keep only the words with a relevant tag (the ones I've listed before).

  The result must be in a list of tuples of word-tag like: ("Albert", "B-PER").

  Answer with less words as possible.

  Your phrase is: """ + dataset_sentences_array[i] + """

  Result:
  """

  print(prompt)

  # Ask the model, retrying the SAME sentence when the request fails.
  # The retry must be an explicit inner loop: re-assigning `i` inside a
  # `for i in range(...)` loop has no effect on the next iteration.
  response_text = None
  for attempt in range(1, max_attempts + 1):
    try:
      response = client.models.generate_content(
          model="gemini-2.0-flash",
          contents=prompt,
      )
      response_text = response.text
      # Guard against a response without text, which cannot be scored
      if response_text is None:
        raise ValueError("the response contains no text")
      break

    # An error occurred (probably Gemini is overloaded), so wait 15 seconds
    except Exception as e:
      print(f"An error occurred for sentence {i} (attempt {attempt}/{max_attempts}): {e}")
      if attempt < max_attempts:
        time.sleep(15)
  else:
    # Reached only when no attempt ended with `break`: report and move on
    print(f"Skipping sentence {i}: no usable response after {max_attempts} attempts")
    continue

  print(response_text)

  # Scoring is kept outside the API retry loop: a parsing problem is not an
  # API failure and calling the model again would not fix it
  try:
    tem_tp, tem_fp, tem_fn = metrics(response_text, wordpair_sentence_label[i])
  except Exception as e:
    print(f"Could not score the response for sentence {i}: {e}")
    continue

  print("\nTure vector [" + str(wordpair_sentence_label[i]) + "]")
  print("tp: ", tem_tp)
  print("fp: ", tem_fp)
  print("fn: ", tem_fn)

  tp += tem_tp
  fp += tem_fp
  fn += tem_fn

  # Sleep to not exceed the API limit
  time.sleep(6)

Do the Name Entity Recognition (also known as NER) task to a following phrase I will give you.
  The only entity labels that could appear in the phrase are:
    - Location (LOC).
    - Organization (ORG).
    - Person (PER).
    - Miscellaneos (MISC).
  You have to use prefixes B (for begin) or I (for inside) together with a tag in the list ahead (for example use "B-PER" and not just "PER").

  Here's the methodology as a bullet point list, designed to guide a zero-shot NER task for a single phrase with maximum accuracy:

*   **Decompose the Phrase:** Break the input phrase into individual words or sub-phrases.

*   **Entity Type Priming:** Before processing, explicitly define the entity types that are relevant to the target dataset. Example: "Recognize PERSON, ORGANIZATION, LOCATION, DATE, TIME, and MISC entities."

*   **Prompt Engineering with Examples (Illustrative):** Construct prompts that include *hypothetical* examples of NER applied to similar (but distinct) phrases. The examp

In [26]:
# Precision, recall and F1 from the accumulated tp / fp / fn counts.
# Each score falls back to 0 when its denominator is not positive.
predicted_total = tp + fp
true_total = tp + fn

if predicted_total > 0:
  precision = tp / predicted_total
else:
  precision = 0

if true_total > 0:
  recall = tp / true_total
else:
  recall = 0

score_sum = precision + recall
if score_sum > 0:
  f1 = (2 * precision * recall) / score_sum
else:
  f1 = 0

print("Results: ")
print("Recall: ", recall)
print("Precision: ", precision)
print("F1: ", f1)

Results: 
Recall:  0.9044585987261147
Precision:  0.9102564102564102
F1:  0.9073482428115015
