In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import csv
import time

# Gemini libraries
from google import genai
from google.genai import types

In [2]:
# To manage environment variables
from google.colab import userdata

**DATASET**

*NER tags*:

- **ORGANIZATION** such as *Georgia-Pacific Corp.*, *WHO
- **PERSON** such as *Eddy Bonte*, *President Obama*.
- **LOCATION** such as *Murray River*, *Mount Everest*.
- **DATE** - such as *June*, *2008-06-29*.
- **TIME** - such as *two fifty a m*, *1:30 p.m.*
- **MONEY** such as *175 million Canadian Dollars*, *GBP 10.40*.
- **PERCENT** such as *twenty pct*, *18.75 %*
- **FACILITY** such as *Washington Monument*, *Stonehenge*.
- **GPE** such as *South East Asia*, *Midlothian*.

In [3]:
# Use your personal account!

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
namefile = '/content/drive/MyDrive/Colab Notebooks/NLP/ner_dataset.csv'

dataset = pd.read_csv(namefile, encoding="utf-8")

print("Dataset head:")
print(dataset.head)
print("\nDataset shape: ", dataset.shape)
print("\nDataset info:")
print(dataset.info())

Dataset head:
<bound method NDFrame.head of                                                     text  \
0      Thousands of demonstrators have marched throug...   
1      Iranian officials say they expect to get acces...   
2      Helicopter gunships Saturday pounded militant ...   
3      They left after a tense hour-long standoff wit...   
4      U.N. relief coordinator Jan Egeland said Sunda...   
...                                                  ...   
47954  Opposition leader Mir Hossein Mousavi has said...   
47955  On Thursday , Iranian state media published a ...   
47956  Following Iran 's disputed June 12 elections ,...   
47957  Since then , authorities have held public tria...   
47958  The United Nations is praising the use of mili...   

                                                  labels  
0      O O O O O O B-geo O O O O O B-geo O O O O O B-...  
1      B-gpe O O O O O O O O O O O O O O B-tim O O O ...  
2      O O B-tim O O O O O B-geo O O O O O B-org O O ...  

In [5]:
# Load the dataset

total_rows = len(pd.read_csv(namefile, encoding="utf-8"))
pbar = tqdm(total=total_rows)

# Create a new np array to store the extracted sentences
dataset_array = []

chunksize = 10000

# Open file
for chunk in pd.read_csv(namefile, sep=',', encoding="utf-8", chunksize=chunksize):

    # Manage NaN values
    chunk = chunk.fillna('')

    for index, row in chunk.iterrows():

      dataset_array.append((row['text'], row['labels']))

    pbar.update(chunksize)

pbar.close()

# Print out some info

print("\n\nShape of the array: ", len(dataset_array))

print("An element from the dataset: ", dataset_array[0])

50000it [00:09, 5169.26it/s]



Shape of the array:  47959
An element from the dataset:  ('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', 'O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O')





In [6]:
# Check the NAR tags in the dataset
unique_labels = set()

for (sentence, label) in dataset_array:
  for tag in label.split(" "):
     unique_labels.add(tag)

print("All unique labels: ", unique_labels)

All unique labels:  {'B-org', 'I-tim', 'I-art', 'O', 'B-gpe', 'B-per', 'I-eve', 'B-art', 'B-nat', 'B-tim', 'I-org', 'I-nat', 'B-geo', 'I-gpe', 'I-per', 'B-eve', 'I-geo'}


In [7]:
# Keep only examples with at least one relevant NER tag

for (sentence, label) in dataset_array:
  num_tags = set()
  for tag in label.split(" "):
     num_tags.add(tag)
  if not len(num_tags) > 1:
    dataset_array.remove((sentence, label))

print("Dataset len: ", len(dataset_array))

Dataset len:  42190


In [8]:
import random
print("Dataset len: ", len(dataset_array))

# Random seed for reproducibility
random.seed(2121346)

dataset_sentences_array = []
dataset_labels_array = []

dataset_sentence_examples = []
dataset_label_examples = []

# Shuffle array with random seed
random.shuffle(dataset_array)

print("An element from the dataset after the shuffle: ", dataset_array[0])

temp_array = dataset_array[:15]
ex_array = dataset_array[16:31]

# Slicing 15

for (x,y) in temp_array:

  dataset_sentences_array.append(x)
  dataset_labels_array.append(y)

for (x,y) in ex_array:

  dataset_sentence_examples.append(x)
  dataset_label_examples.append(y)

print("Len sentence array: ", len(dataset_sentences_array))
print("Len true label array: ", len(dataset_labels_array))

print("Len sentence examples array: ", len(dataset_sentence_examples))
print("Len true label examples array: ", len(dataset_label_examples))

del temp_array, ex_array, dataset_array

Dataset len:  42190
An element from the dataset after the shuffle:  ('Some African heads of state plan to push for a united continent during an African Union summit that begins Sunday in the Ghanaian capital of Accra .', 'O B-gpe O O O O O O O O O O O O B-geo I-geo O O O B-tim O O B-gpe O O B-geo O')
Len sentence array:  15
Len true label array:  15
Len sentence examples array:  15
Len true label examples array:  15


**Gemini** for NER *(Name Entity Recognition)* task

In [9]:
# Keep only the words releated to a NER tag

print("A sentence before the operation: ", dataset_sentences_array[1])
print("A true label before the operation: ", dataset_labels_array[1])

wordpair_sentence_label = []

for i in range(0, len(dataset_labels_array)):
  wordpair_sentence = []
  sentence = dataset_sentences_array[i]
  label = dataset_labels_array[i]
  for tag in range(0, len(label.split(" "))):
    if (label.split(" "))[tag] != 'O':
      wordpair_sentence.append(((sentence.split(" "))[tag], (label.split(" "))[tag]))
  wordpair_sentence_label.append(wordpair_sentence)

print("A sentence after the operation: ", wordpair_sentence_label[1])

print("A sentence before the operation: ", dataset_sentence_examples[1])
print("A true label before the operation: ", dataset_label_examples[1])

wordpair_sentence_label_example = []

for i in range(0, len(dataset_label_examples)):
  wordpair_sentence = []
  sentence = dataset_sentence_examples[i]
  label = dataset_label_examples[i]
  for tag in range(0, len(label.split(" "))):
    if (label.split(" "))[tag] != 'O':
      wordpair_sentence.append(((sentence.split(" "))[tag], (label.split(" "))[tag]))
  wordpair_sentence_label_example.append(wordpair_sentence)

print("A sentence after the operation: ", wordpair_sentence_label_example[1])

A sentence before the operation:  Gunmen in the Niger Delta regularly attack oil company facilities and kidnap their staff .
A true label before the operation:  O O O B-gpe B-org O O O O O O O O O O
A sentence after the operation:  [('Niger', 'B-gpe'), ('Delta', 'B-org')]
A sentence before the operation:  The government is attempting to diversify its industry and trade and has signed an Association Agreement with the EU to expand business there .
A true label before the operation:  O O O O O O O O O O O O O O B-org I-org O O B-org O O O O O
A sentence after the operation:  [('Association', 'B-org'), ('Agreement', 'I-org'), ('EU', 'B-org')]


In [10]:
# Import for regular expression
import re

# Analysis of the performance
"""
This function calculate the value of:
- true positive (tp).
- false positive (fp).
- false negative (fn).

Returns: tp, fp, fn
"""
def metrics (predicted, true):

  tp = 0
  fp = 0
  fn = 0

  # Pattern that finds all the tuples inside the response
  predicted_tuple = re.findall(r'["\'](.*?)["\']\s*,\s*["\'](.*?)["\']', predicted)

  # Check if there are composed tuples and scompose them if present -> ex. ["New Year", "B-eve"] to ["New", "B-eve"], ["Year", "I-eve"]
  for (x,y) in predicted_tuple:
    if len(x.split(" ")) > 1:
      predicted_tuple.remove((x,y))
      # Get the tag (last part)
      y = y.split("-")[1]
      first_word = True
      for word in x.split(" "):
        # First B-y, other I-y
        if first_word:
          first_word = False
          predicted_tuple.append((word, "B-" + y))
        else:
          predicted_tuple.append((word, "I-" + y))

  print("Predicted vector after normalization: ", predicted_tuple)

  # Calculate the true positive (tp) and the false positive (fp)
  temp_true = true.copy()
  for tup in predicted_tuple:
    if tup in temp_true:
      tp += 1
      temp_true.remove(tup)
    else:
      fp += 1

  # Calculate the false negative (fn)
  for tup in true:
    if tup not in predicted_tuple:
      fn += 1

  return tp, fp, fn

In [11]:
# Get the API key from the environment of Google Colab (aka Secrets)

GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
client = genai.Client(api_key=GOOGLE_API_KEY)

In [12]:
tp, fp, fn = 0, 0, 0

for i in range (0, 15):

  #TODO:
  # Prompt for the NER task
  prompt = """Do the Name Entity Recognition (also known as NER) task to a following phrase I will give you.
  The only entity labels that could appear in the phrase are:
    - Time (tim).
    - Location (geo).
    - Geo-Political Entity (gpe).
    - Work of Art (art).
    - Event (eve).
    - Organization (org).
    - Person (per).
    - Nationality (nat).
  You have to use prefixes B (for begin) or I (for inside) together with a tag in the list ahead (for example use "B-per" and not just "per").

  First you split the phrase using the character white space, then you associate a tag to every word, and then you keep only the words with a
  relevant tag (the ones I've listed before). After that you report the result using tuples of word-tag like: ("Albert", "per").
  The result must be in a list of tuples.

  Answer with less words as possible.

  Your phrase is: """ + dataset_sentences_array[i] + """

  Result:
  """

  print(prompt)

  response = client.models.generate_content(
      model="gemini-2.0-flash",
      contents=prompt,
  )

  # response = response.text
  print(response.text)


  tem_tp, tem_fp, tem_fn = metrics(response.text, wordpair_sentence_label[i])

  print("\nTure vector [" + str(wordpair_sentence_label[i]) + "]")
  print("tp: ", tem_tp)
  print("fp: ", tem_fp)
  print("fn: ", tem_fn)

  tp += tem_tp
  fp += tem_fp
  fn += tem_fn

  time.sleep(6)

  #break

Do the Name Entity Recognition (also known as NER) task to a following phrase I will give you.
  The only entity labels that could appear in the phrase are:
    - Time (tim).
    - Location (geo).
    - Geo-Political Entity (gpe).
    - Work of Art (art).
    - Event (eve).
    - Organization (org).
    - Person (per).
    - Nationality (nat).
  You have to use prefixes B (for begin) or I (for inside) together with a tag in the list ahead (for example use "B-per" and not just "per").

  First you split the phrase using the character white space, then you associate a tag to every word, and then you keep only the words with a
  relevant tag (the ones I've listed before). After that you report the result using tuples of word-tag like: ("Albert", "per").
  The result must be in a list of tuples.

  Answer with less words as possible.

  Your phrase is: Some African heads of state plan to push for a united continent during an African Union summit that begins Sunday in the Ghanaian capital o

In [13]:
# Analysis of the response using the true label
precision = tp / (tp + fp) if tp + fp > 0 else 0
recall = tp / (tp + fn) if tp + fn > 0 else 0
f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

print("Results: ")
print("Recall: ", recall)
print("Precision: ", precision)
print("F1: ", f1)

Results: 
Recall:  0.4888888888888889
Precision:  0.43137254901960786
F1:  0.45833333333333326


In [14]:
  # prompt_bozza = """You should perform the Name Entity Recognition (also known as NER) task to a following phrase I will give you. You should
  # find all the instance belongings to the following classes using the following tags:

  # - tim: refers to specific times within a day or durations (example: "5 PM," "midnight," "two hours").
  # - geo: refers to geographic places (example: "London", "Paris", "California").
  # - gpe: refers to geographical regions that are also political entities (example: "United States," "Germany," "China").
  # - art: refers to art shuch as painting, sculture, opera, songs (example: "Monna Lisa", "David of Michelangelo", "L'Aida of Verdi").
  # - eve: refers to named occurrences such as wars, sports events, disasters (example: "II World War", "Sanremo's Festival", "Troy War").
  # - org: refers to names of companies, institutions, agencies, or other groups of people (example: "Google," "United Nations," "Harvard University").
  # - per: refers to names of people or fictional characters (example: "Albert Einstein," "Marie Curie," "Sherlock Holmes").
  # - nat: refers to names of ethnic groups or national adjectives (example: "Italian", "American", "Pakistani").

  # Also use the prefixes B (for begin) and I (for inside) together with a tag in the list ahead (for example use "B-per" and not just "per").

  # First you split the phrase using the character white space, then you associate a tag to every word, and then you keep only the words with a
  # relevant tag (see the list ahead). After that you report the result using tuples of word-tag.

  # Before we get started I give you an example of what you should do:
  # - First you receive an input string: """ + dataset_sentence_examples[i] + """
  # - Then you split words using the white space and you get: """ + str(dataset_sentence_examples[i].split(" ")) + """
  # - Then you associate a tag to every word: """ + str(dataset_label_examples[i].split(" ")) + """
  # - Then you ignore all the tags that are not relevant ("O" tag): """ + str([x for x in dataset_label_examples[i].split(" ") if x != "O"]) + """
  # - Finally you present the result using tuples: [""" + str(wordpair_sentence_label_example[i]) + """]

  # Answer with less words as possible.

  # Your phrase is: """ + dataset_sentences_array[i] + """

  # Result:
  # """


  #   prompt = """You should perform the Name Entity Recognition (also known as NER) task to a following phrase I will give you.
  # The entity labels that could appear in the phrase are:

  # - tim.
  # - geo.
  # - gpe.
  # - art.
  # - eve.
  # - org.
  # - per.
  # - nat.

  # You have to use prefixes B (for begin) or I (for inside) together with a tag in the list ahead (for example use "B-per" and not just "per").

  # First you split the phrase using the character white space, then you associate a tag to every word, and then you keep only the words with a
  # relevant tag (the ones I've listed before). After that you report the result using tuples of word-tag like: ("Albert", "per").

  # Answer with less words as possible.

  # Your phrase is: """ + dataset_sentences_array[i] + """

  # Result:
  # """