In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import csv

# Gemini libraries
from google import genai
from google.genai import types

In [2]:
# To manage environment variables
from google.colab import userdata

**DATASET**

*NER tags*:

- **ORGANIZATION** such as *Georgia-Pacific Corp.*, *WHO
- **PERSON** such as *Eddy Bonte*, *President Obama*.
- **LOCATION** such as *Murray River*, *Mount Everest*.
- **DATE** - such as *June*, *2008-06-29*.
- **TIME** - such as *two fifty a m*, *1:30 p.m.*
- **MONEY** such as *175 million Canadian Dollars*, *GBP 10.40*.
- **PERCENT** such as *twenty pct*, *18.75 %*
- **FACILITY** such as *Washington Monument*, *Stonehenge*.
- **GPE** such as *South East Asia*, *Midlothian*.

In [3]:
# Use your personal account!

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
namefile = '/content/drive/MyDrive/Colab Notebooks/NLP/ner_dataset.csv'

dataset = pd.read_csv(namefile, encoding="utf-8")

print("Dataset head:")
print(dataset.head)
print("\nDataset shape: ", dataset.shape)
print("\nDataset info:")
print(dataset.info())

Dataset head:
<bound method NDFrame.head of                                                     text  \
0      Thousands of demonstrators have marched throug...   
1      Iranian officials say they expect to get acces...   
2      Helicopter gunships Saturday pounded militant ...   
3      They left after a tense hour-long standoff wit...   
4      U.N. relief coordinator Jan Egeland said Sunda...   
...                                                  ...   
47954  Opposition leader Mir Hossein Mousavi has said...   
47955  On Thursday , Iranian state media published a ...   
47956  Following Iran 's disputed June 12 elections ,...   
47957  Since then , authorities have held public tria...   
47958  The United Nations is praising the use of mili...   

                                                  labels  
0      O O O O O O B-geo O O O O O B-geo O O O O O B-...  
1      B-gpe O O O O O O O O O O O O O O B-tim O O O ...  
2      O O B-tim O O O O O B-geo O O O O O B-org O O ...  

In [5]:
# Load the dataset

total_rows = len(pd.read_csv(namefile, encoding="utf-8"))
pbar = tqdm(total=total_rows)

# Create a new np array to store the extracted sentences
dataset_array = []

chunksize = 10000

# Open file
for chunk in pd.read_csv(namefile, sep=',', encoding="utf-8", chunksize=chunksize):
    # Manage NaN values
    # chunk = chunk.fillna('')

    for index, row in chunk.iterrows():

        # Save string and true label
        dataset_array.append((row['text'], row['labels']))

    pbar.update(chunksize)

pbar.close()

# Print out some info

print("\nShape of the array: ", len(dataset_array))

print("An element from the dataset: ", dataset_array[0])

50000it [00:07, 6675.74it/s]


Shape of the array:  47959
An element from the dataset:  ('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', 'O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O')





In [6]:
import random

# Random seed for reproducibility (seed = 77)
random.seed(77)

dataset_sentences_array = []
dataset_labels_array = []

dataset_sentence_examples = []
dataset_label_examples = []

# Shuffle array with random seed
random.shuffle(dataset_array)

print("An element from the dataset after the shuffle: ", dataset_array[0])

temp_array = dataset_array[:15]
ex_array = dataset_array[16:31]

# Slicing 15

for (x,y) in temp_array:

  dataset_sentences_array.append(x)
  dataset_labels_array.append(y)

for (x,y) in ex_array:

  dataset_sentence_examples.append(x)
  dataset_label_examples.append(y)

print("Len sentence array: ", len(dataset_sentences_array))
print("Len true label array: ", len(dataset_labels_array))

print("Len sentence examples array: ", len(dataset_sentence_examples))
print("Len true label examples array: ", len(dataset_label_examples))

del temp_array, ex_array, dataset_array

An element from the dataset after the shuffle:  ('Heilongjiang has the largest cultivated land area in China along with heavy industry and timber resources .', 'B-geo O O O O O O O B-geo O O O O O O O O')
Len sentence array:  15
Len true label array:  15
Len sentence examples array:  15
Len true label examples array:  15


**Gemini** for NER *(Name Entity Recognition)* task

In [7]:
# Keep only the words releated to a NER tag

print("A sentence before the operation: ", dataset_sentences_array[1])
print("A true label before the operation: ", dataset_labels_array[1])

wordpair_sentence_label = []

for i in range(0, len(dataset_labels_array)):
  wordpair_sentence = []
  sentence = dataset_sentences_array[i]
  label = dataset_labels_array[i]
  for tag in range(0, len(label.split(" "))):
    if (label.split(" "))[tag] != 'O':
      wordpair_sentence.append(((sentence.split(" "))[tag], (label.split(" "))[tag]))
  wordpair_sentence_label.append(wordpair_sentence)

print("A sentence after the operation: ", wordpair_sentence_label[1])

print("A sentence before the operation: ", dataset_sentence_examples[1])
print("A true label before the operation: ", dataset_label_examples[1])

wordpair_sentence_label_example = []

for i in range(0, len(dataset_label_examples)):
  wordpair_sentence = []
  sentence = dataset_sentence_examples[i]
  label = dataset_label_examples[i]
  for tag in range(0, len(label.split(" "))):
    if (label.split(" "))[tag] != 'O':
      wordpair_sentence.append(((sentence.split(" "))[tag], (label.split(" "))[tag]))
  wordpair_sentence_label_example.append(wordpair_sentence)

print("A sentence after the operation: ", wordpair_sentence_label_example[1])

A sentence before the operation:  United Nations Secretary-General Kofi Annan has urged Mr. Banny 's government to immediately begin implementing a transition plan designed to lead to new elections by next October .
A true label before the operation:  B-org I-org B-per I-per I-per O O B-per I-per O O O O O O O O O O O O O O O O O B-tim O
A sentence after the operation:  [('United', 'B-org'), ('Nations', 'I-org'), ('Secretary-General', 'B-per'), ('Kofi', 'I-per'), ('Annan', 'I-per'), ('Mr.', 'B-per'), ('Banny', 'I-per'), ('October', 'B-tim')]
A sentence before the operation:  A winter storm in the northeastern United States has shut down airline travel and cut power to thousands of people .
A true label before the operation:  O O O O O O B-geo I-geo O O O O O O O O O O O O O
A sentence after the operation:  [('United', 'B-geo'), ('States', 'I-geo')]


In [8]:
# Get the API key from the environment of Google Colab (aka Secrets)

GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
client = genai.Client(api_key=GOOGLE_API_KEY)

In [10]:
response = ""
for i in range (0, 15):

  #TODO:
  # Prompt for the NER task
  prompt = """You should perform the Name Entity Recognition (also known as NER)
  to the following phrase I will give to you. You should
  find all the instance belongings to the following classes:
  - ORGANIZATION such as Georgia-Pacific Corp.
  - PERSON such as Eddy Bonte, President Obama.
  - LOCATION such as Murray River, Mount Everest.
  - DATE such as June, 2008-06-29.
  - TIME such as two fifty a m, 1:30 p.m.
  - MONEY such as 175 million Canadian Dollars, GBP 10.40.
  - PERCENT such as twenty pct, 18.75 %
  - FACILITY such as Washington Monument, Stonehenge.
  - GPE such as South East Asia, Midlothian.

  No other tags shold be find. The result should be presented as a vector where
  O is a word with no NER tag, and the others with its proper tag.
  Each word should have its associated tag

  Before we get started I give you an example:
  - Input string: """ + dataset_sentence_examples[i] + """
  - The output should be: [""" + str(wordpair_sentence_label_example[i]) + """]

  Your phrase is: """ + dataset_sentences_array[i]

  print(prompt)

  print("\n\n\n\n")

  response = client.models.generate_content(
      model="gemini-2.0-flash",
      contents=prompt,
  )

  # response = response.text
  print(response.text)

  print("Ture vector [" + str(wordpair_sentence_label[i]) + "]")

  break

  # Analysis of the response using the true label

You should perform the Name Entity Recognition (also known as NER)
  to the following phrase I will give to you. You should
  find all the instance belongings to the following classes:
  - ORGANIZATION such as Georgia-Pacific Corp.
  - PERSON such as Eddy Bonte, President Obama.
  - LOCATION such as Murray River, Mount Everest.
  - DATE such as June, 2008-06-29.
  - TIME such as two fifty a m, 1:30 p.m.
  - MONEY such as 175 million Canadian Dollars, GBP 10.40.
  - PERCENT such as twenty pct, 18.75 %
  - FACILITY such as Washington Monument, Stonehenge.
  - GPE such as South East Asia, Midlothian.

  No other tags shold be find. The result should be presented as a vector where 
  O is a word with no NER tag, and the others with its proper tag.
  Each word should have its associated tag

  Before we get started I give you an example:
  - Input string: Turkey , the United States and other nations have designated the PKK a terrorist group .
  - The output should be: [[('Turkey', 'B-org'),