# Token classification with OpenAI GPT models

In [1]:
from typing import List

from datasets import load_dataset
import pandas as pd
import numpy as np

from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

import os
from tqdm import tqdm
import json

In [2]:
# Load the "GEODE/GeoEDdA" dataset with `datasets.load_dataset`. The result is
# indexed by split name below (the 'train', 'validation' and 'test' splits are used).
dataset = load_dataset("GEODE/GeoEDdA")

In [3]:
# Flatten every split of `dataset` into a single DataFrame. The 'dataset'
# column records which split each row came from, so the splits can be
# separated again later.
dfs = [
    pd.DataFrame({
        'dataset': split_name,
        'text': split['text'],
        'meta': split['meta'],
        'tokens': split['tokens'],
        'spans': split['spans'],
    })
    for split_name, split in dataset.items()
]
df = pd.concat(dfs, ignore_index=True)

In [4]:
def filter_ner_io(sentence, tagset):
    """Project a sentence's span annotations onto one IO tag per token.

    Args:
        sentence: Mapping (dict or DataFrame row) with a 'tokens' sequence and
            a 'spans' sequence. Each span provides 'label', 'token_start' and
            'token_end'; 'token_end' is treated as inclusive.
        tagset: Collection of labels to keep; spans with any other label are
            ignored.

    Returns:
        A list with one entry per token: 'O' when no kept span covers the
        token, otherwise the label of the covering span.

    Overlap rules:
        * The first kept span to reach a token wins, except that a 'Latlong'
          span always overwrites whatever label is already there.
        * A non-'Latlong' span stops labelling as soon as it runs into a token
          already tagged 'Latlong'.
    """
    result = ['O'] * len(sentence['tokens'])
    for span in sentence['spans']:
        label = span['label']
        if label not in tagset:
            continue
        for i in range(span['token_start'], span['token_end'] + 1):
            if result[i] == 'O' or label == 'Latlong':
                result[i] = label
            elif result[i] == 'Latlong':
                # The original compared against the list ['Latlong'], which is
                # never equal to a string, so this branch could not be reached.
                break
    return result

In [5]:
# Labels kept when projecting spans onto tokens; spans with any other label
# are ignored by filter_ner_io.
tagset = [
    'Domain-mark',
    'Head',
    'NC-Person',
    'NC-Spatial',
    'NP-Misc',
    'NP-Person',
    'NP-Spatial',
    'Relation',
    'Latlong',
]

# One IO tag per token for every sentence (row) of the corpus.
df['ner_io'] = df.apply(filter_ner_io, axis=1, args=(tagset,))

# Separate the splits again, each with a fresh 0-based index.
df_train = df.loc[df['dataset'] == 'train'].reset_index(drop=True)
df_val = df.loc[df['dataset'] == 'validation'].reset_index(drop=True)
df_test = df.loc[df['dataset'] == 'test'].reset_index(drop=True)

In [6]:
# Number of distinct labels (including 'O') used in each training sentence.
# Each entry of 'ner_io' is a flat list of tag strings, so the tags are put
# into a set directly; flattening them first would iterate over the characters
# of each tag and count distinct letters instead of distinct labels.
examples_set_train = [len(set(tags)) for tags in df_train['ner_io']]

# Positions of the 20 training sentences with the most distinct labels,
# ordered from most to least diverse.
idx = np.flip(np.argsort(examples_set_train))[:20]

In [7]:
# Data structure
# Entity: one token of the answer together with the label assigned to it.
# Documentation is kept in `#` comments rather than a class docstring so that
# the model itself (and whatever schema is derived from it) is left untouched.
class Entity(BaseModel):
    # Text of the token.
    text: str = Field(description="Text of the token containing the entity such as 'ville'")
    # Label of the token; the description enumerates the allowed values
    # (the nine entity labels plus 'O').
    label: str = Field(description="Label of Entity contained in the text and are exclusively: ['Domain-mark', 'Head', 'NC-Person', 'NC-Spatial', 'NP-Misc', 'NP-Person', 'NP-Spatial', 'Relation','Latlong','O'] ")

# Entities: top-level answer object, a list of Entity items under the
# 'entities' key. It is handed to JsonOutputParser as `pydantic_object` in
# run_gpt. Documented with `#` comments rather than a docstring so the model
# definition itself stays unchanged.
class Entities(BaseModel):
    # The labelled tokens of the provided context.
    entities: List[Entity] = Field(description="The token contained in the provided context")

In [8]:
# Few-shot example appended to the directives. It is built from the training
# sentence at position idx[1] of the ranking computed above.
example_row = df_train.iloc[idx[1]]

# INPUT side: every token rendered as ('text' ,position), space-separated.
example_input = ' '.join(
    "('" + token['text'] + "' ," + str(position) + ")"
    for position, token in enumerate(example_row['tokens'])
)

# OUTPUT side: one {'label', 'text'} dict per token, rendered with the list's
# default string form.
example_output = [
    {'label': tag, 'text': token['text']}
    for tag, token in zip(example_row['ner_io'], example_row['tokens'])
]

examples = (
    "Here are some examples:\n"
    "EXAMPLE 1:\n"
    f"    INPUT:{example_input}\n"
    f"    OUTPUT:{example_output}\n"
    "---\n"
)

# Task instructions sent to the model as the `query` of every request: the
# closed list of labels, a description with sample values for each label, and
# the few-shot examples. 'Latlong' is included in the "exclusively" list so
# that it agrees with item 9 below and with `tagset`.
directives=f'''
You are an expert in Natural Language Processing. Your task is to identify common Named Entities (NER) in a given text.
The possible common Named Entities (NER) types are exclusively: (Domain-mark, Head, NC-Person, NC-Spatial, NP-Misc, NP-Person, NP-Spatial, Relation, Latlong, O) and can be described as :
1.Domain-mark: words indicating the knowledge domain (usually after the head and between parenthesis) such as 'Géog., Géog. mod., Géog. anc., Géogr., Géogr. mod., Marine., Hist. nat., Gram., Géogr. anc., Jurisprud., Géog. anc. & mod., Gramm., Geog.'
2.Head: entry name at the beginning of the sentence and is almost always in uppercase such as 'Aire, Afrique, Aigle, ILLESCAS, MULHAUSEN, ADDA, SINTRA ou CINTRA, ACHSTEDE, ou AKSTEDE, KEITH, CAÇERES, CARMAGNOLE, AGRIGNON, INSPRUCK'
3.NC-Person: a common noun that identifies a person such as 'M., roi, S., peuples, l'empereur, son fils, les habitans, prince, peuple, le roi, fils, le P., habitans'
4.NC-Spatial: a common noun that identifies a spatial entity including natural features such as 'ville, petite ville, la riviere, la mer, royaume, la province, capitale, la ville, l'île, cette ville, pays, la côte, riviere'
5.NP-Misc: a proper noun identifying entities not classified as spatial or person such as 'l'Eglise, grec, 1707, russien, Glaciale, Noire, romain, la Croix, Russien, Parlement, 1693, Sud, 1614'
6.NP-Person: a proper noun identifying the name of a person (person named entities) such as 'Ptolomée, Pline, Strabon, Euripide, les Romains, Pierre, Romains, les Anglois, Turcs, Dieu, César, Antonin, les Espagnols'
7.NP-Spatial: a proper noun identifying the name of a place (spatial named entities) such as 'France, Allemagne, Italie, Espagne, Afrique, Asie, Paris, Naples, Angleterre, Rome, Russie, la Chine, l'Amérique méridionale'
8.Relation: spatial relation such as 'dans, sur, au, en, entre, près de, se jette dans, proche, par, vers, près du, jusqu'à, à l'orient'.
9.Latlong: geographic coordinates such as 'Long. 31. 58. lat. 40. 55, Long. 10. 27. lat. 43. 30, Long. 28. 14. lat. 51. 13, Long. 14. 46. lat. 56. 20, Long. 12. 8. lat. 39. 15, Long. 25. 20. lat. 44. 43, Lat. 19. 40, Long. selon Harris, 29. 16. 15. lat. 47. 15, Long. 14. 28. lat : 53. 50, Long. 57. lat. 38. 35, Long. 22. 52. lat. 43. 32, Long. 11. 18. lat. 40. 41, Long. 27. 40. lat. 51. 50'.
10.O: no entities are for this token
{examples}
'''



In [9]:
def run_gpt(model, input, directives):
    """Ask `model` to label every token of one sentence and return the parsed answer.

    Args:
        model: Chat model placed between the prompt and the JSON parser in
            the chain.
        input: Sequence of tokens for one sentence; each token is a mapping
            with a 'text' key.
        directives: Instruction text, sent as the `query` part of the prompt.

    Returns:
        The parser's output for the model's answer. Given the `Entities`
        schema this is expected to be a dict with an 'entities' list, but the
        parser does not enforce that here.
    """
    # Tokens rendered as ('text' ,position), space-separated.
    context = ' '.join(
        "('" + token['text'] + "' ," + str(position) + ")"
        for position, token in enumerate(input)
    )
    # The context is supplied as a template variable instead of being spliced
    # into the template source: token text containing '{' or '}' would
    # otherwise be parsed as template placeholders and break formatting.
    # The rendered prompt is unchanged for brace-free text.
    template = (
        "context:{context}\n"
        "    query:{query}\n"
        "    format_instructions:{format_instructions}\n"
        "    "
    )
    # Set up a parser + inject instructions into the prompt template.
    parser = JsonOutputParser(pydantic_object=Entities)
    prompt = PromptTemplate(
        template=template,
        input_variables=["context", "query"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    chain = prompt | model | parser
    return chain.invoke({"context": context, "query": directives})

In [10]:
# Experiment configuration.
# `version` is a short tag used to name the output directory; `model_name` is
# the model identifier passed to ChatOpenAI. Keep the two in sync when
# switching models (alternatives are listed in the trailing comments).
version = 'o1-mini' # 'gpt3.5', 'gpt4', 'gpt4o', 'o1-mini'
model_name = 'o1-mini-2024-09-12' #'gpt-3.5-turbo', 'gpt-4', 'gpt-4o-mini-2024-07-18', 'gpt-4o', 'o1-preview', 'o1-mini-2024-09-12'

# NOTE(review): temperature is fixed at 1 — presumably required by the o1
# models; confirm before changing it for other models.
model = ChatOpenAI(temperature=1, model=model_name)
# Number of complete passes over the test set; each pass is saved separately.
nb_iterations = 4

In [None]:
# Test the model with the first entry of the test set and check the output
#run_gpt(model, df_test.iloc[0]['tokens'], directives)

In [11]:
# Directory that receives the prediction files for the selected model version.
output_path = os.path.join('predictions', 'token_classification_' + version)

# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between checking for the directory and creating it.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Label the whole test set `nb_iterations` times. Each pass goes to its own
# run_<k>.json file, which is rewritten after every sentence so that partial
# results survive an interruption.
index = range(len(df_test.index))
for i in range(nb_iterations):
    run_file = os.path.join(output_path, "run_" + str(i + 1) + ".json")
    pred_sentences = []
    for j in tqdm(index):
        try:
            output = run_gpt(model, df_test.iloc[j]['tokens'], directives)
        except Exception as e:
            # Best effort: report the failure and keep an empty placeholder so
            # predictions stay aligned with the test sentences.
            print(f"Error for index {j}: {e}")
            pred_sentences.append({'entities':[]})
        else:
            pred_sentences.append(output)

        with open(run_file, "w") as file:
            json.dump(pred_sentences, file)