# Evaluation of Token Classification with OpenAI GPT Models


## Import Python packages

In [None]:
from os import listdir
from os.path import isfile, join
import json

import pandas as pd
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

from datasets import load_dataset

## Import dataset

In this experiment, we use the GeoEDdA dataset, which contains semantic annotations (at the token and span levels) for named entities (i.e., Spatial, Person, and Misc), nominal entities, spatial relations, and geographic coordinates. Nested named entities, which are also present in this dataset, were not considered in this experiment.

The dataset is available on the Hugging Face Hub: https://huggingface.co/datasets/GEODE/GeoEDdA

* Load the GeoEDdA dataset from the HuggingFace Hub:

In [None]:
# Fetch the GeoEDdA corpus through the `datasets` library.
dataset = load_dataset(path="GEODE/GeoEDdA")

# Only the test split is evaluated; keep it as a DataFrame for row-wise work.
test_set = pd.DataFrame(data=dataset['test'])
test_set.head()

In [None]:
# Labels kept for the evaluation; spans with any other label are ignored.
tagset = [
    'Domain-mark',
    'Head',
    'NC-Person',
    'NC-Spatial',
    'NP-Misc',
    'NP-Person',
    'NP-Spatial',
    'Relation',
    'Latlong',
]

def filter_annotations(doc, tagset):
    """Build one tag per token from the span annotations of a document.

    Every token starts as ``'O'``. Spans are visited in the order in which
    they appear in ``doc['spans']``; spans whose label is not in ``tagset``
    are ignored. ``token_end`` is treated as inclusive.

    Overlap rules:
      * a ``'Latlong'`` span overwrites whatever tag a token already has;
      * any other span only tags tokens that are still ``'O'``;
      * when a non-Latlong span reaches a token already tagged ``'Latlong'``,
        the remainder of that span is dropped.

    Args:
        doc: mapping with a ``'tokens'`` sequence and a ``'spans'`` sequence
            of mappings holding ``'label'``, ``'token_start'`` and
            ``'token_end'``.
        tagset: collection of labels to keep.

    Returns:
        A list of tags with the same length as ``doc['tokens']``.
    """
    result = ['O'] * len(doc['tokens'])
    for span in doc['spans']:
        label = span['label']
        if label not in tagset:
            continue
        for i in range(span['token_start'], span['token_end'] + 1):
            if result[i] == 'O' or label == 'Latlong':
                result[i] = label
            elif result[i] == 'Latlong':
                # The tag is a string; comparing it to the list ['Latlong']
                # (as before) was never true and left this branch dead.
                break
    return result


* Add a new column with the list of tags (one tag per token):

In [None]:
# Compute the gold tag sequence of every row and store it in a 'tags' column.
test_set['tags'] = test_set.apply(filter_annotations, axis=1, args=(tagset,))
test_set.head()

## Evaluation

Several iterations of the token classification task have been performed. The predictions from all the iterations can be loaded and evaluated.

In [None]:
def load_predictions(path):
    """Load every prediction file found directly inside ``path``.

    Directory entries are visited in sorted name order; entries that are not
    regular files are skipped. Each file is read as UTF-8 and parsed as JSON.

    Args:
        path: directory holding one file per iteration.

    Returns:
        A list with the parsed content of each file (one item per iteration).
    """
    predictions = []
    for name in sorted(listdir(path)):
        file_path = join(path, name)
        if not isfile(file_path):
            continue
        with open(file_path, encoding='utf-8') as handle:
            predictions.append(json.load(handle))
    return predictions


def formatting_ner(pred_sentence, true_sentence):
    """Align the predicted entity labels of a sentence with its gold tokens.

    Gold tokens and predicted entities are walked in parallel, left to right.
    For each gold token one tag is produced:
      * predictions exhausted -> ``'O'``;
      * current entity has no ``'text'`` key -> ``'O'``, entity consumed;
      * current entity text equals the token text -> the entity's label
        (``'O'`` when it has no ``'label'`` key), entity consumed;
      * otherwise -> ``'O'`` and the same entity is tried on the next token.

    Args:
        pred_sentence: mapping with an ``'entities'`` list of mappings that
            may hold ``'text'`` and ``'label'``.
        true_sentence: mapping with a ``'tokens'`` list of mappings holding
            ``'text'``.

    Returns:
        A list of tags with exactly one item per gold token.
    """
    entities = pred_sentence['entities']
    formatted_pred_sentence = []
    i = 0  # index of the next predicted entity to align
    for token in true_sentence['tokens']:
        if i >= len(entities):
            formatted_pred_sentence.append('O')
            continue
        entity = entities[i]
        if 'text' not in entity:
            formatted_pred_sentence.append('O')
            i += 1
        elif entity['text'] == token['text']:
            # Consume the entity even when it carries no label; otherwise
            # the pointer stalls on it and every later prediction of the
            # sentence is discarded as 'O'.
            formatted_pred_sentence.append(entity.get('label', 'O'))
            i += 1
        else:
            formatted_pred_sentence.append('O')
    return formatted_pred_sentence


def format_sentences(pred, true):
    """Turn the predictions of one iteration into per-token tag lists.

    Predicted sentences and the rows of ``true`` are paired positionally
    with ``zip``, so surplus items on either side are ignored.

    Args:
        pred: iterable of predicted sentences (see ``formatting_ner``).
        true: DataFrame whose rows are the gold sentences.

    Returns:
        A list with one tag list per paired sentence.
    """
    return [
        formatting_ner(pred_sentence, row)
        for pred_sentence, (_, row) in zip(pred, true.iterrows())
    ]


def evaluate(predictions, test_set):
    """Compute one classification report per prediction iteration.

    The report labels are the module-level ``tagset`` without ``'O'``.

    Args:
        predictions: list of iterations, each an iterable of predicted
            sentences accepted by ``format_sentences``.
        test_set: DataFrame with the gold sentences and a ``'tags'`` column
            holding one tag list per sentence.

    Returns:
        A list of report dictionaries (``classification_report`` called with
        ``output_dict=True``), in the order of ``predictions``.
    """
    # The gold tags and the label list are the same for every iteration:
    # build them once instead of once per loop pass.
    trues_flat = [tag for sentence_tags in test_set['tags'] for tag in sentence_tags]
    labels = [tag for tag in tagset if tag != 'O']

    classification_reports = []
    for iteration_predictions in predictions:
        formatted_sentences = format_sentences(iteration_predictions, test_set)
        pred_sentences_flat = [tag for sentence_tags in formatted_sentences for tag in sentence_tags]
        classification_reports.append(
            classification_report(
                trues_flat,
                pred_sentences_flat,
                output_dict=True,
                digits=3,
                labels=labels,
            )
        )
    return classification_reports


def get_avg_scores(classification_reports, score='f1-score'):
    """Average one metric over several classification reports.

    Args:
        classification_reports: iterable of report dictionaries of the form
            ``{entry: {metric name: value}}``.
        score: name of the metric to average.

    Returns:
        A dictionary ``{entry: mean value}``; each mean is taken over the
        reports that contain the entry.
    """
    collected = {}
    for report in classification_reports:
        for tag, tag_metrics in report.items():
            collected.setdefault(tag, []).append(tag_metrics[score])

    return {tag: sum(values) / len(values) for tag, values in collected.items()}


def bar_plot(scores, tagset, score='f1-score'):
    """Draw a grouped bar chart comparing per-tag scores of several series.

    Args:
        scores: mapping ``{series name: {tag: value}}``; one bar color per
            series, in insertion order.
        tagset: tags shown on the x axis, in order; ``'O'`` is skipped.
        score: name of the plotted metric, used as the y-axis label.

    Raises:
        ValueError: if ``scores`` is empty or ``tagset`` has no tag to plot.
        KeyError: if a series has no value for one of the plotted tags.
    """
    tags = [tag for tag in tagset if tag != 'O']
    if not scores or not tags:
        raise ValueError('bar_plot needs at least one series and one tag to plot')

    # Collect all heights first so that a missing tag fails before any figure
    # is created.
    bars = [[data[tag] for tag in tags] for data in scores.values()]
    n_series = len(bars)

    # Colorblind-friendly palette, reused cyclically beyond five series.
    colors = ['#0072B2', '#D55E00', '#CC79A7', '#E69F00', '#56B4E9']

    # Defaults: bars 0.25 wide with a 0.02 gap. Tags are 1.0 apart, so bars
    # and gaps are shrunk together when a group would be wider than 0.85 and
    # run into the next tag.
    default_width, default_gap = 0.25, 0.02
    scale = min(1.0, 0.85 / (n_series * (default_width + default_gap)))
    bar_width = default_width * scale
    step = (default_width + default_gap) * scale

    fig, ax = plt.subplots(figsize=(10, 5))
    for i, (name, heights) in enumerate(zip(scores.keys(), bars)):
        positions = [x + i * step for x in range(len(tags))]
        ax.bar(positions, heights, color=colors[i % len(colors)], width=bar_width, label=f'{name}')

    # Put each tick under the middle of its group, labelled with the plotted
    # tags only so that ticks and labels always have the same length.
    group_center = step * (n_series - 1) / 2
    ax.set_xticks([x + group_center for x in range(len(tags))])
    ax.set_xticklabels(tags, rotation=30)

    # Light styling: no tick marks, only a pale bottom spine and horizontal grid.
    ax.tick_params(bottom=False, left=False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_color('#DDDDDD')
    ax.set_axisbelow(True)
    ax.yaxis.grid(True, color='#EEEEEE')
    ax.xaxis.grid(False)
    ax.set_xlabel('Token classes', labelpad=15, color='#333333')
    ax.set_ylabel(score, labelpad=15, color='#333333')
    plt.legend(loc=(1.04, 0.7))
    plt.show()

In [None]:
# Model identifiers; each one is the suffix of a predictions sub-directory.
models = [
    'gpt3.5',
    'gpt4',
    'gpt4o',
]
# Metric names read from the classification reports.
metrics = [
    'precision',
    'recall',
    'f1-score',
]

* Get the classification reports for GPT-3.5:

In [None]:
# Directory holding the prediction files of the first model in `models`.
path = join('predictions','token_classification_' + models[0])

# One classification report per prediction file (iteration).
predictions = load_predictions(path)
classification_reports = evaluate(predictions, test_set)

# Display the report of the first iteration as the cell output.
classification_reports[0]

* Display average f1-scores for each tag:

In [None]:
# Mean of metrics[2] ('f1-score') per report entry, over all iterations.
get_avg_scores(classification_reports, score=metrics[2])

* Get micro average scores for each model:

In [None]:
# Print one tab-separated line per model with its micro-averaged metrics,
# each averaged over all iterations.
for model in models:
    path = join('predictions','token_classification_' + model)
    predictions = load_predictions(path)
    classification_reports = evaluate(predictions, test_set)

    print(model, end=':\t')
    for metric in metrics:
        scores = get_avg_scores(classification_reports, metric)
        print(metric, scores['micro avg'], sep=': ', end='\t')
    print()

* Plot average scores for each GPT model:

In [None]:
# Evaluate each model once: the reports do not depend on the plotted metric,
# so reloading and re-scoring the predictions for every metric is wasted work.
reports_by_model = {}
for model in models:
    path = join('predictions','token_classification_' + model)
    predictions = load_predictions(path)
    reports_by_model[model] = evaluate(predictions, test_set)

# One grouped bar chart per metric, comparing all models.
for metric in metrics:
    eval_scores = {
        name: get_avg_scores(reports, metric)
        for name, reports in reports_by_model.items()
    }
    bar_plot(eval_scores, tagset, metric)


* Comparison with the scores of a [fine-tuned BERT](https://huggingface.co/GEODE/bert-base-french-cased-edda-ner) model:

In [None]:
# Hard-coded per-tag F1 scores of the fine-tuned BERT model used as reference.
bert_f1scores = {
    'Domain-mark': 0.99,
    'Head': 0.98,
    'NC-Person': 0.75,
    'NC-Spatial': 0.93,
    'NP-Misc': 0.76,
    'NP-Person': 0.88,
    'NP-Spatial': 0.95,
    'Relation': 0.91,
    'Latlong': 0.98,
}

# The reference values are F1 scores, so select that metric explicitly rather
# than relying on whatever `metric` was left behind by a loop in an earlier
# cell (undefined on a fresh kernel).
metric = 'f1-score'

eval_scores = {}

for model in models:
    path = join('predictions','token_classification_' + model)
    predictions = load_predictions(path)
    classification_reports = evaluate(predictions, test_set)
    eval_scores[model] = get_avg_scores(classification_reports, metric)

eval_scores['fine-tuned BERT'] = bert_f1scores

bar_plot(eval_scores, tagset, metric)