In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import ast
import matplotlib.pyplot as plt
from openai import AsyncOpenAI
import dotenv
import json
import asyncio
dotenv.load_dotenv()

True

In [2]:
# Shared asynchronous OpenAI client; the request coroutine defined later in this
# notebook sends its chat-completion calls through it.
# NOTE(review): no arguments are passed, so credentials presumably come from the
# environment populated by dotenv.load_dotenv() above — confirm.
client = AsyncOpenAI()

In [3]:
def load_data(data_path):
    """Read a CSV file into a DataFrame and discard incomplete rows.

    Parameters
    ----------
    data_path : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows that contain no missing value, with their original index
        labels. The number of remaining rows is printed as a side effect.
    """
    # Remove null values if any
    df = pd.read_csv(data_path).dropna()
    print("Number of records: ", len(df))
    return df

In [4]:
# Load the evaluation split; load_data drops rows with missing values and
# prints how many records remain.
df = load_data("data/test.csv")
# Preview the first rows.
df.head()

Number of records:  9592


Unnamed: 0,Sentence,Tag
0,"In Tehran , the chief of Iran 's Revolutionary...","['O', 'B-tim', 'O', 'O', 'O', 'O', 'B-geo', 'O..."
1,"Even though both sites are now functioning , T...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-or..."
2,Suspected U.S. drones have carried out at leas...,"['O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', '..."
3,President Barack Obama has reaffirmed his beli...,"['B-per', 'I-per', 'I-per', 'O', 'O', 'O', 'O'..."
4,They all decided that one person should get of...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [5]:
# Convert all tags to a list.
# Only values that are still strings are parsed, so re-running this cell after
# the column already holds lists is a no-op instead of a ValueError raised by
# ast.literal_eval on a non-string value.
df['Tag'] = df['Tag'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df

Unnamed: 0,Sentence,Tag
0,"In Tehran , the chief of Iran 's Revolutionary...","[O, B-tim, O, O, O, O, B-geo, O, B-geo, I-geo,..."
1,"Even though both sites are now functioning , T...","[O, O, O, O, O, O, O, O, B-org, O, O, O, O, O,..."
2,Suspected U.S. drones have carried out at leas...,"[O, B-geo, O, O, O, O, O, O, O, O, O, O, O, O,..."
3,President Barack Obama has reaffirmed his beli...,"[B-per, I-per, I-per, O, O, O, O, O, O, O, O, ..."
4,They all decided that one person should get of...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
9587,Afghanistan 's Islamic Council also urged Pres...,"[B-org, I-org, I-org, I-org, O, O, B-per, I-pe..."
9588,Damascus denies U.S. intelligence allegations ...,"[B-org, O, B-geo, O, O, O, O, O, O, O, O, O, O..."
9589,Polish Prime Minister Jaroslaw Kaczynski has v...,"[B-gpe, B-per, O, B-per, I-per, O, O, O, O, O,..."
9590,The Commerce Department says the deficit in U....,"[O, B-org, I-org, O, O, O, O, B-geo, O, O, O, ..."


In [6]:
async def evaluate_sentence(sentence):
    """Ask the chat model for one named-entity tag per token of ``sentence``.

    The sentence is split on whitespace, the resulting token list is embedded
    in a zero-shot prompt, and the reply is parsed as a list of
    ``{"word": ..., "tag": ...}`` dictionaries.

    Parameters
    ----------
    sentence : str
        Whitespace-tokenised sentence.

    Returns
    -------
    list
        The ``tag`` value of every item in the model's reply, in order. If the
        reply is exactly one tag short of the token count, a trailing ``"O"``
        is appended. If the reply cannot be parsed, an empty list is returned,
        the raw reply and the error are printed, and both are appended to
        ``errors.txt``.

    Notes
    -----
    Exceptions raised by the API call itself are not caught here.
    """
    # Keep the token list itself: the expected tag count must not depend on
    # re-splitting the stringified list that is embedded in the prompt.
    tokens = sentence.split()
    sentence = str(tokens)

    prompt = """
You are a ner model that can predict the named entity tags for a given sentence.
You will be given a list of words and you will have to predict the named entity tags for each word.
Make sure to predict one from the list below for each word. Only use the tags below.

Here are the named entity tags:
B-geo (Beginning of a Geographical Entity): Marks the beginning of a geographical name, such as rivers, mountains, regions.
I-geo (Inside a Geographical Entity): Continues a geographical name already started by a B-GEO tag.
B-gpe (Beginning of a Geopolitical Entity): Marks the beginning of the name of a country, city, state, or other political region.
I-gpe (Inside a Geopolitical Entity): Continues a geopolitical name already started by a B-GPE tag.
B-org (Beginning of an Organization): Marks the beginning of the name of an organization, such as companies, government agencies, NGOs.
I-org (Inside an Organization): Continues an organization name already started by a B-ORG tag.
B-per (Beginning of a Person's Name): Marks the beginning of a person's name.
I-per (Inside a Person's Name): Continues a person's name already started by a B-PER tag.
B-tim (Beginning of a Time Expression): Marks the beginning of a time expression, such as dates, days, months, years, and time periods.
I-tim (Inside a Time Expression): Continues a time expression already started by a B-TIM tag.
B-art (Beginning of an Artifact Name): Marks the beginning of the name of man-made objects, like buildings, artworks, and vehicles.
I-art (Inside an Artifact Name): Continues an artifact name already started by a B-ART tag.
B-eve (Beginning of an Event Name): Marks the beginning of the name of events, like battles, wars, sports events, and natural disasters.
I-eve (Inside an Event Name): Continues an event name already started by a B-EVE tag.
B-nat (Beginning of a Natural Phenomenon): Marks the beginning of the name of natural phenomena, like hurricanes, earthquakes, and other natural events.
I-nat (Inside a Natural Phenomenon): Continues a natural phenomenon name already started by a B-NAT tag.
O (Outside of Named Entity): Indicates that the word does not belong to any of the named entity categories above.

Only return a json parsable list of dictionaries with two keys: "word" and "tag".
Surround all keys and values with double quotes.
Additionally, make sure to convert any non-UTF-8 characters to their correct UTF-8 representation.
Example:
[
  {"word": "", "tag": ""},
]
""" + f"Input: {sentence}\n"

    message = [
        {"role": "user", "content": prompt}
    ]

    response = await client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=message,
        temperature=0,
        max_tokens=1000,
    )

    raw_reply = response.choices[0].message.content
    # A missing reply is treated as empty text so that it is logged as a parse
    # failure below instead of raising TypeError on the fence check.
    result = raw_reply or ""

    # Strip a Markdown code fence around the payload, with or without "json".
    if "```json" in result:
        result = result.split("```json")[1]
        result = result.split("```")[0]
    elif result.lstrip().startswith("```"):
        result = result.lstrip()[3:].split("```")[0]

    try:
        # The prompt asks for JSON, so try a JSON parser first. Fall back to
        # Python-literal parsing, which also accepts single quotes and the
        # trailing comma shown in the prompt's own example.
        try:
            data = json.loads(result)
        except ValueError:
            data = ast.literal_eval(result)
        tags = [entry['tag'] for entry in data]

        # If the reply is exactly one tag short of the token count, pad with "O"
        if len(tags) == len(tokens) - 1:
            tags.append("O")
    except Exception as e:
        # Best effort: an unparsable reply yields no tags and is recorded so it
        # can be inspected or retried later.
        tags = []
        print(raw_reply)
        # Save result to file; UTF-8 so non-ASCII replies cannot fail to encode
        with open("errors.txt", "a", encoding="utf-8") as f:
            f.write(f"Sentence: {sentence}\n")
            f.write(f"Result: {raw_reply}\n")
            f.write(f"Error: {e}\n\n")
        print(e)

    return tags

async def process_row(row, index, semaphore):
    """Tag one row's sentence while holding a slot of ``semaphore``.

    The predicted tags are stored on ``row`` under 'Predicted_Tags', a progress
    line is printed, and the same ``row`` object is returned.
    """
    async with semaphore:
        row['Predicted_Tags'] = await evaluate_sentence(row['Sentence'])
        print(f"Finished processing row {index}")
    return row
    
async def process_dataframe(df, max_concurrent_tasks=20):
    """Run process_row over every row of ``df`` with bounded concurrency.

    At most ``max_concurrent_tasks`` rows hold the semaphore at any time.
    Returns the list of processed rows in the same order as ``df``.
    """
    limiter = asyncio.Semaphore(max_concurrent_tasks)
    pending = []
    for label, record in df.iterrows():
        pending.append(process_row(record, label, limiter))
    return await asyncio.gather(*pending)

In [7]:
processed_df = await process_dataframe(df)
processed_df = pd.DataFrame(processed_df)

Finished processing row 17
Finished processing row 11
Finished processing row 9
Finished processing row 15
Finished processing row 10
Finished processing row 7
Finished processing row 2
Finished processing row 12
Finished processing row 1
Finished processing row 4
Finished processing row 5
Finished processing row 13
Finished processing row 21
Finished processing row 16
Finished processing row 18
Finished processing row 0
Finished processing row 14
Finished processing row 8
Finished processing row 3
Finished processing row 6
Finished processing row 20
Finished processing row 22
Finished processing row 28
Finished processing row 31
Finished processing row 27
Finished processing row 26
Finished processing row 19
Finished processing row 30
Finished processing row 33
Finished processing row 40
Finished processing row 25
Finished processing row 39
Finished processing row 35
Finished processing row 23
Finished processing row 24
Finished processing row 34
Finished processing row 29
Finished pr

In [8]:
# Display the frame to compare the reference 'Tag' column with 'Predicted_Tags'.
processed_df

Unnamed: 0,Sentence,Tag,Predicted_Tags
0,"In Tehran , the chief of Iran 's Revolutionary...","[O, B-tim, O, O, O, O, B-geo, O, B-geo, I-geo,...","[O, B-geo, O, O, O, O, B-org, O, B-org, I-org,..."
1,"Even though both sites are now functioning , T...","[O, O, O, O, O, O, O, O, B-org, O, O, O, O, O,...","[O, O, O, O, O, O, O, O, B-org, O, O, O, O, O,..."
2,Suspected U.S. drones have carried out at leas...,"[O, B-geo, O, O, O, O, O, O, O, O, O, O, O, O,...","[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,..."
3,President Barack Obama has reaffirmed his beli...,"[B-per, I-per, I-per, O, O, O, O, O, O, O, O, ...","[B-per, I-per, I-per, O, O, O, O, O, O, O, O, ..."
4,They all decided that one person should get of...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...
9587,Afghanistan 's Islamic Council also urged Pres...,"[B-org, I-org, I-org, I-org, O, O, B-per, I-pe...","[B-geo, O, O, O, O, O, O, B-per, O, O, O, O, O..."
9588,Damascus denies U.S. intelligence allegations ...,"[B-org, O, B-geo, O, O, O, O, O, O, O, O, O, O...","[B-geo, O, B-gpe, O, O, O, O, O, O, O, O, O, O..."
9589,Polish Prime Minister Jaroslaw Kaczynski has v...,"[B-gpe, B-per, O, B-per, I-per, O, O, O, O, O,...","[B-gpe, O, O, B-per, I-per, O, O, O, O, O, O, ..."
9590,The Commerce Department says the deficit in U....,"[O, B-org, I-org, O, O, O, O, B-geo, O, O, O, ...","[O, B-org, I-org, O, O, O, O, B-gpe, O, O, O, ..."


In [9]:
# Count the rows whose predicted tag list differs in length from the reference tag list
num = int((processed_df['Tag'].apply(len) != processed_df['Predicted_Tags'].apply(len)).sum())


In [10]:
async def reprocess_mismatched_rows(df):
    """Re-run tagging for rows whose 'Predicted_Tags' length differs from 'Tag'.

    The new predictions are written back into ``df`` by row label (labels are
    assumed to be unique) and the same frame is returned.
    """
    # Rows whose two tag lists disagree in length
    length_differs = df['Tag'].apply(len) != df['Predicted_Tags'].apply(len)

    # Send only those rows through the model again
    retried_rows = await process_dataframe(df[length_differs])

    # Copy each fresh prediction back onto the row it came from
    for retried in retried_rows:
        df.at[retried.name, 'Predicted_Tags'] = retried['Predicted_Tags']

    return df
times = 0
while num > 0 and times < 5:
    processed_df = await reprocess_mismatched_rows(processed_df)
    num = len(processed_df[processed_df['Tag'].apply(lambda x: len(x)) != processed_df['Predicted_Tags'].apply(lambda x: len(x))])
    times += 1
    print(f"Number of mismatched rows: {num}")

Finished processing row 215
Number of mismatched rows: 1
Finished processing row 215
Number of mismatched rows: 1
Finished processing row 215
Number of mismatched rows: 1
Finished processing row 215
Number of mismatched rows: 1
Finished processing row 215
Number of mismatched rows: 1


In [11]:
# Report every row whose predicted tag count still differs from the reference count
for index, row in processed_df.iterrows():
    if len(row['Tag']) == len(row['Predicted_Tags']):
        continue  # lengths agree, nothing to report

    print(f"Index: {index}")
    print(f"Tag: {row['Tag']}")
    print(f"Predicted Tags: {row['Predicted_Tags']}")
    print(f"Length of Tag: {len(row['Tag'])}")
    print(f"Length of Predicted Tags: {len(row['Predicted_Tags'])}")
    print(f"Sentence: {row['Sentence']}")
    print()

Index: 215
Tag: ['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'I-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'B-geo', 'O', 'B-geo', 'O', 'B-geo', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'B-geo', 'O']
Predicted Tags: ['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'I-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-art', 'O', 'B-geo', 'O', 'B-art', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'B-geo', 'O']
Length of Tag: 48
Length of Predicted Tags: 49
Sentence: It says China might take action to secure Chinese interests in a gas field in the East China Sea , to seize a disputed set of islands known as Daioyu in China and Senkaku in Japanto or to keep U.S. forces based in Japan from protecting Taiwan .



In [13]:
# Force every predicted tag list to the length of its reference tag list:
# - too short: pad the end with "O"
# - too long: drop the surplus tags from the end
#
# The result is written back with .at instead of through the row objects that
# iterrows() yields: those can be copies, so assigning to them (as the
# truncation branch needs to) is not guaranteed to reach the DataFrame.
for index in processed_df.index:
    tag_len = len(processed_df.at[index, 'Tag'])
    predicted = processed_df.at[index, 'Predicted_Tags']
    predicted_tag_len = len(predicted)

    if tag_len > predicted_tag_len:
        # Append "O" until lengths are equal
        processed_df.at[index, 'Predicted_Tags'] = predicted + ["O"] * (tag_len - predicted_tag_len)
    elif tag_len < predicted_tag_len:
        # Remove extra elements from the end until lengths are equal
        processed_df.at[index, 'Predicted_Tags'] = predicted[:tag_len]

# Print number of rows where length of Tag is not equal to length of Predicted_Tags
num = len(processed_df[processed_df['Tag'].apply(lambda x: len(x)) != processed_df['Predicted_Tags'].apply(lambda x: len(x))])
num

0

In [14]:
# Token-level accuracy: share of reference tags matched (ignoring case) by the
# prediction at the same position.
correct_predictions = 0
total_tags = 0

for actual_seq, predicted_seq in zip(processed_df['Tag'], processed_df['Predicted_Tags']):
    total_tags += len(actual_seq)
    for predicted, actual in zip(predicted_seq, actual_seq):
        if predicted.lower() == actual.lower():
            correct_predictions += 1

if total_tags > 0:
    accuracy = correct_predictions / total_tags
else:
    accuracy = 0

print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9245


In [15]:
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Flatten the 'Tag' and 'Predicted_Tags' columns into lists
actual_tags = [tag for row in processed_df['Tag'] for tag in row]
predicted_tags = [tag for row in processed_df['Predicted_Tags'] for tag in row]

# The distinct reference tags are the labels that get scored
unique_tags = np.unique(actual_tags)

# The accuracy computation compares tags case-insensitively; do the same here
# by rewriting each prediction that matches a reference tag up to case
# (e.g. "B-GEO") to the reference spelling. Otherwise the two sets of numbers
# would be scoring different things.
canonical_tag = {str(tag).lower(): str(tag) for tag in unique_tags}
predicted_tags = [canonical_tag.get(tag.lower(), tag) for tag in predicted_tags]

# Calculate precision, recall, and F1-score for each tag
precision, recall, f1, _ = precision_recall_fscore_support(actual_tags, predicted_tags, average=None, labels=unique_tags)

# Map these metrics to each unique tag (arrays are ordered like `labels`)
tag_metrics = dict()
for i, tag in enumerate(unique_tags):
    tag_metrics[tag] = {
        'precision': precision[i],
        'recall': recall[i],
        'f1_score': f1[i]
    }

# Display the metrics for each tag
for tag, metrics in tag_metrics.items():
    print(f"Tag: {tag}\nPrecision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1 Score: {metrics['f1_score']:.4f}")


Tag: B-art
Precision: 0.0629, Recall: 0.1163, F1 Score: 0.0816
Tag: B-eve
Precision: 0.0864, Recall: 0.1167, F1 Score: 0.0993
Tag: B-geo
Precision: 0.7359, Recall: 0.7828, F1 Score: 0.7586
Tag: B-gpe
Precision: 0.6985, Recall: 0.8441, F1 Score: 0.7644
Tag: B-nat
Precision: 0.0318, Recall: 0.1000, F1 Score: 0.0483
Tag: B-org
Precision: 0.5857, Recall: 0.5553, F1 Score: 0.5701
Tag: B-per
Precision: 0.6711, Recall: 0.6707, F1 Score: 0.6709
Tag: B-tim
Precision: 0.6101, Recall: 0.7819, F1 Score: 0.6854
Tag: I-art
Precision: 0.0347, Recall: 0.0862, F1 Score: 0.0495
Tag: I-eve
Precision: 0.1143, Recall: 0.1509, F1 Score: 0.1301
Tag: I-geo
Precision: 0.4789, Recall: 0.6903, F1 Score: 0.5655
Tag: I-gpe
Precision: 0.0540, Recall: 0.5385, F1 Score: 0.0981
Tag: I-nat
Precision: 0.0390, Recall: 0.2500, F1 Score: 0.0674
Tag: I-org
Precision: 0.6336, Recall: 0.5249, F1 Score: 0.5742
Tag: I-per
Precision: 0.8192, Recall: 0.7483, F1 Score: 0.7822
Tag: I-tim
Precision: 0.3243, Recall: 0.4762, F1 Score:

In [12]:
# Write the predictions to CSV without the row index.
# NOTE(review): this cell's execution count (12) is lower than that of the
# length-alignment and metric cells above it, so the saved file may predate
# those steps — re-run top to bottom and confirm.
processed_df.to_csv("data/test_predictions_gpt35_zero.csv", index=False)