In [14]:
import os
import sys
from collections import defaultdict

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

sys.path.append("../src")
from mistral_client import run_mistral
from ner_post_processing import parse_entities_promptner, get_token_labels

In [15]:
# Load the "politics" configuration of the CrossNER dataset.
dataset = load_dataset("DFKI-SLT/cross_ner", "politics")

# The tag names are read from the "ner_tags" feature of the validation split;
# the two dictionaries map list position -> name and name -> list position.
class_labels = dataset["validation"].features["ner_tags"].feature.names
index2label = dict(enumerate(class_labels))
label2index = {name: position for position, name in enumerate(class_labels)}

label2index

{'O': 0,
 'B-academicjournal': 1,
 'I-academicjournal': 2,
 'B-album': 3,
 'I-album': 4,
 'B-algorithm': 5,
 'I-algorithm': 6,
 'B-astronomicalobject': 7,
 'I-astronomicalobject': 8,
 'B-award': 9,
 'I-award': 10,
 'B-band': 11,
 'I-band': 12,
 'B-book': 13,
 'I-book': 14,
 'B-chemicalcompound': 15,
 'I-chemicalcompound': 16,
 'B-chemicalelement': 17,
 'I-chemicalelement': 18,
 'B-conference': 19,
 'I-conference': 20,
 'B-country': 21,
 'I-country': 22,
 'B-discipline': 23,
 'I-discipline': 24,
 'B-election': 25,
 'I-election': 26,
 'B-enzyme': 27,
 'I-enzyme': 28,
 'B-event': 29,
 'I-event': 30,
 'B-field': 31,
 'I-field': 32,
 'B-literarygenre': 33,
 'I-literarygenre': 34,
 'B-location': 35,
 'I-location': 36,
 'B-magazine': 37,
 'I-magazine': 38,
 'B-metrics': 39,
 'I-metrics': 40,
 'B-misc': 41,
 'I-misc': 42,
 'B-musicalartist': 43,
 'I-musicalartist': 44,
 'B-musicalinstrument': 45,
 'I-musicalinstrument': 46,
 'B-musicgenre': 47,
 'I-musicgenre': 48,
 'B-organisation': 49,
 'I-o

In [16]:
# Draw up to 10,000 random lines from the unlabeled politics corpus with `shuf`
# and write them to the sample file used by the rest of the notebook.
# NOTE(review): no random source/seed is given, so re-running this cell
# overwrites the sample file with a different random subset.
!shuf -n 10000 ../data/unlabeled/politics_tasklevel.txt > ../data/unlabeled/politics_tasklevel_uniform10000.txt

In [17]:
# Input: the sampled unlabeled sentences (same path the `shuf` cell writes to).
fp = "../data/unlabeled/politics_tasklevel_uniform10000.txt"
# Output: CSV file the scoring loop saves its results to.
fp_scored = "../data/scored/politics_tasklevel_uniform10000.csv"

In [18]:
# Load the sampled sentences, one per line of `fp`.
#  - The encoding is pinned so the read does not depend on the platform's
#    locale default (assumes the file is UTF-8 -- TODO confirm).
#  - Surrounding whitespace is stripped and blank lines are dropped, so an
#    empty paragraph is never sent to the model.
with open(fp, "r", encoding="utf-8") as file:
    sentences = [stripped for stripped in (line.strip() for line in file) if stripped]

In [19]:
def prompt(text):
    """Build the few-shot entity-recognition prompt for one paragraph.

    The template consists of an entity definition, two worked examples with
    their answers, and the final question followed by the paragraph.

    NOTE: the template wording is kept verbatim (typos included) because any
    change to it changes what the model is asked.

    Args:
        text: Paragraph to annotate; inserted after "Paragraph: ".

    Returns:
        The complete prompt string.
    """
    return f"""
Dfn: An entity is a person (person), organisation (organisation), politician (politician), political party (politicalparty), event (event), election (election), 
country (country), location (location) or other political entity (misc). Dates, times, abstract concepts, adjectives, and verbs are not entities.

Example 1: Sitting as a Liberal Party of Canada Member of Parliament (MP) for Niagara Falls, she joined the Canadian Cabinet after the Liberals defeated the 
Progressive Conservative Party of Canada government of John Diefenbaker in the 1963 Canadian federal election.

Answer:
1. Liberal Party of Canada | True | as it is a political party (politicalparty)
2. Parliament | True | as it is an organisation (organisation)
3. Niagara Falls | True | as it is a location (location)
4. Canadian Cabinet | True | as it is a political entity (misc)
5. Liberals | True | as it is a political group by not the party name (misc)
6. Progressive Conservative Party of Canada | True | as it is a political party (politicalparty)
7. government | False | as it is not actually an entity in this sentence
8. John Diefenbaker | True | as it is a politician (politician)
9. 1963 Canadian federal election | True | as it is an election (election)

Example 2: The MRE took part to the consolidation of The Olive Tree as a joint electoral list both for the
2004 European Parliament election and the 2006 Italian general election, along with the Democrats of the Left
and Democracy is Freedom - The Daisy.

Answer:
1. MRE | True | as it is a political party (politicalparty)
2. consolidation | False | as it is an action
3. The Olive Tree | True | as it is a group or organisation (organisation)
4. 2004 European Parliament election | True | as it is an election (election)
5. 2006 Italian general election | True | as it is an election (election)
6. Democrats of the Left | True | as it is a political party (politicalparty)
7. Democracy is Freedom - The Daisy | True | as it is an political party (politicalparty)

Q. Given the paragraph below, identify a list of possible entities and for each entry explain why it either is or is not an entity.

Paragraph: {text}
"""

In [20]:
# Annotate every sentence with the model and collect the results column-wise
# ("tokens", "prompt", "output", "ner_tags"); sentences that fail are skipped.
scored = defaultdict(list)

# Number of sentences between intermediate saves of the partial results.
CHECKPOINT_EVERY = 100

# Create the output directory up front so the first checkpoint cannot fail
# only after many slow model calls have already been made.
os.makedirs(os.path.dirname(fp_scored) or ".", exist_ok=True)


def _save_scored():
    """Write the results collected so far to `fp_scored` and return them as a DataFrame."""
    df = pd.DataFrame(scored)
    df.to_csv(fp_scored, index=False)
    return df


# Built outside the `try` so a failure here does not trigger a save.
progress = tqdm(sentences)

try:
    for idx, text in enumerate(progress):
        if (idx + 1) % CHECKPOINT_EVERY == 0:
            _save_scored()

        try:
            prompt_input = prompt(text)
            output = run_mistral(prompt_input)
            ner_tags = get_token_labels(text, parse_entities_promptner(output), label2index)
        except Exception as e:
            # Best effort: one bad sentence must not stop the run. Report which
            # sentence failed; tqdm.write keeps the progress bar intact.
            tqdm.write(f"[sentence {idx}] {type(e).__name__}: {e}")
            continue

        # All four columns are appended together, only after every step
        # succeeded, so the columns always have equal length.
        # NOTE(review): tokens are a single-space split of the raw text --
        # confirm get_token_labels tokenizes the same way so tokens and
        # ner_tags stay aligned.
        scored["tokens"].append(text.split(" "))
        scored["prompt"].append(prompt_input)
        scored["output"].append(output)
        scored["ner_tags"].append(ner_tags)
finally:
    # Runs on normal completion and on interruption/crash alike, so results
    # gathered since the last checkpoint are not lost.
    df_scored = _save_scored()

  7%|▋         | 654/10000 [47:52<9:34:34,  3.69s/it] 

In [None]:
# Show the scored results as this cell's output.
df_scored

Unnamed: 0,tokens,prompt,output,ner_tags
0,"[Kurniawan, Karman, (, born, 29, March, 1991, ...","\nDfn: An entity is a person (person), organis...",1. Kurniawan Karman | True | as it is a person...,"[51, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,"[In, particular, the, book, is, concerned, wit...","\nDfn: An entity is a person (person), organis...",1. Lord Glasman | True | as it is a politician...,"[0, 0, 0, 0, 0, 0, 0, 57, 58, 0, 0, 0, 0, 0, 0..."
2,"[After, his, post, in, the, League, of, Nation...","\nDfn: An entity is a person (person), organis...",1. League of Nations | True | as it is an inte...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 57, 58, 58, 0, 0, ..."
3,"[He, has, been, returned, in, every, election,...","\nDfn: An entity is a person (person), organis...","1. He | True | as it is a person (person), alt...","[51, 0, 0, 0, 0, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0..."
4,"[PT, Merpati, Nusantara, Airlines, ,, operatin...","\nDfn: An entity is a person (person), organis...",1. PT Merpati Nusantara Airlines | True | as i...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
994,"[The, elections, will, coincide, with, the, 20...","\nDfn: An entity is a person (person), organis...",1. Elections | False | as it is a general term...,"[0, 0, 0, 0, 0, 0, 25, 21, 22, 26, 26, 0, 0, 0..."
995,"[Caro, was, released, in, the, second, round, ...","\nDfn: An entity is a person (person), organis...","1. Caro | True | as it is a person (person), p...","[51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 21, ..."
996,"[After, leaving, his, country, 's, service, ,,...","\nDfn: An entity is a person (person), organis...",1. His country | True | as it is a country (co...,"[0, 0, 21, 22, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, ..."
997,"[Incumbent, Commissioner, Steve, Troxler, (, R...","\nDfn: An entity is a person (person), organis...",1. Incumbent Commissioner Steve Troxler | True...,"[57, 58, 58, 58, 0, 55, 0, 0, 57, 58, 58, 0, 5..."
