In [24]:
import random
import sys
from collections import defaultdict

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

sys.path.append("../src")
from mistral_client import run_mistral
from ner_post_processing import parse_entities_promptner, get_token_labels

In [25]:
# Fetch the "politics" configuration of the CrossNER dataset.
dataset = load_dataset("DFKI-SLT/cross_ner", "politics")

# The tag names are read from the validation split's `ner_tags` feature; a tag's
# position in that list is its integer id.
class_labels = dataset["validation"].features["ner_tags"].feature.names

# Two lookup tables built from the same list: id -> tag name and tag name -> id.
index2label = dict(enumerate(class_labels))
label2index = {name: idx for idx, name in index2label.items()}

# Display the tag-name -> id mapping as the cell output.
label2index

{'O': 0,
 'B-academicjournal': 1,
 'I-academicjournal': 2,
 'B-album': 3,
 'I-album': 4,
 'B-algorithm': 5,
 'I-algorithm': 6,
 'B-astronomicalobject': 7,
 'I-astronomicalobject': 8,
 'B-award': 9,
 'I-award': 10,
 'B-band': 11,
 'I-band': 12,
 'B-book': 13,
 'I-book': 14,
 'B-chemicalcompound': 15,
 'I-chemicalcompound': 16,
 'B-chemicalelement': 17,
 'I-chemicalelement': 18,
 'B-conference': 19,
 'I-conference': 20,
 'B-country': 21,
 'I-country': 22,
 'B-discipline': 23,
 'I-discipline': 24,
 'B-election': 25,
 'I-election': 26,
 'B-enzyme': 27,
 'I-enzyme': 28,
 'B-event': 29,
 'I-event': 30,
 'B-field': 31,
 'I-field': 32,
 'B-literarygenre': 33,
 'I-literarygenre': 34,
 'B-location': 35,
 'I-location': 36,
 'B-magazine': 37,
 'I-magazine': 38,
 'B-metrics': 39,
 'I-metrics': 40,
 'B-misc': 41,
 'I-misc': 42,
 'B-musicalartist': 43,
 'I-musicalartist': 44,
 'B-musicalinstrument': 45,
 'I-musicalinstrument': 46,
 'B-musicgenre': 47,
 'I-musicgenre': 48,
 'B-organisation': 49,
 'I-o

In [26]:
# Draw a uniform random sample of up to SAMPLE_SIZE lines from the full unlabeled
# politics corpus and write it next to the source file.
#
# This replaces `!shuf -n 1000 ...`: shuf is unseeded, so every run of the notebook
# drew a different sample. With a fixed seed, re-running this cell on the same source
# file (and the same Python version) reproduces the same sample, and the cell no
# longer depends on a coreutils binary being installed.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
sample_source = "../data/unlabeled/politics_tasklevel.txt"
sample_target = "../data/unlabeled/politics_tasklevel_uniform1000.txt"

# Work on raw bytes so lines are copied verbatim, whatever the file's encoding is.
with open(sample_source, "rb") as src:
    corpus_lines = src.read().split(b"\n")
if corpus_lines[-1] == b"":
    corpus_lines.pop()  # the piece after the final newline is not a line

# Like `shuf -n`, emit every line when the corpus is smaller than the requested size.
sampled_lines = random.Random(SAMPLE_SEED).sample(
    corpus_lines, min(SAMPLE_SIZE, len(corpus_lines))
)

with open(sample_target, "wb") as dst:
    dst.writelines(line + b"\n" for line in sampled_lines)

In [27]:
# Input: the sampled, unlabeled politics sentences, read one sentence per line.
fp = "../data/unlabeled/politics_tasklevel_uniform1000.txt"
# Output: CSV that receives the sentence, prompt, raw model output and NER tags.
fp_scored = "../data/scored/politics_tasklevel_uniform1000.csv"

In [28]:
# Read the sampled file into memory: one list entry per line, with surrounding
# whitespace (including the newline) stripped. Blank lines are kept as "".
with open(fp, "r") as handle:
    sentences = [raw_line.strip() for raw_line in handle]

In [29]:
def prompt(text):
    """Build the few-shot entity-extraction prompt for one paragraph.

    The prompt consists of an entity definition, two worked examples whose
    answers use the `<span> | True/False | <reason (label)>` line format, and
    finally the question with `text` inserted as the paragraph to annotate.

    Args:
        text: The paragraph to annotate; it is interpolated verbatim.

    Returns:
        The complete prompt string.
    """
    # The prompt wording (including its quirks) is part of the experiment and
    # is intentionally left untouched.
    return f"""
Dfn: An entity is a person (person), organisation (organisation), politician (politician), political party (politicalparty), event (event), election (election), 
country (country), location (location) or other political entity (misc). Dates, times, abstract concepts, adjectives, and verbs are not entities.

Example 1: Sitting as a Liberal Party of Canada Member of Parliament (MP) for Niagara Falls, she joined the Canadian Cabinet after the Liberals defeated the 
Progressive Conservative Party of Canada government of John Diefenbaker in the 1963 Canadian federal election.

Answer:
1. Liberal Party of Canada | True | as it is a political party (politicalparty)
2. Parliament | True | as it is an organisation (organisation)
3. Niagara Falls | True | as it is a location (location)
4. Canadian Cabinet | True | as it is a political entity (misc)
5. Liberals | True | as it is a political group by not the party name (misc)
6. Progressive Conservative Party of Canada | True | as it is a political party (politicalparty)
7. government | False | as it is not actually an entity in this sentence
8. John Diefenbaker | True | as it is a politician (politician)
9. 1963 Canadian federal election | True | as it is an election (election)

Example 2: The MRE took part to the consolidation of The Olive Tree as a joint electoral list both for the
2004 European Parliament election and the 2006 Italian general election, along with the Democrats of the Left
and Democracy is Freedom - The Daisy.

Answer:
1. MRE | True | as it is a political party (politicalparty)
2. consolidation | False | as it is an action
3. The Olive Tree | True | as it is a group or organisation (organisation)
4. 2004 European Parliament election | True | as it is an election (election)
5. 2006 Italian general election | True | as it is an election (election)
6. Democrats of the Left | True | as it is a political party (politicalparty)
7. Democracy is Freedom - The Daisy | True | as it is an political party (politicalparty)

Q. Given the paragraph below, identify a list of possible entities and for each entry explain why it either is or is not an entity.

Paragraph: {text}
"""

In [30]:
# Run every sentence through the LLM, convert its answer into token-level NER tags
# and persist the results to `fp_scored`.
#
# `scored` maps a column name to its values; it only ever contains sentences that
# were scored successfully, so all four columns stay the same length.
# NOTE(review): the "tokens" column stores the raw sentence string, not a token
# list (the original computed `text.split()` but never used it) - confirm which
# one downstream consumers expect.
scored = defaultdict(list)

# One record per dropped sentence, so failures can be inspected or re-processed
# afterwards instead of only scrolling past in the cell output.
failures = []

for idx, text in enumerate(tqdm(sentences)):
    output = None  # stays None when the LLM call itself is what failed
    try:
        prompt_input = prompt(text)
        output = run_mistral(prompt_input)
        ner_tags = get_token_labels(text, parse_entities_promptner(output), label2index)
    except Exception as e:
        # Deliberate best effort: one bad sentence must not abort a multi-hour run.
        # Record which sentence failed and why; the previously recorded run only
        # printed bare messages such as 'B-people' (presumably labels invented by
        # the model that are missing from label2index) with no way to trace them.
        error = f"{type(e).__name__}: {e}"
        failures.append({"idx": idx, "text": text, "output": output, "error": error})
        # tqdm.write keeps the message from being interleaved with the progress bar.
        tqdm.write(f"[sentence {idx}] skipped - {error}")
    else:
        scored["tokens"].append(text)
        scored["prompt"].append(prompt_input)
        scored["output"].append(output)
        scored["ner_tags"].append(ner_tags)

    # Checkpoint after every 100th sentence has been handled, so an interrupted
    # run loses at most the last 100 results.
    if (idx + 1) % 100 == 0:
        df_scored = pd.DataFrame(scored)
        df_scored.to_csv(fp_scored, index=False)

# Final write with everything that was scored.
df_scored = pd.DataFrame(scored)
df_scored.to_csv(fp_scored, index=False)

print(f"scored {len(df_scored)} of {len(sentences)} sentences, skipped {len(failures)}")

  0%|          | 4/1000 [00:22<1:44:47,  6.31s/it]

'B-politician, politicalparty'


  6%|▌         | 59/1000 [04:49<1:14:37,  4.76s/it]

'B-politicalparty/organisation'


  9%|▉         | 94/1000 [07:28<59:21,  3.93s/it]  

'B-country/misc'


 10%|▉         | 97/1000 [07:47<1:18:30,  5.22s/it]

'B-politicalparty or misc'


 18%|█▊        | 175/1000 [36:00<11:22:46, 49.66s/it] 

'B-people'


 20%|██        | 200/1000 [37:47<53:34,  4.02s/it]   

'B-persons'


 20%|██        | 201/1000 [37:50<50:58,  3.83s/it]

'B-people'


 22%|██▏       | 222/1000 [39:24<1:01:44,  4.76s/it]

'B-people'


 25%|██▍       | 246/1000 [41:18<1:03:14,  5.03s/it]

'B-politicalparty or misc'


 25%|██▌       | 254/1000 [41:52<51:30,  4.14s/it]  

'B-people'


 26%|██▋       | 265/1000 [42:44<51:48,  4.23s/it]  

'B-people'


 27%|██▋       | 269/1000 [43:03<1:03:14,  5.19s/it]

"B-could be a political party or organisation, but without more context, it's misc"


 28%|██▊       | 279/1000 [43:54<39:52,  3.32s/it]  

'B-person or organisation'


 31%|███       | 308/1000 [51:36<6:14:25, 32.46s/it] 

'B-politicalparty/organisation'


 32%|███▏      | 317/1000 [52:21<1:12:43,  6.39s/it]

'B-politician or individual'


 34%|███▍      | 338/1000 [1:00:02<9:20:15, 50.78s/it]

Unexpected exception (ReadTimeout): The read operation timed out


 36%|███▌      | 359/1000 [1:14:14<5:53:53, 33.13s/it]  

'B-politicalparty or organisation'


 39%|███▉      | 388/1000 [1:16:28<38:33,  3.78s/it]  

'B-people'


 39%|███▉      | 389/1000 [1:16:32<40:07,  3.94s/it]

'B-politicalparty or organisation'


 40%|███▉      | 395/1000 [1:16:56<43:15,  4.29s/it]

'B-politician or other individual, in this case a historian'


 41%|████      | 409/1000 [1:18:01<37:30,  3.81s/it]

'B-politicalparty or organisation'


 43%|████▎     | 433/1000 [1:19:35<31:35,  3.34s/it]

'B-people'


 46%|████▌     | 459/1000 [1:22:14<35:44,  3.96s/it]  

'B-country/misc'


 47%|████▋     | 467/1000 [1:22:57<46:01,  5.18s/it]

'B-organisation or misc'


 49%|████▉     | 491/1000 [1:24:56<39:39,  4.68s/it]

'B-politicalparty/organisation'


 50%|████▉     | 495/1000 [1:25:14<37:46,  4.49s/it]

'B-politician/military official'


 57%|█████▋    | 570/1000 [1:31:16<33:22,  4.66s/it]

'B-politician, assuming it refers to a person in this context'


 60%|█████▉    | 597/1000 [1:33:20<27:37,  4.11s/it]

'B-country/misc'


 61%|██████▏   | 614/1000 [1:34:52<38:42,  6.02s/it]

'B-China'


 63%|██████▎   | 632/1000 [1:36:23<33:38,  5.49s/it]

'B-people'


 64%|██████▍   | 642/1000 [1:37:11<30:40,  5.14s/it]

'B-politicalparty/misc'


 70%|██████▉   | 696/1000 [1:41:07<25:44,  5.08s/it]

'B-people'


 73%|███████▎  | 727/1000 [1:43:45<27:27,  6.03s/it]

'B-people'


 79%|███████▉  | 790/1000 [1:48:35<15:19,  4.38s/it]

'B-politician or other individual, not specified'


 79%|███████▉  | 793/1000 [1:48:50<17:01,  4.93s/it]

'B-could be the Croatian Democratic Union, a political party in Croatia'


 81%|████████▏ | 814/1000 [1:50:27<10:28,  3.38s/it]

'B-politicalparty or organisation'


 83%|████████▎ | 832/1000 [1:51:33<10:51,  3.88s/it]

'B-persons'


 84%|████████▍ | 841/1000 [1:52:12<11:09,  4.21s/it]

'B-politicalparty or organisation'


 88%|████████▊ | 885/1000 [1:55:14<08:54,  4.65s/it]

'B-although military personnel, they are considered politicians in this context'


 93%|█████████▎| 929/1000 [1:58:21<04:41,  3.97s/it]

'B-location/organisation'


 95%|█████████▍| 948/1000 [1:59:43<04:01,  4.64s/it]

'B-politician/celebrity'


 98%|█████████▊| 980/1000 [2:01:56<01:18,  3.92s/it]

'B-animation studio'


 99%|█████████▊| 987/1000 [2:02:27<00:52,  4.07s/it]

'B-politicalparty/organisation'


100%|█████████▉| 998/1000 [2:03:40<00:16,  8.31s/it]

'B-likely a politician'


100%|██████████| 1000/1000 [2:03:48<00:00,  7.43s/it]
