In [1]:
import sys
sys.path.append("../src")
from mistral_client import run_mistral
from ner_post_processing import parse_entities_promptner, get_token_labels

import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "politics" configuration of the DFKI-SLT/cross_ner dataset.
dataset = load_dataset("DFKI-SLT/cross_ner", "politics")

# Tag names declared on the validation split, followed by four extra BIO tags
# for French politicians and French political parties. The extra tags are
# appended at the end, so the ids of the tags taken from the dataset are unchanged.
class_labels = [
    *dataset["validation"].features["ner_tags"].feature.names,
    "B-frenchpolitician",
    "I-frenchpolitician",
    "B-frenchpoliticalparty",
    "I-frenchpoliticalparty",
]

# Two-way lookup between integer ids and tag names.
index2label = dict(enumerate(class_labels))
label2index = {label: index for index, label in index2label.items()}

label2index

{'O': 0,
 'B-academicjournal': 1,
 'I-academicjournal': 2,
 'B-album': 3,
 'I-album': 4,
 'B-algorithm': 5,
 'I-algorithm': 6,
 'B-astronomicalobject': 7,
 'I-astronomicalobject': 8,
 'B-award': 9,
 'I-award': 10,
 'B-band': 11,
 'I-band': 12,
 'B-book': 13,
 'I-book': 14,
 'B-chemicalcompound': 15,
 'I-chemicalcompound': 16,
 'B-chemicalelement': 17,
 'I-chemicalelement': 18,
 'B-conference': 19,
 'I-conference': 20,
 'B-country': 21,
 'I-country': 22,
 'B-discipline': 23,
 'I-discipline': 24,
 'B-election': 25,
 'I-election': 26,
 'B-enzyme': 27,
 'I-enzyme': 28,
 'B-event': 29,
 'I-event': 30,
 'B-field': 31,
 'I-field': 32,
 'B-literarygenre': 33,
 'I-literarygenre': 34,
 'B-location': 35,
 'I-location': 36,
 'B-magazine': 37,
 'I-magazine': 38,
 'B-metrics': 39,
 'I-metrics': 40,
 'B-misc': 41,
 'I-misc': 42,
 'B-musicalartist': 43,
 'I-musicalartist': 44,
 'B-musicalinstrument': 45,
 'I-musicalinstrument': 46,
 'B-musicgenre': 47,
 'I-musicgenre': 48,
 'B-organisation': 49,
 'I-o

In [3]:
# Input: text file that is read line by line into `sentences`.
fp = "../data/unlabeled/politics_tasklevel.txt"
# Output: CSV file that the scoring loop writes `df_scored` to.
fp_scored = "../data/scored/politics_tasklevel_french_kw_v2.csv"

In [4]:
# Read the unlabeled corpus into memory, one whitespace-stripped entry per line.
# The encoding is pinned to UTF-8 so that accented characters (the keyword
# filter looks for names such as "mélenchon") decode the same way on every
# platform instead of depending on the locale's default encoding.
with open(fp, "r", encoding="utf-8") as file:
    # Iterating the file yields one line at a time until EOF; blank lines are
    # kept as empty strings.
    sentences = [line.strip() for line in file]

In [5]:
# Number of lines read from `fp` (blank lines are included as empty strings).
len(sentences)

501885

In [39]:
# Hand-picked names of French politicians. Everything is lower-case because
# matching is done against the lower-cased sentence.
keywords_seed = [
    "emmanuel macron",
    "marine le pen",
    "édouard philippe",
    "charles de gaulle",
    "jean-luc mélenchon",
    "gilbert collard",
    "marion maréchal",
    "lionel jospin",
    "françois hollande",
    "augustin robespierre",  # was "augustin robespierrev": stray trailing "v"
    "manuel valls",
    "sylvie bermann",
    "thibaut monnier",
]

# Additional names and surface forms (surname only, with title, ...).
keywords_extra = [
    "french president françois hollande",
    "mélenchon",
    "jean-marc ayrault",
    "jean-louis borloo",
    "president emmanuel macron",
    "charles de gaulle",
    "nicolas sarkozy",
    "hamon",
    "jospin",
    "jacques chirac",
    "ségolène royal",
    "chirac",
    "president françois hollande",
    "bayrou",
    "jean-yves le drian",
    "jean-luc mélenchon",
    "françois hollande",
    "françois fillon",
    "alain juppé",
    "laurent fabius",
    "emmanuel macron",
    "françois bayrou",
    "françois mitterrand",
    "sarkozy",
    "dominique de villepin",
    "jean-marie le pen",
    "le pen",
    "manuel valls",
    "dominique strauss-kahn",
    "douste-blazy",
    "french president emmanuel macron",
    "marine le pen",
    "hollande",
    "benoît hamon",
    "mitterrand",
    "president jacques chirac",
    "macron",
    "édouard philippe",
]

# Merge both lists; dict.fromkeys drops the repeated entries while keeping
# first-seen order, so no keyword is probed twice per sentence.
keywords = list(dict.fromkeys(keywords_seed + keywords_extra))

# Sentences containing any of these terms are discarded even if a keyword matches.
keywords_blocklist = ["france", "french"]

# NOTE(review): matching is a raw substring test, so short keywords can hit
# inside unrelated words (e.g. "le pen" inside "multiple pens", "macron" inside
# "macronutrient"). Confirm whether word-boundary matching is wanted.
sentences_french = []
for sentence in tqdm(sentences):
    # Lower-case once per sentence instead of once per keyword probe.
    sentence_lower = sentence.lower()
    if not any(keyword in sentence_lower for keyword in keywords):
        continue
    if any(blocked in sentence_lower for blocked in keywords_blocklist):
        continue
    sentences_french.append(sentence)

100%|██████████| 501885/501885 [00:12<00:00, 38677.97it/s]


In [40]:
# Number of sentences that contain at least one keyword and no blocklisted term.
len(sentences_french)

949

In [41]:
# Few-shot prompt: entity definition, three worked examples, then the question.
# The label names in the "Dfn" line use the same spelling as the examples and
# as the tags appended to the label list ("frenchpolitician",
# "frenchpoliticalparty"), so the definition and the demonstrations agree.
# The only placeholder is {text}; the template contains no other braces.
PROMPT_FRENCH_TEMPLATE = """
Dfn: An entity is a person (person), organisation (organisation), french politician (frenchpolitician), french political party (frenchpoliticalparty), event (event), election (election), 
country (country), location (location) or other political entity (misc). Dates, times, abstract concepts, adjectives, and verbs are not entities.

Example 1: In the 2014 European Parliament election in France , the National Front won the elections with 24.85 % of the vote , a swing of 18.55 % , winning 24 seats , up from 3 previously .

Answer:
1. 2014 European Parliament election | True | as it is an election (election)
2. France | True | as it is a country (country)
3. National Front | True | as it is a political party (frenchpoliticalparty)

Example 2: The FN received 33.9 % of the votes in the 2017 French presidential election , making it the largest Eurosceptic party in France .

Answer:
1. FN | True | as it is a political party (frenchpoliticalparty)
2. 2017 French presidential election | True | as it is an election (election)
3. Eurosceptic party | True | as it is a political party (frenchpoliticalparty)
4. France | True | as it is a country (country)

Example 3: The 2017 French presidential election caused a radical shift in French politics , as the prevailing parties of The Republicans and Socialists failed to make it to the second round of voting , with far-right Marine Le Pen and political newcomer Emmanuel Macron instead facing each other .

Answer:
1. 2017 French presidential election | True | as it is an election (election)
2. French politics | False | as it is an abstract concept, not a specific entity
3. The Republicans | True | as it is a political party (frenchpoliticalparty)
4. Socialists | True | as it is a political party (frenchpoliticalparty)
5. far-right | False | as it is an adjective describing a political orientation, not a specific entity
6. Marine Le Pen | True | as she is a French politician (frenchpolitician)
7. Emmanuel Macron | True | as he is a French politician (frenchpolitician)
8. political newcomer | False | as it is an abstract concept, not a specific entity

Q. Given the paragraph below, identify a list of possible entities and for each entry explain why it either is or is not an entity.

Paragraph: {text}
"""


def prompt_french(text):
    """Build the few-shot entity-extraction prompt for one paragraph.

    Args:
        text: The paragraph to annotate. It is inserted verbatim after
            "Paragraph: " at the end of the prompt.

    Returns:
        The full prompt string: entity definition, three worked examples,
        the question, and the paragraph.
    """
    return PROMPT_FRENCH_TEMPLATE.format(text=text)

In [43]:
# Run the prompt over every selected sentence and collect, per sentence, the
# tokens, the prompt sent, the raw model output and the derived NER tags.
scored = defaultdict(list)

for idx, text in enumerate(tqdm(sentences_french)):
    # Checkpoint every 100 sentences so an interrupted run keeps its partial results.
    if (idx + 1) % 100 == 0:
        df_scored = pd.DataFrame(scored)
        df_scored.to_csv(fp_scored, index=False)

    try:
        prompt_input = prompt_french(text)
        output = run_mistral(prompt_input)
        ner_tags = get_token_labels(text, parse_entities_promptner(output), label2index)
    except Exception as e:
        # Best effort: a failing sentence is skipped, but the log says which
        # sentence failed and with what kind of error.
        print(f"[sentence {idx}] {type(e).__name__}: {e}")
        continue

    # Append only after every step succeeded, so the four columns stay aligned.
    # NOTE(review): tokens are produced with split(" "); presumably this matches
    # the tokenisation used inside get_token_labels - confirm.
    scored["tokens"].append(text.split(" "))
    scored["prompt"].append(prompt_input)
    scored["output"].append(output)
    scored["ner_tags"].append(ner_tags)

# Final save with everything that was scored.
df_scored = pd.DataFrame(scored)
df_scored.to_csv(fp_scored, index=False)

 32%|███▏      | 302/949 [24:47<48:59,  4.54s/it]  

In [None]:
# Show the scored frame: one row per successfully processed sentence, with its
# tokens, prompt, raw model output and ner_tags.
df_scored

Unnamed: 0,tokens,prompt,output,ner_tags
0,"[On, 22, July, 2014, ,, King, Philippe, nomina...","\nDfn: An entity is a person (person), organis...","1. 22 July 2014 | False | as it is a date, not...","[0, 0, 0, 0, 0, 51, 52, 0, 79, 80, 0, 81, 0, 0..."
1,"[More, recently, ,, the, memory, of, Charles, ...","\nDfn: An entity is a person (person), organis...",1. Charles Martel | True | as he is a French p...,"[0, 0, 0, 0, 0, 0, 79, 80, 0, 0, 0, 0, 0, 0, 0..."
2,"[In, early, 2013, ,, documents, provided, by, ...","\nDfn: An entity is a person (person), organis...",1. early 2013 | False | as it is a time period...,"[0, 0, 0, 0, 0, 0, 0, 51, 52, 0, 0, 0, 0, 0, 0..."
3,"[Before, this, ,, three, territories, of, memb...","\nDfn: An entity is a person (person), organis...",1. Three territories | False | as it is a vagu...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 0, 0, 0,..."
4,"[Distrust, of, national, government, was, high...","\nDfn: An entity is a person (person), organis...",1. Distrust of national government | False | a...,"[0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 21, 0..."
...,...,...,...,...
651,"[The, 2017, French, presidential, election, ca...","\nDfn: An entity is a person (person), organis...",1. 2017 French presidential election | True | ...,"[0, 25, 26, 26, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
652,"[In, an, interview, with, Francis, Brochet, fo...","\nDfn: An entity is a person (person), organis...",1. Francis Brochet | True | as he is a person ...,"[0, 0, 0, 0, 51, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
653,"[The, American-led, intervention, in, the, Syr...","\nDfn: An entity is a person (person), organis...",1. American-led intervention in the Syrian Civ...,"[0, 29, 30, 30, 30, 29, 30, 30, 0, 0, 0, 21, 2..."
654,"[Nils, Courbaron, (, born, January, 25, ,, 199...","\nDfn: An entity is a person (person), organis...",1. Nils Courbaron | True | as he is a person (...,"[51, 52, 0, 0, 0, 0, 0, 0, 0, 35, 0, 21, 0, 0,..."
