In [1]:
from vroom.NER import *
from vroom.alias import *
from vroom.baseline import *

In [45]:
#path = "../data/kaggle/prelude_a_fondation/chapter_1.txt.preprocessed"
path = "../../data/kaggle/prelude_a_fondation/chapter_1.txt.preprocessed"
entities, _ = get_entities_from_file(path)

entities = [entity for sublist in entities for entity in sublist]


# for entity in entities:
#     print(entity["word"])

In [46]:
get_aliases_jaro_winkler(entities, 0.60)

[['cléon_ier—', 'seldon', 'sire', 'cléon'],
 ['empereur', 'empereurs'],
 ['hari_seldon'],
 ['eto_demerzel', 'demerzel'],
 ['lieutenant_alban_wellis'],
 ['wellis'],
 ['trantor'],
 ['hummin']]

In [47]:
aliases = get_aliases_fuzzy_partial_token(entities, 90)
print("===== Fuzzy done =====")
for alias in aliases:
    print(alias)

===== Fuzzy done =====
['CLÉON Ier—', 'Cléon']
['Empereur', 'Empereurs']
['Hari Seldon', 'Seldon']
['Eto Demerzel', 'Demerzel']
['Sire']
['Lieutenant Alban Wellis', 'Wellis']
['Trantor']
['Hummin']


In [48]:
aliases = get_aliases_fuzzy_partial_token(entities, 90)
print("===== Fuzzy done =====")
for alias in aliases:
    print(alias)

===== Fuzzy done =====
['CLÉON Ier—', 'Cléon']
['Empereur', 'Empereurs']
['Hari Seldon', 'Seldon']
['Eto Demerzel', 'Demerzel']
['Sire']
['Lieutenant Alban Wellis', 'Wellis']
['Trantor']
['Hummin']


## Evaluating aliases resolution

In [55]:
from vroom.NER import *
from vroom.metrics import evaluate
import os

unlabeled_chapter = os.path.join("../../data", "test_set", "prelude_a_fondation", "chapter_1.unlabeled")
labeled_chapter = os.path.join("../../data", "test_set", "prelude_a_fondation", "chapter_1.labeled")

entities, chunks = get_entities_from_file(unlabeled_chapter, device="cuda")

all_entities_names = []
for chunk in entities: 
    all_entities_names += [
        entity["word"] for entity in chunk
    ]
entities = set(all_entities_names)

unlabeled_text_tagged = tag_text_with_entities(unlabeled_chapter, entities)

labeled_text_tagged = read_file(labeled_chapter)

import re


def separate_words(text):
    # Use regular expression to find words and punctuation, including special words like <PER> and </PERS>
    words = re.findall(r'\b\w+\b|[.,;!?<>/]+|<PER>|</PER>', text)
    return words

def merge_special_words(word_list):
    merged_list = []
    current_word = ""

    for word in word_list:
        if word in ['<', 'PER', '</', '>']:
            current_word += word
        else:
            if current_word:
                merged_list.append(current_word)
                current_word = ""
            merged_list.append(word)

    if current_word:
        merged_list.append(current_word)

    return merged_list

def get_positions_of_entities(text):
    words = separate_words(text)
    words = merge_special_words(words)
    positions = {}
    current_entity = []
    current_entity_start = 0
    current_entity_end = 0
    i = 0
    for word in words:
        if word == "<PER>":
            current_entity = []
            current_entity_start = i
        elif word == "</PER>":
            current_entity_end = i
            positions[(current_entity_start, current_entity_end)] = ' '.join(current_entity)
        else:
            i += 1
            current_entity.append(word)
    return positions

In [62]:
entities_raw = get_positions_of_entities(labeled_text_tagged)
entities = []
for entity in entities_raw:
    if entities_raw[entity] not in entities:
        entities.append({"word": entities_raw[entity]})
print(entities)

[{'word': 'CLÉON Ier'}, {'word': 'Empereur'}, {'word': 'Hari Seldon'}, {'word': 'Seldon'}, {'word': 'Cléon'}, {'word': 'Seldon'}, {'word': 'Cléon'}, {'word': 'Eto Demerzel'}, {'word': 'Cléon'}, {'word': 'Cléon'}, {'word': 'Demerzel'}, {'word': 'Hari Seldon'}, {'word': 'Cléon'}, {'word': 'Hari Seldon'}, {'word': 'Demerzel'}, {'word': 'Demerzel'}, {'word': 'Sire'}, {'word': 'Sire'}, {'word': 'Sire'}, {'word': 'Demerzel'}, {'word': 'l Empereur'}, {'word': 'Demerzel'}, {'word': 'Sire'}, {'word': 'Demerzel'}, {'word': 'Lieutenant Alban Wellis'}, {'word': 'Wellis'}, {'word': 'Seldon'}, {'word': 'l Empereur'}, {'word': 'Seldon'}, {'word': 'l Empereur'}, {'word': 'l Empereur'}, {'word': 'Seldon'}, {'word': 'Seldon'}, {'word': 'Hari Seldon'}, {'word': 'Seldon'}, {'word': 'Seldon'}, {'word': 'Seldon'}, {'word': 'Hari Seldon'}, {'word': 'Seldon'}, {'word': 'Sire'}, {'word': 'Seldon'}, {'word': 'l empereur Cléon'}, {'word': 'Cléon'}, {'word': 'Seldon'}, {'word': 'L Empereur'}, {'word': 'Seldon'}, 

### Evaluating aliases resolution

In [75]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def calculate_alias_metrics(baseline_aliases, test_aliases):
    # Flatten both baseline and test lists to sets for easy comparison
    baseline_set = {alias for aliases in baseline_aliases for alias in aliases}
    test_set = {alias for aliases in test_aliases for alias in aliases}

    # Calculate true positives, false positives, and false negatives
    true_positives = len(baseline_set.intersection(test_set))
    false_positives = len(test_set.difference(baseline_set))
    false_negatives = len(baseline_set.difference(test_set))

    # Calculate precision, recall, and F1-score
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # Calculate accuracy
    accuracy = true_positives / len(baseline_set.union(test_set))

    # Create a dictionary to store the measurements
    measurements = {
        "true_positive": true_positives,
        "false_positive": false_positives,
        "false_negative": false_negatives,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "accuracy": accuracy
    }

    return measurements

In [79]:
file_path = '../../data/test_set/prelude_a_fondation/chapter_1.aliases'

baseline_aliases = []

with open(file_path, 'r') as file:
    for line in file:
        elements = line.strip().split(',')
        baseline_aliases.append(elements)

for alias in baseline_aliases:
    print(alias)

['CLÉON Ier', ' Empereur', ' Cléon', ' Sire', " l'Empereur", " l'empereur Cléon"]
['Hari Seldon', ' Seldon']
['Eto Demerzel', ' Demerzel']
['Lieutenant Alban Wellis', ' Wellis']
['Hummin']


In [77]:
aliases = get_aliases_fuzzy_partial_token(entities, 90)
print("===== Fuzzy done =====")
for alias in aliases:
    print(alias)

results = calculate_alias_metrics(baseline_aliases, aliases)
for metric, value in results.items():
    print(f"{metric.capitalize()}: {value}")

===== Fuzzy done =====
['CLÉON Ier', 'l empereur Cléon', 'Cléon']
['Empereur', 'l Empereur']
['Hari Seldon', 'Seldon']
['Eto Demerzel', 'Demerzel']
['Sire']
['Lieutenant Alban Wellis', 'Wellis']
['Hummin']
True_positive: 5
False_positive: 8
False_negative: 8
Precision: 0.38461538461538464
Recall: 0.38461538461538464
F1_score: 0.38461538461538464
Accuracy: 0.23809523809523808


In [78]:
aliases = get_aliases_fuzzy_partial_token(entities, 90)
print("===== Fuzzy done =====")
for alias in aliases:
    print(alias)

results = calculate_alias_metrics(baseline_aliases, aliases)
for metric, value in results.items():
    print(f"{metric.capitalize()}: {value}")

===== Fuzzy done =====
['CLÉON Ier', 'l empereur Cléon', 'Cléon']
['Empereur', 'l Empereur']
['Hari Seldon', 'Seldon']
['Eto Demerzel', 'Demerzel']
['Sire']
['Lieutenant Alban Wellis', 'Wellis']
['Hummin']
True_positive: 5
False_positive: 8
False_negative: 8
Precision: 0.38461538461538464
Recall: 0.38461538461538464
F1_score: 0.38461538461538464
Accuracy: 0.23809523809523808
