In [12]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from datasets import Dataset
import os

In [3]:
# Directory that receives the generated CoNLL files (created if missing).
output_dir = "./data_conll"
os.makedirs(output_dir, exist_ok=True)

# Open the Arrow file as a Dataset.
data = Dataset.from_file("data-00000-of-00001.arrow")

# Integer tag id -> tag string; the id is the position of the label in this tuple.
tag_mapping = dict(
    enumerate(("O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"))
)

def write_to_conll(data_split, output_file, mapping=None):
    """Write a token-classification split to a two-column CoNLL file.

    Every token becomes one ``"<token> <tag>"`` line; a blank line is written
    after each example to separate sentences.

    Args:
        data_split: Iterable of examples. Each example is indexed with
            ``example["tokens"]`` and ``example["ner_tags"]``.
        output_file: Path of the file to create or overwrite (UTF-8).
        mapping: Optional ``{tag_id: tag_string}`` dict. When omitted, the
            module-level ``tag_mapping`` is used.

    Raises:
        KeyError: If a tag id is not present in the mapping.
        ValueError: If an example has a different number of tokens and tags.
    """
    if mapping is None:
        mapping = tag_mapping

    with open(output_file, 'w', encoding='utf-8') as f:
        for example in data_split:
            tokens = example["tokens"]
            tag_ids = example["ner_tags"]
            # zip() would silently drop the surplus items and produce a
            # misaligned sentence, so fail loudly instead.
            if len(tokens) != len(tag_ids):
                raise ValueError(
                    f"tokens/ner_tags length mismatch: "
                    f"{len(tokens)} tokens vs {len(tag_ids)} tags"
                )
            tags = [mapping[tag_id] for tag_id in tag_ids]
            f.writelines(f"{token} {tag}\n" for token, tag in zip(tokens, tags))
            f.write("\n")  # sentence separation

In [4]:
# Export the test split to <output_dir>/test.txt.
# NOTE(review): this cell reads the same "data-00000-of-00001.arrow" path as the
# train and validation export cells. Confirm that each split points at its own
# arrow file; otherwise the three CoNLL files are identical.
test_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(test_data, os.path.join(output_dir, "test.txt"))

In [5]:
# Export the train split to <output_dir>/train.txt.
# NOTE(review): this cell reads the same "data-00000-of-00001.arrow" path as the
# test and validation export cells. Confirm that each split points at its own
# arrow file; otherwise the three CoNLL files are identical.
train_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(train_data, os.path.join(output_dir, "train.txt"))

In [6]:
# Export the validation split to <output_dir>/validation.txt.
# NOTE(review): this cell reads the same "data-00000-of-00001.arrow" path as the
# test and train export cells. Confirm that each split points at its own
# arrow file; otherwise the three CoNLL files are identical.
validation_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(validation_data, os.path.join(output_dir, "validation.txt"))

In [3]:
!pip install flair

Defaulting to user installation because normal site-packages is not writeable


In [4]:
from flair.models import SequenceTagger

# Path to the saved model
model_path = "best-model.pt"

# Load the model
tagger = SequenceTagger.load(model_path)

2025-01-19 12:22:21,262 SequenceTagger predicts: Dictionary with 15 tags: O, S-ORG, B-ORG, E-ORG, I-ORG, S-PER, B-PER, E-PER, I-PER, S-LOC, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [9]:
from flair.datasets import ColumnCorpus

# Path to the directory that holds the CoNLL files
data_folder = "Data_fr/test_fr"  # the directory that contains 'test_fr.txt' (passed as test_file below)

# Build the ColumnCorpus from the test file only (no train/dev files are given);
# column 0 is read as the token text and column 1 as the 'ner' annotation.
corpus = ColumnCorpus(data_folder, 
                      column_format={0: 'text', 1: 'ner'},
                      test_file="test_fr.txt")

# Only iterate over the first 10 sentences
max_sentences = 10
for i, sentence in enumerate(corpus.test):
    if i >= max_sentences:
        break
    
    # Run the tagger on this sentence
    # NOTE(review): the corpus also loads an 'ner' column from the file; presumably
    # predict() replaces those annotations on the sentence — confirm if gold labels
    # are needed afterwards.
    tagger.predict(sentence)
    
    # Print the sentence
    print(sentence)
    
    # Print each 'ner' span with its label and score
    for entity in sentence.get_spans('ner'):
        print(f"Entity: {entity.text}, Type: {entity.get_label('ner').value}, Confidence: {entity.score:.4f}")

2025-01-19 12:25:21,893 Reading data from Data_fr/test_fr
2025-01-19 12:25:21,893 Train: None
2025-01-19 12:25:21,893 Dev: None
2025-01-19 12:25:21,893 Test: Data_fr/test_fr/test_fr.txt
Sentence[3]: "Upton Park exempt" → ["Upton Park exempt"/ORG]
Entity: Upton Park exempt, Type: ORG, Confidence: 0.4499
Sentence[4]: "LL Cool J feat" → ["LL Cool J feat"/ORG]
Entity: LL Cool J feat, Type: ORG, Confidence: 0.3830
Sentence[2]: "Clay Regazzoni" → ["Clay Regazzoni"/PER]
Entity: Clay Regazzoni, Type: PER, Confidence: 0.6210
Sentence[6]: "Liste des communes de la HauteSaône" → ["Liste des communes de la HauteSaône"/LOC]
Entity: Liste des communes de la HauteSaône, Type: LOC, Confidence: 0.7816
Sentence[7]: "Il y rencontre le président Bachar elAssad" → ["Il y rencontre le président Bachar elAssad"/ORG]
Entity: Il y rencontre le président Bachar elAssad, Type: ORG, Confidence: 0.4018
Sentence[3]: "Per Johan Axelsson" → ["Axelsson"/PER]
Entity: Axelsson, Type: PER, Confidence: 0.6319
Sentence[5]:

In [14]:
import spacy
from spacy import displacy

# Load the spaCy pipeline
nlp = spacy.load("fr_core_news_sm")  # "fr_" model, i.e. a French pipeline (not German)

# Example text
text = "Liste des communes de la HauteSaône"

# Process the text
doc = nlp(text)

# Visualize the entities found in the document
displacy.render(doc, style="ent", page=True)