In [1]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from datasets import Dataset
import os

In [3]:
# Open the Arrow file as a dataset object.
# NOTE(review): `data` is not referenced by any later cell shown here.
data = Dataset.from_file("data-00000-of-00001.arrow")

# Directory that receives the generated CoNLL files; created if missing.
output_dir = "./data_conll"
os.makedirs(output_dir, exist_ok=True)

# Integer tag id -> BIO-style label string; each id is the label's position
# in the tuple below.
tag_mapping = dict(enumerate(
    ("O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC")
))

def write_to_conll(data_split, output_file, label_names=None):
    """Write a token-classification split to a two-column CoNLL file.

    Every token is written on its own line as ``<token> <tag>`` and a blank
    line follows each example so that sentences stay separated.

    Args:
        data_split: Iterable of examples; each example is indexed with
            ``"tokens"`` and ``"ner_tags"``.
        output_file: Path of the text file to create. An existing file is
            overwritten.
        label_names: Optional mapping from the values found in ``"ner_tags"``
            to tag strings. When omitted, the notebook-level ``tag_mapping``
            is used.

    Raises:
        KeyError: If a tag value is missing from the mapping.
        ValueError: If an example has a different number of tokens and tags.
    """
    if label_names is None:
        label_names = tag_mapping

    with open(output_file, 'w', encoding='utf-8') as f:
        for example in data_split:
            tokens = example["tokens"]
            tags = [label_names[tag] for tag in example["ner_tags"]]

            # zip() would silently drop the surplus items, so a malformed
            # example is reported instead of being written truncated.
            if len(tokens) != len(tags):
                raise ValueError(
                    f"example has {len(tokens)} tokens but {len(tags)} tags"
                )

            f.writelines(f"{token} {tag}\n" for token, tag in zip(tokens, tags))
            f.write("\n")  # blank line = sentence boundary

In [4]:
# Convert the test split to CoNLL format.
# NOTE(review): this loads the same "data-00000-of-00001.arrow" path as the
# train and validation cells. Confirm the test split's arrow file is the one
# at this path when the cell runs, otherwise all three outputs are identical.
test_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(test_data, os.path.join(output_dir, "test.txt"))

In [5]:
# Convert the train split to CoNLL format.
# NOTE(review): this loads the same "data-00000-of-00001.arrow" path as the
# test and validation cells. Confirm the train split's arrow file is the one
# at this path when the cell runs, otherwise all three outputs are identical.
train_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(train_data, os.path.join(output_dir, "train.txt"))

In [6]:
# Convert the validation split to CoNLL format.
# NOTE(review): this loads the same "data-00000-of-00001.arrow" path as the
# test and train cells. Confirm the validation split's arrow file is the one
# at this path when the cell runs, otherwise all three outputs are identical.
validation_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(validation_data, os.path.join(output_dir, "validation.txt"))

In [3]:
!pip install flair

Defaulting to user installation because normal site-packages is not writeable


In [4]:
from flair.models import SequenceTagger

# Path to the saved model checkpoint
model_path = "best-model.pt"

# Load the sequence tagger from that checkpoint; `tagger` is used by the
# prediction cell further down.
tagger = SequenceTagger.load(model_path)

2025-01-19 12:28:07,553 SequenceTagger predicts: Dictionary with 15 tags: O, S-ORG, B-ORG, E-ORG, I-ORG, S-PER, B-PER, E-PER, I-PER, S-LOC, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [5]:
from flair.datasets import ColumnCorpus

# Folder that contains the CoNLL-formatted evaluation file ('test_it.txt').
data_folder = "./Data_it/test_it"

# Build the corpus from the test file only; no train or dev file is passed.
corpus = ColumnCorpus(
    data_folder,
    column_format={0: 'text', 1: 'ner'},
    test_file="test_it.txt",
)

# Tag and display only the first `max_sentences` test sentences.
max_sentences = 10
for i, sentence in zip(range(max_sentences), corpus.test):
    # Run the tagger on this sentence.
    tagger.predict(sentence)

    # Show the sentence itself ...
    print(sentence)

    # ... followed by one line per recognised entity span.
    for entity in sentence.get_spans('ner'):
        print(f"Entity: {entity.text}, Type: {entity.get_label('ner').value}, Confidence: {entity.score:.4f}")

2025-01-19 12:28:29,342 Reading data from Data_it/test_it
2025-01-19 12:28:29,342 Train: None
2025-01-19 12:28:29,343 Dev: None
2025-01-19 12:28:29,343 Test: Data_it/test_it/test_it.txt
Sentence[2]: "Washington Mystics" → ["Washington Mystics"/ORG]
Entity: Washington Mystics, Type: ORG, Confidence: 0.4670
Sentence[6]: "RINVIA Servizio ferroviario metropolitano di Bari" → ["RINVIA Servizio ferroviario metropolitano di Bari"/ORG]
Entity: RINVIA Servizio ferroviario metropolitano di Bari, Type: ORG, Confidence: 0.4094
Sentence[14]: "Magnus Gustafsson ha battuto in finale 67 4 63 76 5 61 Raemon Sluiter" → ["Magnus Gustafsson"/PER]
Entity: Magnus Gustafsson, Type: PER, Confidence: 0.5961
Sentence[4]: "Iva Majoli secondo turno" → ["Iva Majoli"/PER, "turno"/PER]
Entity: Iva Majoli, Type: PER, Confidence: 0.6181
Entity: turno, Type: PER, Confidence: 0.5268
Sentence[8]: "RINVIA Stazione sperimentale per lindustria delle conserve alimentari" → ["sperimentale per lindustria delle conserve aliment

In [8]:
import spacy
from spacy import displacy

# Load the spaCy model
nlp = spacy.load("it_core_news_sm")  # `it_*` is spaCy's Italian pipeline (the original comment said German)

# Example text
text = "Al suo arrivo a Belgrado è stata accolta da molti come uneroina"

# Process the text
doc = nlp(text)

# Visualise the entities
displacy.render(doc, style="ent", page=True)
