In [1]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from datasets import Dataset
import os

In [3]:
# Open the Arrow file as a Dataset
data = Dataset.from_file("data-00000-of-00001.arrow")

# Directory that receives the exported CoNLL files (created if missing)
output_dir = "./data_conll"
os.makedirs(output_dir, exist_ok=True)

# Tag id -> tag string; the position of each label in the list is its id
tag_mapping = dict(enumerate([
    "O",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
]))

def write_to_conll(data_split, output_file, tag_map=None):
    """Write a token-classification split to a two-column CoNLL text file.

    Every example contributes one ``<token> <tag>`` line per token, followed
    by an empty line that separates sentences.

    Args:
        data_split: Iterable of examples; each example is indexed with
            ``"tokens"`` and ``"ner_tags"``.
        output_file: Path of the UTF-8 text file to create or overwrite.
        tag_map: Optional mapping from tag id to tag string. When omitted,
            the module-level ``tag_mapping`` is used.

    Raises:
        KeyError: If a tag id is not present in the mapping.
        ValueError: If an example has a different number of tokens and tags
            (``zip`` would otherwise silently drop the surplus items).
    """
    if tag_map is None:
        tag_map = tag_mapping

    # Make sure the target directory exists so the function also works when
    # it is called before (or without) the directory-creating setup code.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        for example in data_split:
            tokens = example["tokens"]
            tag_ids = example["ner_tags"]
            if len(tokens) != len(tag_ids):
                raise ValueError(
                    f"Example has {len(tokens)} tokens but {len(tag_ids)} tags"
                )
            # Resolve all tags first so an unknown id fails before any line
            # of this sentence is written.
            lines = [
                f"{token} {tag_map[tag_id]}\n"
                for token, tag_id in zip(tokens, tag_ids)
            ]
            f.writelines(lines)
            f.write("\n")  # sentence separation

In [4]:
# Export the test split as CoNLL (the variable is named after the split that is written).
# NOTE(review): the train, validation and test cells all read the same Arrow
# file name, so a fresh Restart & Run All writes three identical files —
# point each cell at its own split file. TODO confirm the real paths.
test_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(test_data, os.path.join(output_dir, "test.txt"))

In [5]:
# Export the train split as CoNLL (the variable is named after the split that is written).
# NOTE(review): the train, validation and test cells all read the same Arrow
# file name, so a fresh Restart & Run All writes three identical files —
# point each cell at its own split file. TODO confirm the real paths.
train_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(train_data, os.path.join(output_dir, "train.txt"))

In [6]:
# Export the validation split as CoNLL (the variable is named after the split that is written).
# NOTE(review): the train, validation and test cells all read the same Arrow
# file name, so a fresh Restart & Run All writes three identical files —
# point each cell at its own split file. TODO confirm the real paths.
validation_data = Dataset.from_file("data-00000-of-00001.arrow")
write_to_conll(validation_data, os.path.join(output_dir, "validation.txt"))

In [3]:
!pip install flair

Defaulting to user installation because normal site-packages is not writeable
Collecting flair
  Downloading flair-0.15.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.36.2-py3-none-any.whl.metadata (6.6 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting deprecated>=1.2.13 (from flair)
  Downloading Deprecated-1.2.15-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting gdown>=4.4.0 (from flair)
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting mpld3>=0.3 (from flair)
  Downloadi

In [4]:
from flair.models import SequenceTagger

# Path to the saved model
model_path = "best-model.pt"

# Load the model; `tagger` is the SequenceTagger instance built from that file
tagger = SequenceTagger.load(model_path)


2025-01-19 11:29:27,170 SequenceTagger predicts: Dictionary with 15 tags: O, S-ORG, B-ORG, E-ORG, I-ORG, S-PER, B-PER, E-PER, I-PER, S-LOC, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [7]:
from flair.datasets import ColumnCorpus

# Folder containing the CoNLL-formatted file 'test_de.txt'
data_folder = "./Data_de/test_de"

# Build a ColumnCorpus from the test file only:
# column 0 holds the token text, column 1 the NER tag
corpus = ColumnCorpus(
    data_folder,
    test_file="test_de.txt",
    column_format={0: 'text', 1: 'ner'},
)

# Walk through at most the first `max_sentences` test sentences
max_sentences = 10
for _, sentence in zip(range(max_sentences), corpus.test):
    # Run the tagger on this sentence
    tagger.predict(sentence)

    # Show the sentence itself
    print(sentence)

    # Show every recognised entity span with its label and confidence
    for entity in sentence.get_spans('ner'):
        print(f"Entity: {entity.text}, Type: {entity.get_label('ner').value}, Confidence: {entity.score:.4f}")


2025-01-19 11:33:21,116 Reading data from Data_de/test_de
2025-01-19 11:33:21,117 Train: None
2025-01-19 11:33:21,117 Dev: None
2025-01-19 11:33:21,117 Test: Data_de/test_de/test_de.txt
Sentence[3]: "WEITERLEITUNG Hu Xian" → ["WEITERLEITUNG Hu Xian"/PER]
Entity: WEITERLEITUNG Hu Xian, Type: PER, Confidence: 0.5638
Sentence[4]: "Katja Kipping Bernd Riexinger" → ["Katja Kipping"/PER, "Riexinger"/PER]
Entity: Katja Kipping, Type: PER, Confidence: 0.7049
Entity: Riexinger, Type: PER, Confidence: 0.5375
Sentence[12]: "Runde rammte Lorenzo Bandini an zweiter Stelle liegend die Streckenbegrenzung aus Strohballen" → ["Runde rammte Lorenzo Bandini"/PER, "zweiter Stelle"/PER]
Entity: Runde rammte Lorenzo Bandini, Type: PER, Confidence: 0.5897
Entity: zweiter Stelle, Type: PER, Confidence: 0.6117
Sentence[17]: "Im Doppel waren Marcelo Melo und André Sá die Titelverteidiger sie schieden in der ersten Runde aus" → ["Im Doppel"/PER, "Marcelo Melo"/PER, "André Sá"/PER, "Titelverteidiger sie"/PER, "de

In [8]:
pip install matplotlib


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [20]:
import spacy
from spacy import displacy

# Load the spaCy model
# NOTE(review): the entities shown by this cell come from this spaCy pipeline;
# the Flair `tagger` is not referenced here.
nlp = spacy.load("de_core_news_sm")  # for the German language

# Example text
text = "Das Cover zeigt sie zusammen mit Kate Moss und Gisele Bündchen."

# Process the text
doc = nlp(text)

# Visualize the entities
displacy.render(doc, style="ent", page=True)
