In [2]:
from pathlib import Path
from time import time

import pandas as pd
from datasets import load_dataset

# Use Flair Benchmark Model
from flair.data import Sentence
from flair.models import SequenceTagger

In [16]:
# Sanity check: verify that the `datasets` library can download and load a dataset (uncomment to run)
# !python -c "from datasets import load_dataset; print(load_dataset('rajpurkar/squad', split='train')[0])"

In [17]:
# Load the three CoNLL-2003 splits with the `datasets` library
conll_train_ds = load_dataset("Rosenberg/conll2003", split="train")
conll_test_ds = load_dataset("Rosenberg/conll2003", split="test")
conll_valid_ds = load_dataset("Rosenberg/conll2003", split="validation")

# Switch every split to pandas output so that slicing yields a DataFrame
conll_train_ds.set_format("pandas")
conll_test_ds.set_format("pandas")
conll_valid_ds.set_format("pandas")

# Materialise each full split (`[:]`) as a DataFrame
conll_train_raw = conll_train_ds[:]
conll_test_raw = conll_test_ds[:]
conll_valid_raw = conll_valid_ds[:]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (a no-op when it is already there).
Path("../data/original").mkdir(parents=True, exist_ok=True)

# Save each split to CSV without the DataFrame index column
conll_train_raw.to_csv("../data/original/conll_train.csv", index=False)
conll_test_raw.to_csv("../data/original/conll_test.csv", index=False)
conll_valid_raw.to_csv("../data/original/conll_valid.csv", index=False)

In [18]:
def load_conll_data(file_path):
    """Parse a CoNLL-style text file into sentences of (word, label) pairs.

    Blank lines (after stripping whitespace) mark sentence boundaries. Lines
    that start with ``#`` are skipped, and so are lines that do not split
    into exactly four whitespace-separated fields. From each kept line the
    first field is taken as the word and the fourth as the label.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences; each sentence is a non-empty list of
        ``(word, label)`` tuples in file order.
    """
    parsed = []
    current = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            if not stripped:
                # Sentence boundary: flush the tokens gathered so far, if any.
                if current:
                    parsed.append(current)
                    current = []
                continue

            if stripped.startswith("#"):
                # Metadata / comment line
                continue

            fields = stripped.split()
            if len(fields) != 4:
                # Not the expected four-column layout; skip the line.
                continue

            current.append((fields[0], fields[3]))

    # The file may end without a trailing blank line.
    if current:
        parsed.append(current)
    return parsed

# Parse the extended training file into sentences of (word, label) pairs
extended_conll = load_conll_data(file_path="../data/extended/en_train.conll")

In [None]:
# Preview the first five parsed sentences
print(extended_conll[:5])

# Collect every distinct label that occurs anywhere in the corpus
unique_labels = {
    tag
    for tagged_sentence in extended_conll
    for _, tag in tagged_sentence
}

print(unique_labels)

[[('robert', 'B-OtherPER'), ('gottschalk', 'I-OtherPER'), ('1939', 'O'), ('academy', 'B-VisualWork'), ('award', 'I-VisualWork'), ('winner', 'O'), ('and', 'O'), ('founder', 'O'), ('of', 'O'), ('panavision', 'B-ORG')], [('during', 'O'), ('the', 'O'), ('reign', 'O'), ('of', 'O'), ('the', 'O'), ('tongzhi', 'B-OtherPER'), ('emperor', 'I-OtherPER'), ('(', 'O'), ('r', 'O'), ('.', 'O'), ('1861', 'O'), ('–', 'O'), ('1875', 'O'), (')', 'O'), (':', 'O')], [('further', 'O'), ('research', 'O'), ('led', 'O'), ('in', 'O'), ('the', 'O'), ('1960s', 'O'), ('to', 'O'), ('the', 'O'), ('bahadur', 'B-OtherPER'), ('representation', 'O'), ('which', 'O'), ('provides', 'O'), ('information', 'O'), ('about', 'O'), ('the', 'O'), ('errorbounds', 'O'), ('.', 'O')], [('the', 'O'), ('ideas', 'O'), ('were', 'O'), ('introduced', 'O'), ('by', 'O'), ('william', 'B-OtherPER'), ('burnside', 'I-OtherPER'), ('at', 'O'), ('the', 'O'), ('end', 'O'), ('of', 'O'), ('the', 'O'), ('nineteenth', 'O'), ('century', 'O'), ('.', 'O')], 

In [23]:
# Timestamp taken just before loading; used below to report the elapsed time
model_load_time = time()

# Load the NER model
tagger = SequenceTagger.load("flair/ner-english-large")
print(f"Model loaded in {time() - model_load_time:.2f} seconds")

# Create the target folder first so that a missing ../models directory cannot
# make the save fail after the (long) load above; a no-op if it already exists.
Path("../models").mkdir(parents=True, exist_ok=True)

# Keep a local copy of the model so later cells can load it from disk
tagger.save("../models/ner-large.pt")

pytorch_model.bin:   3%|2         | 62.9M/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2025-02-27 23:52:41,065 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Model loaded in 938.94 seconds


In [29]:
# Reload the tagger from the locally saved copy
tagger = SequenceTagger.load("../models/ner-large.pt")

# Smoke-test the model on a hand-written example
sample_text = "Ko Khant and Winn is sad about Donald Trump and Elon's DOGE putting DEI operations to stop. They also stopped the operation of USAID"
sentence = Sentence(sample_text)
tagger.predict(sentence)

# Show the sentence together with the entity spans the tagger assigned
tagged_output = sentence.to_tagged_string()
print(tagged_output)


2025-02-28 13:00:42,849 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Sentence[26]: "Ko Khant and Winn is sad about Donald Trump and Elon's DOGE putting DEI operations to stop. They also stopped the operation of USAID" → ["Ko Khant and Winn"/PER, "Donald Trump"/PER, "Elon"/PER, "DOGE"/ORG, "DEI"/ORG, "USAID"/ORG]
