In [2]:
import numpy as np
import evaluate
import pickle

from utils import read_jsonl, continuous_string, label_distribution

from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline
from huggingface_hub import notebook_login
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split

In [13]:
# Function to read NER data
def read_universal_NER(file_path):
    with open(file_path, 'r', encoding = 'utf-8') as infile:
        # Split into lines
        lines = infile.readlines()

        # Define lists to store data 
        sentences = []
        labels = []
        current_sentence = []
        current_labels = []

        # Iterate over lines
        for line in lines:

            line = line.strip() # Remove whitespace
            if not line: # Skip empty lines
                continue

            # Check if line starts with sentence ID
            if line.startswith('# sent_id'):
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                current_sentence = []
                current_labels = []

            # Check for token lines
            elif not line.startswith("#"):
                parts = line.strip().split('\t')
                current_sentence.append(parts[1])
                current_labels.append(parts[2])

        if current_sentence:
            sentences.append(current_sentence)
            labels.append(current_labels)

    # Flatten lists
    #sentences = sum(sentences, [])
    #labels = sum(labels, [])

    return sentences, labels

In [15]:
test_sents, test_labels = read_universal_NER('train_dev_test_sets/test.iob2')
train_sents, train_labels = read_universal_NER('train_dev_test_sets/train.iob2')
dev_sents, dev_labels = read_universal_NER('train_dev_test_sets/dev.iob2')

In [114]:
def split_list_and_labels(strings, labels, num_sublists):
    # Initialize the sublists for strings and labels
    sublists_strings = [[] for _ in range(num_sublists)]
    sublists_labels = [[] for _ in range(num_sublists)]
    
    # Distribute the strings and labels into sublists
    for i, (string, label) in enumerate(zip(strings, labels)):
        sublists_strings[i % num_sublists].append(string)
        sublists_labels[i % num_sublists].append(label)
    
    return sublists_strings, sublists_labels

In [115]:
num_sublists = 800
test_sents_split, test_labels_split = split_list_and_labels(test_sents[0], test_labels[0], num_sublists)
train_sents_split, train_labels_split = split_list_and_labels(train_sents[0], train_labels[0], num_sublists)
dev_sents_split, dev_labels_split = split_list_and_labels(dev_sents[0], dev_labels[0], num_sublists)

In [117]:
dev_labels_split

[['O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-ORG',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'I-ORG',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'B-PER'],
 ['O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'B-PER',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-PER',
  'O',
  'B-ORG',
  'O',
  'O',
  'O'],
 ['O',
  'B-PER',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'B-MON',
  'O

In [67]:
# Label Mappng
label2id = {
 'O': 0,
 'B-LOC': 1,
 'I-LOC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-ORG': 5,
 'I-ORG': 6,
 'B-TIM': 7,
 'I-TIM': 8,
 'B-MON': 9,
 'I-MON': 10,
 'B-DAT': 11,
 'I-DAT': 12,
 'B-PCT': 13,
 'I-PCT': 14,
}

id2label = {v: k for k, v in label2id.items()}

In [80]:
label_names = list(label2id.keys())

In [68]:
def labels_map2id(labels, mapping):
    label_ids = []
    for sent in labels:
        label_id = [mapping.get(label, label) for label in sent]
        label_ids.append(label_id)
    return label_ids

In [69]:
train_label_ids = labels_map2id(train_labels_split, label2id)
dev_label_ids = labels_map2id(dev_labels_split, label2id)
test_label_ids = labels_map2id(test_labels_split, label2id)

In [70]:
train_label_ids

[[5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  9,
  0,
  0,
  0,
  0,
  0,
  3,
  5,
  0,
  0,
  4,
  3,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  9,
  4,
  0],
 [6,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  3,
  3,
  0,
  0,
  3,
  10,
  0,
  0,
  0,
  0,
  0,
  4,
  6,
  3,
  0,
  4,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  5,
  10,
  0,
  0],
 [6,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  4,
  0,
  0,
  0,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  4,
  0,
  4,
  0,
  0,
  0,
  0,
  0,
  11,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  0,
  6,
  0,
  5,
  0],
 [6,
  0,
  0,
  0,
  0,
  0,
  0,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  12,
  0,
  0,
  0,
  5,
  0,
  0,
  6

In [86]:
# Create datasets
train_dataset = Dataset.from_dict({"tokens": train_sents_split, "ner_tags": train_label_ids})
dev_dataset = Dataset.from_dict({"tokens": dev_sents_split, "ner_tags": dev_label_ids})
test_dataset = Dataset.from_dict({"tokens": test_sents_split, "ner_tags": test_label_ids})

In [87]:
raw_datasets = DatasetDict({
    "train": train_dataset,
    "dev": dev_dataset,
    "test": test_dataset,
})

# Print to verify
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 800
    })
    dev: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 800
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 800
    })
})


In [74]:
model_name = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [234]:
#inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words = True, truncation = True)

In [233]:
#inputs.tokens()

In [236]:
#inputs.word_ids()

In [88]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [89]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [90]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [91]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
})

In [78]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

In [81]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [82]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [242]:
# notebook_login()

In [98]:
args = TrainingArguments(
    "bert-finetuned-ner-2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [105]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  0%|          | 0/300 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [100]:
trainer.push_to_hub(commit_message="Training complete 3")

CommitInfo(commit_url='https://huggingface.co/ChristianSneffeFleischer/bert-finetuned-ner-2/commit/5a0b9a3f0c817fe17884daaaa9eb5a5dfdc19ffc', commit_message='Training complete 3', commit_description='', oid='5a0b9a3f0c817fe17884daaaa9eb5a5dfdc19ffc', pr_url=None, pr_revision=None, pr_num=None)

In [106]:
eval_results = trainer.evaluate()
print(eval_results)

  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 0.22498416900634766, 'eval_precision': 0.7983304042179262, 'eval_recall': 0.8173639226270806, 'eval_f1': 0.8077350522338298, 'eval_accuracy': 0.9270537700391195, 'eval_runtime': 2.6968, 'eval_samples_per_second': 296.648, 'eval_steps_per_second': 37.081, 'epoch': 0.05}
{'eval_loss': 0.22498416900634766, 'eval_precision': 0.7983304042179262, 'eval_recall': 0.8173639226270806, 'eval_f1': 0.8077350522338298, 'eval_accuracy': 0.9270537700391195, 'eval_runtime': 2.6968, 'eval_samples_per_second': 296.648, 'eval_steps_per_second': 37.081, 'epoch': 0.05}


In [109]:
# Assume you have a test dataset
test_dataset = tokenized_datasets["test"]

# Make predictions on the test set
predictions, labels, _ = trainer.predict(test_dataset)

  0%|          | 0/100 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


In [110]:
def write_iob2_predictions_shifted(test_sents, predictions, output_file):
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for i, (sent, prediction) in enumerate(zip(test_sents, predictions)):
            outfile.write(f"# sent_id = test-{i+1:04d}\n")
            outfile.write(f"# text = {' '.join(sent)}\n")
            # Shift predictions up by one position
            shifted_predictions = prediction[1:] + ["O"]
            for j, (word, label) in enumerate(zip(sent, shifted_predictions)):
                outfile.write(f"{j+1}\t{word}\t{label}\t-\t-\n")
            outfile.write("\n")

In [111]:
# Convert predictions to label indices
predictions = np.argmax(predictions, axis=-1)

In [112]:
# Convert label indices to label names
true_predictions = [
    [id2label[p] for p in prediction]
    for prediction in predictions
]

# Write the shifted predictions to a file
write_iob2_predictions_shifted(test_sents, true_predictions, 'test_predictions_our_data.iob2')

________________________________________________________________

In [3]:
# Testing

model_checkpoint = "ChristianSneffeFleischer/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("In Argentina , beef is revered , respected , and praised .")



[{'entity_group': 'LOC',
  'score': 0.99479514,
  'word': 'Argentina',
  'start': 3,
  'end': 12}]