In [13]:
import numpy as np
import pandas as pd
import evaluate
import pickle

from utils import read_jsonl, continuous_string, label_distribution

from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline, DistilBertTokenizerFast
from huggingface_hub import notebook_login
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the Punkt sentence-tokenizer data used by sent_tokenize in preprocess_data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Function to read NER data
def read_universal_NER(file_path):
    """Parse a tab-separated IOB2 file into per-sentence token and label lists.

    Lines beginning with '# sent_id' open a new sentence, any other line
    beginning with '#' is ignored, and blank lines are skipped. For every
    remaining line the second tab-separated column is taken as the token and
    the third as its label.

    A file without any '# sent_id' line therefore yields a single "sentence"
    holding every token in the file.

    Returns
    -------
    (sentences, labels) : tuple of two lists of lists, aligned one-to-one.
    """
    sentences, labels = [], []
    pending_tokens, pending_tags = [], []

    with open(file_path, 'r', encoding = 'utf-8') as infile:
        for raw_line in infile:
            stripped = raw_line.strip()

            # Nothing to do for blank lines
            if stripped == '':
                continue

            # A sentence id closes the sentence collected so far (if any)
            if stripped.startswith('# sent_id'):
                if pending_tokens:
                    sentences.append(pending_tokens)
                    labels.append(pending_tags)
                pending_tokens, pending_tags = [], []
                continue

            # Every other comment line carries no token
            if stripped.startswith("#"):
                continue

            columns = stripped.split('\t')
            pending_tokens.append(columns[1])
            pending_tags.append(columns[2])

    # Flush the final sentence, which has no following '# sent_id' line
    if pending_tokens:
        sentences.append(pending_tokens)
        labels.append(pending_tags)

    return sentences, labels

In [16]:
# Load the pickled raw train/dev/test splits.
# NOTE: pickle.load can execute arbitrary code, so only point this at trusted files.
loaded_splits = []
for raw_path in (
    'train_dev_test_sets/train_raw.pkl',
    'train_dev_test_sets/dev_raw.pkl',
    'train_dev_test_sets/test_raw.pkl',
):
    with open(raw_path, 'rb') as infile:
        loaded_splits.append(pickle.load(infile))

train, dev, test = loaded_splits

In [17]:
def format_gold_to_iob2_bert(gold_data, tokenizer = None):
    """Convert span-annotated documents into tab-separated IOB2 lines of tokenizer tokens.

    Parameters
    ----------
    gold_data : iterable of dict
        Each document provides 'text' (the raw string) and 'label' (an iterable
        of (start, end, label) character spans).
    tokenizer : optional
        Tokenizer to use. It must support ``tokenizer(text, return_offsets_mapping=True)``
        and ``convert_ids_to_tokens``. When omitted, the 'distilbert-base-cased'
        DistilBertTokenizerFast is loaded, as before. Passing one in avoids
        reloading it on every call.

    Returns
    -------
    str
        One line per token in the form "<index>\\t<token>\\t<tag>\\t-\\t-". The index
        keeps counting across documents and the string ends with an empty line.
        '[CLS]' and '[SEP]' tokens are always tagged 'O'.

    Notes
    -----
    Only tokens lying entirely inside a span are tagged. The first such token
    gets 'B-<label>' and the following ones 'I-<label>'. Texts are not
    truncated, so long documents make the tokenizer emit a sequence-length warning.
    """
    if tokenizer is None:
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased') # Define model for tokenization

    special_tokens = ('[CLS]', '[SEP]')
    output = []
    line_idx = 1

    for document in gold_data:
        text = document['text']

        encoded = tokenizer(text, return_offsets_mapping = True) # Tokenize and get offsets
        tokens = tokenizer.convert_ids_to_tokens(encoded.input_ids) # Get token texts
        offsets = encoded.offset_mapping

        labels = sorted(document['label'], key = lambda x: x[0])  # Sort labels by start index

        # Prepare the tags
        tags = ['O'] * len(tokens)

        # Apply IOB tagging
        for start, end, label in labels:
            started = False
            for i, (token_start, token_end) in enumerate(offsets):
                # Special tokens must not take part in tagging: a zero-width
                # offset at position 0 would otherwise match any span starting
                # at 0, mark the entity as already started and leave the first
                # real token with an I- tag instead of B-.
                if tokens[i] in special_tokens:
                    continue
                if token_start >= start and token_end <= end:
                    if token_start == start or not started:
                        tags[i] = f'B-{label}'
                        started = True
                    else:
                        tags[i] = f'I-{label}'

        # Generate the formatted output; special tokens were never tagged, so they stay 'O'
        for token_idx, token_text in enumerate(tokens):
            output.append(f"{line_idx}\t{token_text}\t{tags[token_idx]}\t-\t-")
            line_idx += 1

    output.append('\n')
    return "\n".join(output)

In [18]:
# Convert train, dev, and test to iob2
iob2_sets = {
    'train': format_gold_to_iob2_bert(train),
    'dev': format_gold_to_iob2_bert(dev),
    'test': format_gold_to_iob2_bert(test)
}

# Save sets to disk
for name, iob2_set in iob2_sets.items():
    with open(f'train_dev_test_sets/{name}_for_baseline.iob2', 'w', encoding = 'utf-8') as outfile:
        # Each set is a single newline-joined string, so write it in one call;
        # looping over it would write the file one character at a time.
        outfile.write(iob2_set)

Token indices sequence length is longer than the specified maximum sequence length for this model (649 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1102 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1037 > 512). Running this sequence through the model will result in indexing errors


In [19]:
# Read the saved IOB2 files back as (sentences, labels) pairs, one pair per split
(train_sents, train_labels), (dev_sents, dev_labels), (test_sents, test_labels) = [
    read_universal_NER(iob2_path)
    for iob2_path in (
        "train_dev_test_sets/train_for_baseline.iob2",
        "train_dev_test_sets/dev_for_baseline.iob2",
        "train_dev_test_sets/test_for_baseline.iob2",
    )
]

In [20]:
# Helper function to process data
def preprocess_data(sents, labels):
    """Re-split tokens into sentences with NLTK and hand out the labels in order.

    All tokens are flattened, joined with single spaces and passed to
    sent_tokenize. Each detected sentence is split on whitespace again. Labels
    are then assigned positionally from the flattened label list, so the result
    is aligned only if the whitespace split reproduces the original tokens
    one-for-one. If the labels run out, the remaining sentences get shorter
    (possibly empty) label lists rather than raising.

    Returns
    -------
    (split_sentences, split_labels) : two lists of lists.
    """
    all_tokens = [token for sent in sents for token in sent]
    all_labels = [tag for sent_tags in labels for tag in sent_tags]

    # Sentence-split the space-joined text, then recover tokens per sentence
    split_sentences = [chunk.split() for chunk in sent_tokenize(' '.join(all_tokens))]

    # Walk through the flat labels, taking as many as each sentence has tokens
    split_labels = []
    cursor = 0
    for sentence in split_sentences:
        taken = all_labels[cursor:cursor + len(sentence)]
        split_labels.append(taken)
        cursor += len(taken)

    return split_sentences, split_labels

In [6]:
# Sentence-split every split with the same helper
(
    (processed_train_sents, processed_train_labels),
    (processed_dev_sents, processed_dev_labels),
    (processed_test_sents, processed_test_labels),
) = [
    preprocess_data(split_sents, split_labels)
    for split_sents, split_labels in (
        (train_sents, train_labels),
        (dev_sents, dev_labels),
        (test_sents, test_labels),
    )
]

In [21]:
# Show the first processed test sentence and its labels side by side as a sanity check
print(processed_test_sents[0])
print(processed_test_labels[0])
    

['[CLS]', 'A', 'year', 'is', 'an', 'awful', 'long', 'time', 'in', 'football', ',', 'and', 'that', 'has', 'certainly', 'proved', 'the', 'case', 'for', 'F', '##olar', '##in', 'Ba', '##log', '##un', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O']


In [22]:
# Label mapping: ids follow list order. Every B- tag lands on an odd id and its
# I- tag on the next even id, which align_labels_with_tokens relies on.
label2id = {
    tag: idx
    for idx, tag in enumerate([
        'O',
        'B-LOC', 'I-LOC',
        'B-PER', 'I-PER',
        'B-ORG', 'I-ORG',
        'B-TIM', 'I-TIM',
        'B-MON', 'I-MON',
        'B-DAT', 'I-DAT',
        'B-PCT', 'I-PCT',
    ])
}

# Reverse lookup: id -> tag
id2label = {idx: tag for tag, idx in label2id.items()}

In [23]:
# Tag names in label2id's insertion order (ids are listed 0..N-1 there, so label_names[i] is the tag for id i)
label_names = list(label2id)

In [24]:
def labels_map2id(labels, mapping):
    """Translate each sentence's labels through `mapping`.

    Labels missing from `mapping` are passed through unchanged.

    Returns a list with one list of mapped labels per input sentence.
    """
    return [
        [mapping.get(tag, tag) for tag in sentence_tags]
        for sentence_tags in labels
    ]

In [25]:
# Convert the string tags of every split to integer ids
train_label_ids, dev_label_ids, test_label_ids = [
    labels_map2id(split_labels, label2id)
    for split_labels in (processed_train_labels, processed_dev_labels, processed_test_labels)
]

In [26]:
# Create datasets: one Dataset per split with "tokens" and "ner_tags" columns
train_dataset, dev_dataset, test_dataset = [
    Dataset.from_dict({"tokens": split_tokens, "ner_tags": split_tag_ids})
    for split_tokens, split_tag_ids in (
        (processed_train_sents, train_label_ids),
        (processed_dev_sents, dev_label_ids),
        (processed_test_sents, test_label_ids),
    )
]

print(test_dataset[0])

{'tokens': ['[CLS]', 'A', 'year', 'is', 'an', 'awful', 'long', 'time', 'in', 'football', ',', 'and', 'that', 'has', 'certainly', 'proved', 'the', 'case', 'for', 'F', '##olar', '##in', 'Ba', '##log', '##un', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0]}


In [27]:
# Bundle the three splits into one DatasetDict keyed by split name
raw_datasets = DatasetDict({
    "train": train_dataset,
    "dev": dev_dataset,
    "test": test_dataset
})

# Print to verify
print(raw_datasets['test'][0])

{'tokens': ['[CLS]', 'A', 'year', 'is', 'an', 'awful', 'long', 'time', 'in', 'football', ',', 'and', 'that', 'has', 'certainly', 'proved', 'the', 'case', 'for', 'F', '##olar', '##in', 'Ba', '##log', '##un', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0]}


In [28]:
# Checkpoint name shared by the tokenizer here and the model loaded further down
model_name = 'distilbert-base-cased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [29]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Parameters
    ----------
    labels : sequence of int
        Label id of each word.
    word_ids : sequence of int or None
        For each token, the index of the word it belongs to, or None.

    Returns
    -------
    list of int
        -100 for tokens whose word id is None. The first token of a word gets
        the word's label. Each further token of the same word gets that label
        too, except that an odd label id is bumped by one (B-XXX -> I-XXX under
        an odd-B / even-I id layout).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            aligned.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's label
            aligned.append(labels[word_id])
        else:
            # Continuation of the previous word: turn B-XXX into I-XXX
            word_label = labels[word_id]
            aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)
        previous_word = word_id

    return aligned

In [30]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Reads examples["tokens"] and examples["ner_tags"], tokenizes with the
    notebook-level `tokenizer` (truncation on, input treated as already split
    into words), and stores the labels aligned by align_labels_with_tokens
    under the "labels" key of the returned encoding.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded


### TODO: unresolved note, originally "ÆNDRER ????" (Danish for "changes ????") — clarify what needs changing here

In [31]:
# Tokenize every split in batches and attach the aligned labels; the original
# "tokens" and "ner_tags" columns are dropped from the result
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "ner_tags"]
)

Map:   0%|          | 0/1586 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/447 [00:00<?, ? examples/s]

Map:   0%|          | 0/479 [00:00<?, ? examples/s]

In [32]:
# Inspect the first tokenized test example (input ids, attention mask, aligned labels)
print(tokenized_datasets["test"][0])

{'input_ids': [101, 101, 138, 1214, 1110, 1126, 9684, 1263, 1159, 1107, 1709, 117, 1105, 1115, 1144, 4664, 4132, 1103, 1692, 1111, 143, 108, 108, 184, 5815, 108, 108, 1107, 18757, 108, 108, 9366, 108, 108, 8362, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, -100]}


In [33]:
# Batch collator for token classification, plus the seqeval metric that compute_metrics calls
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

In [34]:
def compute_metrics(eval_preds):
    """Score predictions with the notebook-level seqeval `metric`.

    `eval_preds` unpacks into (logits, labels). The predicted id per position
    is the argmax over the last logits axis. Positions whose gold label is -100
    are dropped from both sides, ids are turned into tag names via
    `label_names`, and the overall precision, recall, F1 and accuracy are returned.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Gold tags, skipping the ignored index (special tokens)
    true_labels = []
    for gold_row in labels:
        true_labels.append([label_names[gold] for gold in gold_row if gold != -100])

    # Predicted tags at exactly the positions that were kept above
    true_predictions = []
    for pred_row, gold_row in zip(predictions, labels):
        true_predictions.append(
            [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        )

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    reported = {
        "precision": "overall_precision",
        "recall": "overall_recall",
        "f1": "overall_f1",
        "accuracy": "overall_accuracy",
    }
    return {short_name: all_metrics[full_name] for short_name, full_name in reported.items()}

In [35]:
# Load the base checkpoint for token classification, passing both label mappings.
# The classifier weights are newly initialized (see the warning below) and are trained next.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Log in to the Hugging Face Hub; the training arguments and the push further down upload to it
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
# Training configuration: evaluate and save once per epoch for 3 epochs,
# writing to "bert-finetuned-ner-football" and pushing to the Hub
args = TrainingArguments(
    "bert-finetuned-ner-football",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [38]:
# Fine-tune on the train split and evaluate on dev with compute_metrics.
# The tokenizer is passed so that the Trainer saves it with the checkpoints and
# includes it in Hub pushes; without it the pushed repo contains no tokenizer files.
# Batching is unchanged because data_collator is given explicitly.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

  0%|          | 0/597 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.18451999127864838, 'eval_precision': 0.7498314227916386, 'eval_recall': 0.8152492668621701, 'eval_f1': 0.7811731647348087, 'eval_accuracy': 0.9446270981138605, 'eval_runtime': 4.355, 'eval_samples_per_second': 102.64, 'eval_steps_per_second': 12.859, 'epoch': 1.0}


  0%|          | 0/56 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.14158174395561218, 'eval_precision': 0.8381147540983607, 'eval_recall': 0.8995601173020528, 'eval_f1': 0.8677510608203678, 'eval_accuracy': 0.9624502509084617, 'eval_runtime': 2.3243, 'eval_samples_per_second': 192.314, 'eval_steps_per_second': 24.093, 'epoch': 2.0}
{'loss': 0.2393, 'grad_norm': 0.25545579195022583, 'learning_rate': 3.2495812395309884e-06, 'epoch': 2.51}


  0%|          | 0/56 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.13512377440929413, 'eval_precision': 0.8589211618257261, 'eval_recall': 0.9105571847507331, 'eval_f1': 0.8839857651245552, 'eval_accuracy': 0.9651035357905059, 'eval_runtime': 2.3527, 'eval_samples_per_second': 189.998, 'eval_steps_per_second': 23.803, 'epoch': 3.0}
{'train_runtime': 161.6972, 'train_samples_per_second': 29.425, 'train_steps_per_second': 3.692, 'train_loss': 0.2094668766961026, 'epoch': 3.0}


TrainOutput(global_step=597, training_loss=0.2094668766961026, metrics={'train_runtime': 161.6972, 'train_samples_per_second': 29.425, 'train_steps_per_second': 3.692, 'total_flos': 91041183095148.0, 'train_loss': 0.2094668766961026, 'epoch': 3.0})

In [41]:
# Evaluate on the held-out test split; the outcome is kept in `results` and not displayed here
results = trainer.evaluate(tokenized_datasets["test"])

  0%|          | 0/60 [00:00<?, ?it/s]

In [42]:
# Upload the trained model to the Hub with this commit message
trainer.push_to_hub(commit_message="test complete")

CommitInfo(commit_url='https://huggingface.co/ChristianSneffeFleischer/bert-finetuned-ner-football/commit/76945c26fb5e0bcdb010d9e92c3d7cdb4041cd7a', commit_message='test complete', commit_description='', oid='76945c26fb5e0bcdb010d9e92c3d7cdb4041cd7a', pr_url=None, pr_revision=None, pr_num=None)

In [297]:
# # Make predictions on the test set
# predictions, labels, metrics = trainer.predict(tokenized_datasets["test"])

  0%|          | 0/60 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


In [294]:
# def compute_preds(eval_preds):
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)

#     # Remove ignored index (special tokens) and convert to labels
#     true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
#     true_predictions = [
#         [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     return true_labels

In [298]:
# compute_preds((predictions, labels))

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',

In [286]:
# # Convert predictions to label indices
# predictions = np.argmax(predictions, axis=-1)

In [288]:
# # Function to write predictions
# def write_iob2_predictions(test_sents, predictions, output_file):
#     with open(output_file, 'w', encoding='utf-8') as outfile:
#         for i, (sent, prediction) in enumerate(zip(test_sents, predictions)):
#             outfile.write(f"# sent_id = test-{i+1:04d}\n")
#             outfile.write(f"# text = {' '.join(sent)}\n")
#             for j, (word, label) in enumerate(zip(sent, prediction)):
#                 outfile.write(f"{j+1}\t{word}\t{label}\t-\t-\n")
#             outfile.write("\n")

In [289]:
# # Write the shifted predictions to a file
# write_iob2_predictions(processed_test_sents, true_predictions, 'baseline_preds/baseline_preds_football.iob2')

________________________________________________________________

In [43]:
# Testing
# NOTE(review): this loads "ChristianSneffeFleischer/bert-finetuned-ner", whereas the run above
# was pushed to ".../bert-finetuned-ner-football" — confirm which checkpoint is meant to be tested.

model_checkpoint = "ChristianSneffeFleischer/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
# Quick smoke test on a single pre-tokenized sentence
token_classifier("In Argentina , beef is revered , respected , and praised .")



config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

[{'entity_group': 'ORG',
  'score': 0.232365,
  'word': 'Argentina',
  'start': 3,
  'end': 12}]