In [120]:
import numpy as np
import evaluate
import pickle

from utils import read_jsonl, continuous_string, label_distribution

from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline
from huggingface_hub import notebook_login
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split

In [121]:
# Function to read NER data
def read_universal_NER(file_path):
    with open(file_path, 'r', encoding = 'utf-8') as infile:
        # Split into lines
        lines = infile.readlines()

        # Define lists to store data 
        sentences = []
        labels = []
        current_sentence = []
        current_labels = []

        # Iterate over lines
        for line in lines:

            line = line.strip() # Remove whitespace
            if not line: # Skip empty lines
                continue

            # Check if line starts with sentence ID
            if line.startswith('# sent_id'):
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                current_sentence = []
                current_labels = []

            # Check for token lines
            elif not line.startswith("#"):
                parts = line.strip().split('\t')
                current_sentence.append(parts[1])
                current_labels.append(parts[2])

        if current_sentence:
            sentences.append(current_sentence)
            labels.append(current_labels)

    # Flatten lists
    #sentences = sum(sentences, [])
    #labels = sum(labels, [])

    return sentences, labels

In [194]:
test_sents, test_labels = read_universal_NER('train_dev_test_sets/test.iob2')
train_sents, train_labels = read_universal_NER('train_dev_test_sets/train.iob2')
dev_sents, dev_labels = read_universal_NER('train_dev_test_sets/dev.iob2')

In [264]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [265]:
# Helper function to process data
def preprocess_data(sents, labels):
    # Flatten the list of lists
    flat_data = [word for sublist in sents for word in sublist]
    flat_labels = [label for sublist in labels for label in sublist]

    # Join the words to form a single string
    text = ' '.join(flat_data)

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Split the sentences into lists of words
    split_sentences = [word_tokenize(sentence) for sentence in sentences]

    # Align labels to the split sentences
    split_labels = []
    label_index = 0
    for sentence in split_sentences:
        sentence_labels = []
        for _ in sentence:
            if label_index < len(flat_labels):
                sentence_labels.append(flat_labels[label_index])
                label_index += 1
        split_labels.append(sentence_labels)
    
    return split_sentences, split_labels

In [266]:
processed_test_sents, processed_test_labels = preprocess_data(test_sents, test_labels)
processed_train_sents, processed_train_labels = preprocess_data(train_sents, train_labels)
processed_dev_sents, processed_dev_labels = preprocess_data(dev_sents, dev_labels)

In [212]:
# # Flatten the list of lists to get a single list of words
# flat_test_data = [word for sublist in test_sents for word in sublist]
# flat_test_labels = [label for sublist in test_labels for label in sublist]

# # Join the words to form a single string for sentence tokenization
# text = ' '.join(flat_test_data)

# # Tokenize the text into sentences
# sentences = sent_tokenize(text)

# # Split the sentences into lists of words
# test_sents_split = [sentence.split() for sentence in sentences]

In [213]:
# # Flatten the list of lists to get a single list of words
# flat_train_data = [word for sublist in train_sents for word in sublist]
# flat_train_labels = [label for sublist in train_labels for label in sublist]

# # Join the words to form a single string for sentence tokenization
# text = ' '.join(flat_train_data)

# # Tokenize the text into sentences
# sentences = sent_tokenize(text)

# # Split the sentences into lists of words
# train_sents_split = [sentence.split() for sentence in sentences]

In [214]:
# # Flatten the list of lists to get a single list of words
# flat_dev_data = [word for sublist in dev_sents for word in sublist]
# flat_dev_labels = [label for sublist in dev_labels for label in sublist]

# # Join the words to form a single string for sentence tokenization
# text = ' '.join(flat_dev_data)

# # Tokenize the text into sentences
# sentences = sent_tokenize(text)

# # Split the sentences into lists of words
# dev_sents_split = [sentence.split() for sentence in sentences]

In [215]:
# test_split_labels = []
# label_index = 0
# for sentence in test_sents_split:
#     sentence_labels = []
#     for _ in sentence:
#         if label_index < len(flat_test_labels):
#             sentence_labels.append(flat_test_labels[label_index])
#             label_index += 1
#     test_split_labels.append(sentence_labels)

In [216]:
# train_split_labels = []
# label_index = 0
# for sentence in train_sents_split:
#     sentence_labels = []
#     for _ in sentence:
#         if label_index < len(flat_train_labels):
#             sentence_labels.append(flat_train_labels[label_index])
#             label_index += 1
#     train_split_labels.append(sentence_labels)

In [217]:
# dev_split_labels = []
# label_index = 0
# for sentence in dev_sents_split:
#     sentence_labels = []
#     for _ in sentence:
#         if label_index < len(flat_dev_labels):
#             sentence_labels.append(flat_dev_labels[label_index])
#             label_index += 1
#     dev_split_labels.append(sentence_labels)

In [271]:
print(processed_test_sents[0])
print(processed_test_labels[0])

['A', 'year', 'is', 'an', 'awful', 'long', 'time', 'in', 'football', ',', 'and', 'that', 'has', 'certainly', 'proved', 'the', 'case', 'for', 'Folarin', 'Balogun', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O']


In [198]:
# def split_list_and_labels(strings, labels, num_sublists):
#     # Initialize the sublists for strings and labels
#     sublists_strings = [[] for _ in range(num_sublists)]
#     sublists_labels = [[] for _ in range(num_sublists)]
    
#     # Distribute the strings and labels into sublists
#     for i, (string, label) in enumerate(zip(strings, labels)):
#         sublists_strings[i % num_sublists].append(string)
#         sublists_labels[i % num_sublists].append(label)
    
#     return sublists_strings, sublists_labels

In [199]:
# num_sublists = 800
# test_sents_split, test_labels_split = split_list_and_labels(test_sents[0], test_labels[0], num_sublists)
# train_sents_split, train_labels_split = split_list_and_labels(train_sents[0], train_labels[0], num_sublists)
# dev_sents_split, dev_labels_split = split_list_and_labels(dev_sents[0], dev_labels[0], num_sublists)

In [272]:
# Label Mappng
label2id = {
 'O': 0,
 'B-LOC': 1,
 'I-LOC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-ORG': 5,
 'I-ORG': 6,
 'B-TIM': 7,
 'I-TIM': 8,
 'B-MON': 9,
 'I-MON': 10,
 'B-DAT': 11,
 'I-DAT': 12,
 'B-PCT': 13,
 'I-PCT': 14,
}

id2label = {v: k for k, v in label2id.items()}

In [273]:
label_names = [key for key in label2id.keys()]

In [274]:
def labels_map2id(labels, mapping):
    label_ids = []
    for sent in labels:
        label_id = [mapping.get(label, label) for label in sent]
        label_ids.append(label_id)
    return label_ids

In [282]:
train_label_ids = labels_map2id(processed_train_labels, label2id)
dev_label_ids = labels_map2id(processed_dev_labels, label2id)
test_label_ids = labels_map2id(processed_test_labels, label2id)

In [232]:
print(train_label_ids[0])
print(train_split_labels[0])


[5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 5, 0]
['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-ORG', 'O']


In [276]:
# Create datasets
train_dataset = Dataset.from_dict({"tokens": processed_train_sents, "ner_tags": train_label_ids})
dev_dataset = Dataset.from_dict({"tokens": processed_dev_sents, "ner_tags": dev_label_ids})
test_dataset = Dataset.from_dict({"tokens": processed_test_sents, "ner_tags": test_label_ids})

In [277]:
raw_datasets = DatasetDict({
    "train": train_dataset,
    "dev": dev_dataset,
    "test": test_dataset,
})

# Print to verify
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1586
    })
    dev: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 446
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 478
    })
})


In [278]:
model_name = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [234]:
#inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words = True, truncation = True)

In [233]:
#inputs.tokens()

In [236]:
#inputs.word_ids()

In [279]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [292]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs


### ÆNDRER ????

In [293]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "ner_tags"]
)

Map:   0%|          | 0/1586 [00:00<?, ? examples/s]

IndexError: list index out of range

In [239]:
tokenized_datasets["test"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 478
})

In [240]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

In [241]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [242]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [243]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [244]:
args = TrainingArguments(
    "bert-finetuned-ner-2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [245]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  0%|          | 0/597 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.08439277857542038, 'eval_precision': 0.875177304964539, 'eval_recall': 0.9046920821114369, 'eval_f1': 0.8896899783705841, 'eval_accuracy': 0.9778323233872823, 'eval_runtime': 3.2035, 'eval_samples_per_second': 139.224, 'eval_steps_per_second': 17.481, 'epoch': 1.0}


  0%|          | 0/56 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.06450602412223816, 'eval_precision': 0.9194823867721064, 'eval_recall': 0.9376832844574781, 'eval_f1': 0.9284936479128856, 'eval_accuracy': 0.9841988187466442, 'eval_runtime': 1.7732, 'eval_samples_per_second': 251.521, 'eval_steps_per_second': 31.581, 'epoch': 2.0}
{'loss': 0.1632, 'grad_norm': 0.39681103825569153, 'learning_rate': 3.2495812395309884e-06, 'epoch': 2.51}


  0%|          | 0/56 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.06387829780578613, 'eval_precision': 0.9284682080924855, 'eval_recall': 0.9420821114369502, 'eval_f1': 0.9352256186317323, 'eval_accuracy': 0.9851959806703996, 'eval_runtime': 1.7597, 'eval_samples_per_second': 253.451, 'eval_steps_per_second': 31.823, 'epoch': 3.0}
{'train_runtime': 138.3266, 'train_samples_per_second': 34.397, 'train_steps_per_second': 4.316, 'train_loss': 0.14283090620184663, 'epoch': 3.0}


TrainOutput(global_step=597, training_loss=0.14283090620184663, metrics={'train_runtime': 138.3266, 'train_samples_per_second': 34.397, 'train_steps_per_second': 4.316, 'total_flos': 66586974996204.0, 'train_loss': 0.14283090620184663, 'epoch': 3.0})

In [246]:
trainer.push_to_hub(commit_message="Training complete 3")

CommitInfo(commit_url='https://huggingface.co/ChristianSneffeFleischer/bert-finetuned-ner-2/commit/23249e4cd671ba9a2df688f9fecaeb4931c1bc74', commit_message='Training complete 3', commit_description='', oid='23249e4cd671ba9a2df688f9fecaeb4931c1bc74', pr_url=None, pr_revision=None, pr_num=None)

In [247]:
eval_results = trainer.evaluate()
print(eval_results)

  0%|          | 0/56 [00:00<?, ?it/s]

{'eval_loss': 0.06387829780578613, 'eval_precision': 0.9284682080924855, 'eval_recall': 0.9420821114369502, 'eval_f1': 0.9352256186317323, 'eval_accuracy': 0.9851959806703996, 'eval_runtime': 1.8961, 'eval_samples_per_second': 235.226, 'eval_steps_per_second': 29.535, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [248]:
# Make predictions on the test set
predictions, labels, metrics = trainer.predict(tokenized_datasets["test"])

  0%|          | 0/60 [00:00<?, ?it/s]

In [249]:
# Convert predictions to label indices
predictions = np.argmax(predictions, axis=-1)

In [250]:
# Ensure predictions is a list of lists
if isinstance(predictions, np.ndarray):
    if predictions.ndim == 1:
        predictions = predictions.tolist()
    else:
        predictions = [pred.tolist() for pred in predictions]

# Convert label indices to label names
true_predictions = [
    [id2label[p] for p in prediction]
    for prediction in predictions
]

In [255]:
# Function to write predictions
def write_iob2_predictions(test_sents, predictions, output_file):
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for i, (sent, prediction) in enumerate(zip(test_sents, predictions)):
            outfile.write(f"# sent_id = test-{i+1:04d}\n")
            outfile.write(f"# text = {' '.join(sent)}\n")
            for j, (word, label) in enumerate(zip(sent, prediction)):
                outfile.write(f"{j+1}\t{word}\t{label}\t-\t-\n")
            outfile.write("\n")

In [254]:
# Write the shifted predictions to a file
write_iob2_predictions(processed_test_sents, true_predictions, 'test_predictions_our_data.iob2')

________________________________________________________________

In [3]:
# Testing

model_checkpoint = "ChristianSneffeFleischer/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("In Argentina , beef is revered , respected , and praised .")



[{'entity_group': 'LOC',
  'score': 0.99479514,
  'word': 'Argentina',
  'start': 3,
  'end': 12}]