In [6]:
import numpy as np
import evaluate

from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline
from huggingface_hub import notebook_login
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split

In [7]:
# Function to read NER data
def read_universal_NER(file_path):
    with open(file_path, 'r', encoding = 'utf-8') as infile:
        # Split into lines
        lines = infile.readlines()

        # Define lists to store data 
        sentences = []
        labels = []
        current_sentence = []
        current_labels = []

        # Iterate over lines
        for line in lines:

            line = line.strip() # Remove whitespace
            if not line: # Skip empty lines
                continue

            # Check if line starts with sentence ID
            if line.startswith('# sent_id'):
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                current_sentence = []
                current_labels = []

            # Check for token lines
            elif not line.startswith("#"):
                parts = line.strip().split('\t')
                current_sentence.append(parts[1])
                current_labels.append(parts[2])

        if current_sentence:
            sentences.append(current_sentence)
            labels.append(current_labels)

    # Flatten lists
    #sentences = sum(sentences, [])
    #labels = sum(labels, [])

    return sentences, labels

In [8]:
# Read Date
test_sents, test_labels = read_universal_NER('en_ewt-ud-test.iob2')
train_sents, train_labels = read_universal_NER('en_ewt-ud-train.iob2')

# training and validation
train_data = list(zip(train_sents, train_labels))
train_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Unzip the training and validation data
train_sents, train_labels = zip(*train_data)
validation_sents, validation_labels = zip(*validation_data)

In [24]:
train_labels

(['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O'],
 ['O', 'B-PER', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O'],
 ['B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O

In [9]:
# Label Mappng
label2id = {
 'O': 0,
 'B-LOC': 1,
 'I-LOC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-ORG': 5,
 'I-ORG': 6,
}
id2label = {v: k for k, v in label2id.items()}

In [10]:
def labels_map2id(labels, mapping):
    label_ids = []
    for sent in labels:
        label_id = [mapping[label] for label in sent]
        label_ids.append(label_id)
    return label_ids

In [11]:
train_label_ids = labels_map2id(train_labels, label2id)
validation_label_ids = labels_map2id(validation_labels, label2id)
test_label_ids = labels_map2id(test_labels, label2id)

In [25]:
train_label_ids

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 0],
 [0, 3, 0, 0, 0, 0, 0],
 [0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0],
 [3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [26]:
train_sents

(['GOOD',
  'NEWS',
  '...',
  'I',
  'found',
  'a',
  'web',
  'page',
  'run',
  'by',
  'Canadian',
  'Immigration',
  'Lawyers',
  'at',
  'http://www.canadavisa.com/canadian-immigration-faq-skilled-workers.html',
  '.'],
 ['already', 'have', 'plans', 'on', 'thursday', '.'],
 ['My',
  'experience',
  'was',
  'amazing',
  'at',
  'Providence',
  'Aesthetics',
  'and',
  'Medical',
  'Spa',
  '.'],
 ['Dr.', 'Shady', 'is', 'inexperienced', 'and', 'prideful', '.'],
 ['The',
  'IPN',
  'report',
  'examines',
  'commercial',
  'fish',
  'species',
  ',',
  'but',
  'it',
  'neglects',
  'important',
  'non-commercial',
  'animals',
  ',',
  'such',
  'as',
  'seals',
  'and',
  'polar',
  'bears',
  '.'],
 ['Wolf',
  'reintroduction',
  'in',
  'the',
  'northern',
  'Rockies',
  'has',
  'been',
  'so',
  'successful',
  'the',
  'Fish',
  'and',
  'Wildlife',
  'Service',
  'has',
  'moved',
  'to',
  'reduce',
  'the',
  'animal',
  "'s",
  'status',
  'from',
  'endangered',
  'to

In [28]:
test_sents

[['What', 'is', 'this', 'Miramar', '?'],
 ['It', 'is', 'a', 'place', 'in', 'Argentina', 'lol'],
 ['what',
  'is',
  'a',
  'good',
  'slogan',
  'for',
  'an',
  'Argentinian',
  'restaurant',
  '?'],
 ['"',
  'In',
  'Argentina',
  ',',
  'beef',
  'is',
  'revered',
  ',',
  'respected',
  ',',
  'and',
  'praised',
  '.'],
 ['Come', 'see', 'how', 'we', 'continue', 'this', 'tradition', '.', '"'],
 ['A', 'taste', 'of', 'Argentina', '.'],
 ['Here',
  'are',
  'some',
  'articles',
  'that',
  'discuss',
  'the',
  'details',
  'of',
  'slogan',
  'writing',
  '.'],
 ['Why', 'certain', 'slogans', 'work', 'and', 'why', 'some', 'do', "n't", '.'],
 ['You',
  'will',
  'find',
  'these',
  'helpful',
  'in',
  'writing',
  'a',
  'new',
  'slogan',
  '.'],
 ['A',
  'Look',
  'at',
  'Slogans',
  '-',
  'http://www.small-business-software.net/look-at-slogans.htm'],
 ['Unique',
  'Selling',
  'Proposition',
  '-',
  'http://www.small-business-software.net/unique-selling-proposition.htm'],
 ['

In [12]:
# Create datasets
train_dataset = Dataset.from_dict({"tokens": train_sents, "ner_tags": train_label_ids})
validation_dataset = Dataset.from_dict({"tokens": validation_sents, "ner_tags": validation_label_ids})
test_dataset = Dataset.from_dict({"tokens": test_sents, "ner_tags": test_label_ids})

In [13]:
raw_datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

# Print to verify
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 10034
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2509
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2077
    })
})


In [14]:
model_name = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [234]:
#inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words = True, truncation = True)

In [233]:
#inputs.tokens()

In [236]:
#inputs.word_ids()

In [15]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [162]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [166]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/10034 [00:00<?, ? examples/s]

Map:   0%|          | 0/2509 [00:00<?, ? examples/s]

Map:   0%|          | 0/2077 [00:00<?, ? examples/s]

In [238]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

In [239]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[list(label2id.keys())[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [list(label2id.keys())[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [240]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [242]:
# notebook_login()

In [243]:
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [116]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  0%|          | 0/3765 [00:00<?, ?it/s]

{'loss': 0.1563, 'grad_norm': 0.41864439845085144, 'learning_rate': 1.7343957503320053e-05, 'epoch': 0.4}
{'loss': 0.0701, 'grad_norm': 6.179903984069824, 'learning_rate': 1.4687915006640108e-05, 'epoch': 0.8}


  0%|          | 0/314 [00:00<?, ?it/s]

{'eval_loss': 0.06762628257274628, 'eval_precision': 0.7664630006788866, 'eval_recall': 0.7928370786516854, 'eval_f1': 0.7794269934414911, 'eval_accuracy': 0.9797981865444051, 'eval_runtime': 62.9269, 'eval_samples_per_second': 39.872, 'eval_steps_per_second': 4.99, 'epoch': 1.0}
{'loss': 0.056, 'grad_norm': 3.7462689876556396, 'learning_rate': 1.2031872509960161e-05, 'epoch': 1.2}
{'loss': 0.036, 'grad_norm': 1.8037104606628418, 'learning_rate': 9.375830013280214e-06, 'epoch': 1.59}
{'loss': 0.0372, 'grad_norm': 1.9983482360839844, 'learning_rate': 6.719787516600266e-06, 'epoch': 1.99}


  0%|          | 0/314 [00:00<?, ?it/s]

{'eval_loss': 0.06432849168777466, 'eval_precision': 0.7693290734824281, 'eval_recall': 0.8455056179775281, 'eval_f1': 0.8056206088992974, 'eval_accuracy': 0.9826841598952044, 'eval_runtime': 17.1709, 'eval_samples_per_second': 146.12, 'eval_steps_per_second': 18.287, 'epoch': 2.0}
{'loss': 0.0167, 'grad_norm': 0.027124391868710518, 'learning_rate': 4.06374501992032e-06, 'epoch': 2.39}
{'loss': 0.0205, 'grad_norm': 0.7067036628723145, 'learning_rate': 1.407702523240372e-06, 'epoch': 2.79}


  0%|          | 0/314 [00:00<?, ?it/s]

{'eval_loss': 0.06831907480955124, 'eval_precision': 0.7924403183023873, 'eval_recall': 0.839185393258427, 'eval_f1': 0.8151432469304228, 'eval_accuracy': 0.9840759768303416, 'eval_runtime': 14.4109, 'eval_samples_per_second': 174.105, 'eval_steps_per_second': 21.789, 'epoch': 3.0}
{'train_runtime': 1830.307, 'train_samples_per_second': 16.446, 'train_steps_per_second': 2.057, 'train_loss': 0.05359043194794876, 'epoch': 3.0}


TrainOutput(global_step=3765, training_loss=0.05359043194794876, metrics={'train_runtime': 1830.307, 'train_samples_per_second': 16.446, 'train_steps_per_second': 2.057, 'total_flos': 346005790735176.0, 'train_loss': 0.05359043194794876, 'epoch': 3.0})

In [117]:
trainer.push_to_hub(commit_message="Training complete")

CommitInfo(commit_url='https://huggingface.co/ChristianSneffeFleischer/bert-finetuned-ner/commit/2f9e9b8bc212518bf41e5ebd82edbcd8c71703da', commit_message='Training complete', commit_description='', oid='2f9e9b8bc212518bf41e5ebd82edbcd8c71703da', pr_url=None, pr_revision=None, pr_num=None)

In [118]:
eval_results = trainer.evaluate()
print(eval_results)

  0%|          | 0/314 [00:00<?, ?it/s]

{'eval_loss': 0.06831907480955124, 'eval_precision': 0.7924403183023873, 'eval_recall': 0.839185393258427, 'eval_f1': 0.8151432469304228, 'eval_accuracy': 0.9840759768303416, 'eval_runtime': 25.5896, 'eval_samples_per_second': 98.048, 'eval_steps_per_second': 12.271, 'epoch': 3.0}


In [19]:
# Make predictions on the test set
predictions, labels, _ = trainer.predict(test_dataset)

AttributeError: 'NoneType' object has no attribute 'get'

In [244]:
def write_iob2_predictions_shifted(test_sents, predictions, output_file):
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for i, (sent, prediction) in enumerate(zip(test_sents, predictions)):
            outfile.write(f"# sent_id = test-{i+1:04d}\n")
            outfile.write(f"# text = {' '.join(sent)}\n")
            # Shift predictions up by one position
            shifted_predictions = prediction[1:] + ["O"]
            for j, (word, label) in enumerate(zip(sent, shifted_predictions)):
                outfile.write(f"{j+1}\t{word}\t{label}\t-\t-\n")
            outfile.write("\n")

In [None]:
# Convert predictions to label indices
predictions = np.argmax(predictions, axis=-1)

In [210]:
# Convert label indices to label names
true_predictions = [
    [id2label[p] for p in prediction]
    for prediction in predictions
]

# Write the shifted predictions to a file
write_iob2_predictions_shifted(test_sents, true_predictions, 'baseline_preds/baseline_preds_ewt.iob2')

________________________________________________________________

In [3]:
# Testing

model_checkpoint = "ChristianSneffeFleischer/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("In Argentina , beef is revered , respected , and praised .")



[{'entity_group': 'LOC',
  'score': 0.99479514,
  'word': 'Argentina',
  'start': 3,
  'end': 12}]