In [11]:
!pip install mlflow transformers datasets evaluate seqeval torch torchvision

Collecting torchvision
  Downloading torchvision-0.15.1-cp38-cp38-manylinux1_x86_64.whl (33.8 MB)
[K     |████████████████████████████████| 33.8 MB 5.4 MB/s eta 0:00:01
Collecting pillow!=8.3.*,>=5.3.0
  Downloading Pillow-9.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.0 MB/s eta 0:00:01
Installing collected packages: pillow, torchvision
Successfully installed pillow-9.5.0 torchvision-0.15.1


In [1]:
import os

import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
from matplotlib import pyplot as plt
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
import mlflow

# Only available in Jupyter
# notebook_login() 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset

# Load our DatasetDict
ds = load_dataset("wnut_17")
print("Splits:", list(ds.keys()))
# Column names come from the split's schema, so no row has to be read
# (indexing row 0 would raise on an empty split).
print("Columns", ds["train"].column_names)

# Extract our labels: the tag column is called `tags` in some datasets and
# `ner_tags` in others, so prefer `tags` and fall back to `ner_tags`.
train_features = ds["train"].features
tags_name = "tags" if "tags" in train_features else "ner_tags"
assert tags_name in train_features, (
    "Your dataset must have `tags` or `ner_tags` to perform token classification"
)

# Tag names, indexed by class id; reused later for metrics and the model's label maps.
labels = train_features[tags_name].feature.names
print("Labels", labels)

Found cached dataset wnut_17 (/home/ubuntu/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)
100%|██████████| 3/3 [00:00<00:00, 545.99it/s]

Splits: ['train', 'validation', 'test']
Columns ['id', 'tokens', 'ner_tags']
Labels ['O', 'B-corporation', 'I-corporation', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group', 'B-location', 'I-location', 'B-person', 'I-person', 'B-product', 'I-product']





In [3]:
# The model we will be fine-tuning (checkpoint name; also used below to load the model weights)
HF_MODEL = "distilbert-base-uncased"

# Tokenizer loaded from the same checkpoint name as the model
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)

In [4]:
from datasets.formatting.formatting import LazyBatch
from transformers import BatchEncoding
from datasets import Dataset, DatasetDict

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word-level tags to the tokens.

    Tokenization adds special tokens and may split one word into several
    tokens, so the per-word tags no longer line up with the token sequence.
    For every sentence in the batch the tags are realigned as follows:
        1. Each token is mapped back to its source word with ``word_ids``.
        2. Tokens without a source word (``word_ids`` yields ``None``) get -100.
        3. The first token of each word receives that word's tag; any further
            token of the same word gets -100.

    Reads the notebook-level ``tokenizer`` and ``tags_name``. Returns the
    tokenizer output with an added ``"labels"`` entry holding one list of
    aligned label ids per sentence.

    For more information, see: https://huggingface.co/docs/transformers/tasks/token_classification
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # Walk the tokens of one sentence, remembering which word the previous token came from.
        aligned = []
        last_word = None
        for word in word_ids:
            starts_new_word = word is not None and word != last_word
            aligned.append(word_labels[word] if starts_new_word else -100)
            last_word = word
        return aligned

    tokenized_inputs["labels"] = [
        _align(tokenized_inputs.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples[tags_name])
    ]
    return tokenized_inputs


# Run the tokenize/align step over the DatasetDict; batched=True matches the function,
# which expects lists of sentences under examples["tokens"] and examples[tags_name].
ds_encoded = ds.map(tokenize_and_align_labels, batched=True)

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9/cache-129ca94b3046f87f.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9/cache-358d7c3148e7c56f.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9/cache-e3d161d63d639061.arrow


In [5]:
from transformers import DataCollatorForTokenClassification

# (Assuming PyTorch) we create a collator that pads the sentences of a batch
# to a common length during batch creation; it is handed to the Trainer below.
# NOTE(review): presumably pads to the longest sequence in each batch rather than a global max — confirm.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [6]:
from typing import Dict
from transformers import EvalPrediction
import evaluate
import numpy as np

# In token classification, we use seqeval to compute metrics.
# We don't take this from the user.
# Loaded once here; compute_metrics below reads this notebook-level object on every call.
seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    ``p`` unpacks into ``(scores, label_ids)``. For every token the class with
    the highest score along axis 2 is taken as the prediction. Positions whose
    gold label id is -100 are dropped, and the remaining ids are translated to
    tag names through the notebook-level ``labels`` list before being passed
    to the notebook-level ``seqeval`` metric.

    Returns a dict with the keys ``precision``, ``recall``, ``f1`` and
    ``accuracy``, taken from seqeval's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (gold id != -100).
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([labels[predicted_id] for predicted_id, _ in kept])
        true_labels.append([labels[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results["overall_" + name]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [7]:
# Lookup tables between class indices and tag names; both are passed to the model below.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

# Load the checkpoint we will fine-tune, with one output class per tag.
model = AutoModelForTokenClassification.from_pretrained(
    HF_MODEL,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [8]:
import os

# MLflowCallback reference:
# https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/callback#transformers.integrations.MLflowCallback
# Alternative kept for reference (not used here):
# os.environ["MLFLOW_EXPERIMENT_NAME"] = "trainer-mlflow-demo"

# Experiment name is taken from the user
exp = mlflow.set_experiment("mlflow-huggingface-demo")

# Environment switches for the MLflow integration; both are set to "1".
os.environ.update(
    {
        "MLFLOW_FLATTEN_PARAMS": "1",
        "HF_MLFLOW_LOG_ARTIFACTS": "1",
    }
)

In [9]:
# Evaluation cadence, taken from the user; the same value drives checkpoint saving below.
eval_strategy = "epoch"

training_args = TrainingArguments(
    # Where checkpoints are written (from the user, default value)
    output_dir="./output",
    # Optimisation settings (taken from the user; the values shown are the defaults used here)
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    seed=101,  # fixed seed for reproducibility
    # Logging, evaluation and checkpointing
    logging_steps=500,
    evaluation_strategy=eval_strategy,
    save_strategy=eval_strategy,
    report_to="mlflow",
    # Required when early stopping is used (see the callback passed to the Trainer).
    # NOTE(review): no metric_for_best_model is set, so "best" presumably follows the
    # default (evaluation loss) rather than F1 — confirm this is intended.
    load_best_model_at_end=True,
    # Hub publishing is off; `hub_model_id` must be set if this becomes True,
    # e.g. hub_model_id="juliensimon/bert-finetune-wnut"
    push_to_hub=False,
)

# Evaluate on the validation split when the dataset has one, otherwise on the test split.
has_val = "validation" in ds
eval_split = "validation" if has_val else "test"

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded[eval_split],
    compute_metrics=compute_metrics,
    # Ask the user whether to use early stopping (default True) and for the patience (default 1 or 2, to be decided)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)


In [10]:
# Train inside an explicit MLflow run, then log the model and tokenizer within that run.
with mlflow.start_run() as run:
    trainer.train()

    # Bundle the two objects handed to MLflow's transformers flavor.
    components = dict(model=model, tokenizer=tokenizer)
    mlflow.transformers.log_model(transformers_model=components, artifact_path="my_model")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.244613,0.708511,0.398325,0.509954,0.94437
2,No log,0.223961,0.675302,0.467703,0.55265,0.951618
3,0.175900,0.243177,0.691152,0.495215,0.577003,0.953525


  _warn_prf(average, modifier, msg_start, len(result))
