In [None]:
!pip install datasets
!pip install transformers
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

from datasets import load_dataset,concatenate_datasets, load_metric
import pandas as pd
import numpy as np

 # Load Data

In [None]:
# Load the `wnut_17` dataset; the result is a DatasetDict holding the
# train / validation / test splits used throughout the notebook.
wnut = load_dataset("wnut_17")

Reusing dataset wnut_17 (/root/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the available splits together with their features and row counts.
wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [None]:
# Tag names of the NER label set, ordered by their integer class id.
label_list = wnut["train"].features["ner_tags"].feature.names
# Lookup table from integer class id to its tag string.
id2tag = dict(enumerate(label_list))
id2tag

{0: 'O',
 1: 'B-corporation',
 2: 'I-corporation',
 3: 'B-creative-work',
 4: 'I-creative-work',
 5: 'B-group',
 6: 'I-group',
 7: 'B-location',
 8: 'I-location',
 9: 'B-person',
 10: 'I-person',
 11: 'B-product',
 12: 'I-product'}

In [None]:
# Fold the validation split into the training split so that both are used
# as training data further down.
from datasets import concatenate_datasets

train_dataset = concatenate_datasets([wnut[split] for split in ("train", "validation")])
train_dataset

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 4403
})

In [None]:
# Index of the example to inspect.
ith_example = 2

# Read tokens AND tags from the same dataset row. Previously the tokens came
# from wnut["train"] while the tags came from the merged `train_dataset`; the
# two only line up for indices that fall inside the original train split.
sample = train_dataset[ith_example]

print(sample['tokens'])
print([id2tag[label] for label in sample['ner_tags']])

['Pxleyes', 'Top', '50', 'Photography', 'Contest', 'Pictures', 'of', 'August', '2010', '...', 'http://bit.ly/bgCyZ0', '#photography']
['B-corporation', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


# Preprocessing

In [None]:
# Checkpoint name shared by the tokenizer and the token-classification model.
model_name = "bert-base-uncased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize one already word-split training example (index 2) ...
tokenized_input = tokenizer(wnut["train"][2]["tokens"], is_split_into_words=True)
# ... and turn the resulting ids back into token strings for inspection.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [None]:
# Input: the original word-level tokens of the example, before tokenization.
print(wnut["train"][2]["tokens"])

['Pxleyes', 'Top', '50', 'Photography', 'Contest', 'Pictures', 'of', 'August', '2010', '...', 'http://bit.ly/bgCyZ0', '#photography']


In [None]:
# Tokenized: the token strings produced by the tokenizer for the same example
# (note the added special tokens and the words split into several pieces).
print(tokens)

['[CLS]', 'p', '##xley', '##es', 'top', '50', 'photography', 'contest', 'pictures', 'of', 'august', '2010', '.', '.', '.', 'http', ':', '/', '/', 'bit', '.', 'l', '##y', '/', 'b', '##gc', '##y', '##z', '##0', '#', 'photography', '[SEP]']


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-split examples and align NER labels to tokens.

    Args:
        examples: A batch with a "tokens" entry (one list of words per example)
            and a "ner_tags" entry (one list of integer tags per example).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry. For
        every example, the first token of each word carries that word's tag;
        special tokens and the remaining tokens of a word are labelled -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        # word_ids() gives, per token, the index of the word it came from
        # (None for special tokens).
        for current_word in encoded.word_ids(batch_index=row):
            starts_new_word = current_word is not None and current_word != last_word
            # Only the first token of a word keeps the real tag.
            row_labels.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [None]:
# Tokenize and label-align every split of the original dataset, in batches.
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
# Same preprocessing for the merged train + validation dataset.
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Inspect one preprocessed row: the original columns plus the tokenizer outputs.
tokenized_train_dataset[0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [101,
  1030,
  2703,
  17122,
  2009,
  1005,
  1055,
  1996,
  3193,
  2013,
  2073,
  1045,
  1005,
  1049,
  2542,
  2005,
  2048,
  3134,
  1012,
  3400,
  2110,
  2311,
  1027,
  9686,
  2497,
  1012,
  3492,
  2919,
  4040,
  2182,
  2197,
  3944,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
 

In [None]:
# Display-only lookup that also names the -100 "ignore" label. A copy is used so
# the shared `id2tag` mapping is not mutated in place (re-running cells or
# reusing `id2tag` later then still sees exactly the model's label set).
id2tag_with_ignore = {**id2tag, -100: 'ignore'}
exml = tokenized_train_dataset[2]

# One row per token: the token string, its aligned label id and the tag name.
pd.DataFrame({
    'tokens': tokenizer.convert_ids_to_tokens(exml["input_ids"]),
    'ner_labels': exml['labels'],
    'ner_tags': [id2tag_with_ignore[label] for label in exml['labels']],
})

Unnamed: 0,tokens,ner_labels,ner_tags
0,[CLS],-100,ignore
1,p,1,B-corporation
2,##xley,-100,ignore
3,##es,-100,ignore
4,top,0,O
5,50,0,O
6,photography,0,O
7,contest,0,O
8,pictures,0,O
9,of,0,O


# Baseline Model

In [None]:
from sklearn.dummy import DummyClassifier

# Flatten the per-example lists into one row per token.
baseline_features = pd.Series(tokenized_train_dataset['input_ids']).explode()
baseline_targets = pd.Series(tokenized_train_dataset['labels']).explode().astype(str)

# Majority-class baseline: always predicts the most frequent label string and is
# scored on the same data it was fitted on.
# NOTE(review): the targets still contain the -100 "ignore" entries, so they are
# counted as a class here -- confirm this is the intended comparison.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(baseline_features, baseline_targets)
dummy_clf.score(baseline_features, baseline_targets)

0.5888494815191806

In [None]:
# One row per token holding its aligned label (column 'B'); the index is the
# position of the example the token belongs to.
exploded_values=pd.Series(tokenized_train_dataset['labels']).explode()
exploded_values=pd.DataFrame(exploded_values,columns=['B'])

# For each example, the label value that occurs most often in its label list
# (column 'A').
most_frequent_elem_by_doc=pd.Series(tokenized_train_dataset['labels']).apply(lambda x:  max(set(x), key=x.count))
most_frequent_elem_by_doc=pd.DataFrame(most_frequent_elem_by_doc,columns=list('A'))

# Attach the per-example most frequent label to every token of that example
# by joining on the example index.
df_most_freq_token=exploded_values.merge(most_frequent_elem_by_doc, how='right', left_index=True, right_index=True)

# Fit and score a majority-class dummy classifier against column 'A'.
# NOTE(review): column 'B' (the per-token labels) is never used for fitting or
# scoring, so the score is computed against 'A' only -- confirm this is intended.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(pd.Series(tokenized_train_dataset['input_ids']).explode(), df_most_freq_token['A'])
dummy_clf.score(pd.Series(tokenized_train_dataset['input_ids']).explode(), df_most_freq_token['A'])

0.7197897448947134

# Model

In [None]:
# Data collator for token classification, built around the notebook's tokenizer;
# it is handed to the Trainer below.

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Token-classification model from the same checkpoint, with one output class
# per entry of `label_list`.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

In [None]:
# Load the seqeval metric used for all evaluations below.
metric_seqeval = load_metric("seqeval")
example = wnut["train"][2]

# Sanity check: scoring an example's own tag strings against themselves.
labels = [label_list[i] for i in example["ner_tags"]]
metric_seqeval.compute(predictions=[labels], references=[labels])

{'corporation': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [None]:
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A pair ``(predictions, labels)``. The predictions are reduced with
            an argmax over axis 2; positions whose label is -100 are dropped.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by the seqeval metric.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (ignored index is -100).
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric_seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [None]:
from transformers import EarlyStoppingCallback

# Training configuration: evaluation and checkpoint saving both happen every
# 60 steps, and the best checkpoint is reloaded once training finishes.
training_args = TrainingArguments(
    output_dir='./log_results',
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=16,   
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    warmup_steps=500, 
    eval_steps=60,
    save_steps=60,
    evaluation_strategy="steps",
    load_best_model_at_end=True
)

# Trainer over the merged train + validation data.
# NOTE(review): `eval_dataset` is the *test* split, and it drives both early
# stopping and the best-checkpoint selection above, so the test scores reported
# later are not from data unseen during model selection -- confirm whether a
# separate held-out split should be used here instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 6)]
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id. If tokens, ner_tags, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4403
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1380


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
60,No log,1.846153,0.007656,0.014829,0.010098,0.838057
120,No log,0.456,0.0,0.0,0.0,0.925612
180,No log,0.363021,0.0,0.0,0.0,0.925612
240,No log,0.308753,0.374741,0.167748,0.231754,0.933992
300,No log,0.296595,0.502347,0.198332,0.284385,0.936984
360,No log,0.257092,0.398015,0.334569,0.363545,0.939549
420,No log,0.253311,0.478161,0.385542,0.426886,0.941943
480,No log,0.247058,0.54125,0.401297,0.460883,0.945406
540,0.522800,0.236033,0.520979,0.414272,0.461538,0.945321
600,0.522800,0.252098,0.533019,0.418906,0.469123,0.947458


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id. If tokens, ner_tags, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 64
Saving model checkpoint to ./log_results/checkpoint-60
Configuration saved in ./log_results/checkpoint-60/config.json
Model weights saved in ./log_results/checkpoint-60/pytorch_model.bin
tokenizer config file saved in ./log_results/checkpoint-60/tokenizer_config.json
Special tokens file saved in ./log_results/checkpoint-60/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id. If tokens, ner_tags, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Runnin

TrainOutput(global_step=900, training_loss=0.32720259348551434, metrics={'train_runtime': 349.3096, 'train_samples_per_second': 63.024, 'train_steps_per_second': 3.951, 'total_flos': 381625627721310.0, 'train_loss': 0.32720259348551434, 'epoch': 3.26})

In [None]:
# Run the fine-tuned model on the test split and take the best class per token.
predictions, labels, _ = trainer.predict(tokenized_wnut["test"])
predictions = np.argmax(predictions, axis=2)

# Pair up predicted and gold ids per example, dropping ignored positions (-100).
kept_pairs = [
    [(p, l) for p, l in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_predictions = [[label_list[p] for p, _ in pairs] for pairs in kept_pairs]
true_labels = [[label_list[l] for _, l in pairs] for pairs in kept_pairs]

# Per-entity and overall seqeval scores.
results = metric_seqeval.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id. If tokens, ner_tags, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1287
  Batch size = 64


{'corporation': {'f1': 0.28409090909090906,
  'number': 66,
  'precision': 0.22727272727272727,
  'recall': 0.3787878787878788},
 'creative-work': {'f1': 0.1782178217821782,
  'number': 142,
  'precision': 0.3,
  'recall': 0.1267605633802817},
 'group': {'f1': 0.0670391061452514,
  'number': 165,
  'precision': 0.42857142857142855,
  'recall': 0.03636363636363636},
 'location': {'f1': 0.51875,
  'number': 150,
  'precision': 0.48823529411764705,
  'recall': 0.5533333333333333},
 'overall_accuracy': 0.945320849899534,
 'overall_f1': 0.4615384615384615,
 'overall_precision': 0.5209790209790209,
 'overall_recall': 0.41427247451343835,
 'person': {'f1': 0.6755555555555556,
  'number': 429,
  'precision': 0.6454352441613588,
  'recall': 0.7086247086247086},
 'product': {'f1': 0.1375,
  'number': 127,
  'precision': 0.3333333333333333,
  'recall': 0.08661417322834646}}

In [None]:
# Evaluate on the Trainer's `eval_dataset` (the test split passed to the Trainer above).
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id. If tokens, ner_tags, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 64


{'epoch': 3.26,
 'eval_accuracy': 0.945320849899534,
 'eval_f1': 0.4615384615384615,
 'eval_loss': 0.23603276908397675,
 'eval_precision': 0.5209790209790209,
 'eval_recall': 0.41427247451343835,
 'eval_runtime': 7.4252,
 'eval_samples_per_second': 173.329,
 'eval_steps_per_second': 2.828}

In [None]:
def tag_sentence(text:str) -> pd.DataFrame:
    """Tag every token of a raw sentence with its most probable NER label.

    Args:
        text: The raw, untokenized sentence to tag.

    Returns:
        A DataFrame with one row per token (special tokens included) and two
        columns: 'word' (the decoded token) and 'tag' (the predicted tag name).
    """
    # Local import: torch is not among the notebook's top-level imports.
    import torch

    # Inference only: switch off dropout so repeated calls give the same tags.
    model.eval()

    # convert our text to a tokenized sequence, placed on whatever device the
    # model lives on (instead of assuming a GPU is available)
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to(model.device)

    # get outputs without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # convert to probabilities with softmax
    probs = outputs[0][0].softmax(1)

    # get the tags with the highest probability
    token_ids = inputs['input_ids'][0]
    word_tags = [
        (tokenizer.decode(token_ids[i].item()), id2tag[tagid.item()])
        for i, tagid in enumerate(probs.argmax(axis=1))
    ]

    return pd.DataFrame(word_tags, columns=['word', 'tag'])

In [None]:
# Demo sentence that mentions two places.
text = """Celebrities and tourists from United States are 
flooding into Greece. But a harsh winter isn’t far off"""

# Print the per-token tags predicted by the fine-tuned model.
print(tag_sentence(text))

           word         tag
0         [CLS]           O
1   celebrities           O
2           and           O
3      tourists           O
4          from           O
5        united  B-location
6        states  I-location
7           are           O
8      flooding           O
9          into           O
10       greece  B-location
11            .           O
12          but           O
13            a           O
14        harsh           O
15       winter           O
16          isn           O
17            ’           O
18            t           O
19          far           O
20          off           O
21        [SEP]           O


In [None]:
# Demo sentence that mentions a company and its products.
text="""Apple unveils all-new MacBook Air, 
supercharged by the new M2 chip"""

# Print the per-token tags predicted by the fine-tuned model.
print(tag_sentence(text))

         word            tag
0       [CLS]              O
1       apple  B-corporation
2          un              O
3        ##ve              O
4       ##ils              O
5         all              O
6           -              O
7         new              O
8         mac      B-product
9      ##book      I-product
10        air      I-product
11          ,              O
12      super              O
13  ##charged              O
14         by              O
15        the              O
16        new              O
17         m2      B-product
18       chip              O
19      [SEP]              O


In [None]:
# Clean up: delete the Trainer's output directory (the checkpoints written during training).
!rm -r log_results/