In [9]:
!pip install transformers
!pip install datasets
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.6 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=b73b70d0bf1848e7331a8cf8c5976700a27dc9b47df8e923cc0f11a52540ee3d
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [54]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import os
import re
from sklearn.model_selection import train_test_split

In [68]:
def read_train_set(train_set_file_path):
    with open(train_set_file_path, "r") as fd:
        raw_text = fd.read().strip()
        raw_docs = re.split(r"\n\t?\n", raw_text)
        token_docs = []
        tag_docs = []
        for doc in raw_docs:
            tokens = []
            tags = []
            for line in doc.split("\n"):
                token, tag = line.split("\t")
                tokens.append(token)
                tags.append(tag)
            token_docs.append(tokens)
            tag_docs.append(tags)
        return token_docs, tag_docs

texts, tags = read_train_set(os.path.join(os.getcwd(), "ner_dataset.conll"))

In [101]:
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=0.2)

In [102]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = { tag: id for id, tag in enumerate(unique_tags) }
id2tag = { id: tag for tag, id in tag2id.items() }

In [103]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
train_encodings = tokenizer(train_texts, is_split_into_words=True, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, is_split_into_words=True, truncation=True, padding=True, max_length=512)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_cased/snapshots/ddf0be025f8e432a1870e34811997ba6725bf04a/config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_cased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31116
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_cased/snapshots/ddf0be025f8e432

In [104]:
def align_labels(tags, encodings):
    labels = []
    for i, label in enumerate(tags):
        word_ids = encodings.word_ids(batch_index=i) 
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]]) # here doubt
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    return labels

train_labels = align_labels(train_tags, train_encodings)
val_labels = align_labels(val_tags, val_encodings)

In [105]:
from datasets import load_metric
import numpy as np
metric = load_metric("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2tag[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [106]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [107]:
model = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_cased", num_labels=len(unique_tags))

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_cased/snapshots/ddf0be025f8e432a1870e34811997ba6725bf04a/config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_cased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 

In [108]:
import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NERDataset(train_encodings, train_labels)
val_dataset = NERDataset(val_encodings, val_labels)

In [109]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=25,
    weight_decay=0.01,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [110]:
trainer.train()
trainer.evaluate()

***** Running training *****
  Num examples = 264
  Num Epochs = 25
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 225
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.363476,0.0,0.0,0.0,0.940535
2,No log,0.26165,0.648148,0.255474,0.366492,0.950058
3,No log,0.213975,0.67,0.326034,0.438625,0.953232
4,No log,0.171475,0.733607,0.435523,0.546565,0.95884
5,No log,0.139912,0.726027,0.515815,0.603129,0.965189
6,No log,0.11737,0.675978,0.588808,0.629389,0.969315
7,No log,0.088947,0.706897,0.698297,0.70257,0.977251
8,No log,0.0701,0.748826,0.776156,0.762246,0.981272
9,No log,0.055975,0.795238,0.812652,0.803851,0.986139
10,No log,0.045615,0.821596,0.851582,0.83632,0.989313


***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32
***** Running Evaluation *****
  Num examples = 66
  Batch siz

{'eval_loss': 0.010896655730903149,
 'eval_precision': 0.954653937947494,
 'eval_recall': 0.9732360097323601,
 'eval_f1': 0.963855421686747,
 'eval_accuracy': 0.9978838218177971,
 'eval_runtime': 0.7851,
 'eval_samples_per_second': 84.061,
 'eval_steps_per_second': 3.821,
 'epoch': 25.0}

In [111]:
trainer.save_model('./saved_model')

Saving model checkpoint to ./saved_model
Configuration saved in ./saved_model/config.json
Model weights saved in ./saved_model/pytorch_model.bin
tokenizer config file saved in ./saved_model/tokenizer_config.json
Special tokens file saved in ./saved_model/special_tokens_map.json


In [112]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "./saved_model"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

loading configuration file ./saved_model/config.json
Model config BertConfig {
  "_name_or_path": "./saved_model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 

In [137]:
print(token_classifier("dropout is set to be 0.4"))

[{'entity_group': 'LABEL_8', 'score': 0.92825025, 'word': 'drop', 'start': 0, 'end': 4}, {'entity_group': 'LABEL_2', 'score': 0.8338615, 'word': '##out', 'start': 4, 'end': 7}, {'entity_group': 'LABEL_6', 'score': 0.9807314, 'word': 'is set to be', 'start': 8, 'end': 20}, {'entity_group': 'LABEL_5', 'score': 0.8858605, 'word': '0', 'start': 21, 'end': 22}, {'entity_group': 'LABEL_8', 'score': 0.17645387, 'word': '.', 'start': 22, 'end': 23}, {'entity_group': 'LABEL_5', 'score': 0.30446973, 'word': '4', 'start': 23, 'end': 24}]


In [87]:
tag2id

{'B-TaskName': 0,
 'I-MetricValue': 1,
 'I-HyperparameterName': 2,
 'B-MetricValue': 3,
 'B-MetricName': 4,
 'B-HyperparameterValue': 5,
 'O': 6,
 'I-MetricName': 7,
 'B-HyperparameterName': 8,
 'B-MethodName': 9,
 'B-DatasetName': 10,
 'I-TaskName': 11,
 'I-DatasetName': 12,
 'I-HyperparameterValue': 13,
 'I-MethodName': 14}

In [138]:
!tar -cvf ner_model.tar saved_model

saved_model/
saved_model/special_tokens_map.json
saved_model/vocab.txt
saved_model/tokenizer.json
saved_model/training_args.bin
saved_model/tokenizer_config.json
saved_model/config.json
saved_model/pytorch_model.bin
