In [1]:
!pip install transformers
!pip install datasets
!pip install seqeval



In [2]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import os
import re
from sklearn.model_selection import train_test_split

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 3 files to the new cache system


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
def read_train_set(train_set_file_path):
    with open(train_set_file_path, "r") as fd:
        raw_text = fd.read().strip()
        raw_docs = re.split(r"\n\t?\n", raw_text)
        token_docs = []
        tag_docs = []
        for doc in raw_docs:
            tokens = []
            tags = []
            for line in doc.split("\n"):
                token, tag = line.split("\t")
                tokens.append(token)
                tags.append(tag)
            token_docs.append(tokens)
            tag_docs.append(tags)
        return token_docs, tag_docs

texts, tags = read_train_set(os.path.join(os.getcwd(), "data/conll/ner_dataset.conll"))

In [4]:
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=0.2)

In [5]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = { tag: id for id, tag in enumerate(unique_tags) }
id2tag = { id: tag for tag, id in tag2id.items() }

In [6]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
train_encodings = tokenizer(train_texts, is_split_into_words=True, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, is_split_into_words=True, truncation=True, padding=True, max_length=512)

In [8]:
def align_labels(tags, encodings):
    labels = []
    for i, label in enumerate(tags):
        word_ids = encodings.word_ids(batch_index=i) 
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]]) # here doubt
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    return labels

train_labels = align_labels(train_tags, train_encodings)
val_labels = align_labels(val_tags, val_encodings)

In [9]:
from datasets import load_metric
import numpy as np
metric = load_metric("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2tag[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [11]:
model = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_cased", num_labels=len(unique_tags))

Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initial

In [12]:
import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NERDataset(train_encodings, train_labels)
val_dataset = NERDataset(val_encodings, val_labels)

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=25,
    weight_decay=0.01,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
class MyTrainer(Trainer):
    def __init__(self,
                model = None, 
                args = None, 
                data_collator = None, 
                train_dataset = None,  
                eval_dataset = None, 
                tokenizer = None, 
                model_init = None,
                compute_metrics = None, 
                callbacks = None, 
                optimizers = (None, None), 
                preprocess_logits_for_metrics = None
                ):

        super().__init__(model=model, 
                        args=training_args,
                        train_dataset=train_dataset,
                        eval_dataset=val_dataset,
                        data_collator=data_collator,
                        tokenizer=tokenizer,
                        compute_metrics=compute_metrics
                        )
    
    def train(self):
        '''
        model.train()
        for epoch in num_epochs:
            for idx, train_inputs in enumerate train dataloader:
                train_loss += training_step(model, train_inputs)
            avg_train_loss = train_loss / len(train dataloader)

        make ouputs in proper format
        make metrics
        return (..., outputs and metrics)
        '''
    def training_step(self, model, inputs):
        '''
        model.train()
        optim.zero_grad()
        * optional: inputs = self._prepare_inputs(inputs)
        loss = compute_loss(model, inputs)
        loss.backward()
        optim.step()
        print loss
        return loss
        '''
    
    def evaluate(self, eval_dataset = None, ignore_keys = None, metric_key_prefix = "eval"):
        '''
        model.eval()
        for idx, val_inputs in enumerate val dataloader:
            validation_step(model, val_inputs)
            val_loss = val_step(model, val_inputs)
        avg_val_loss = val_loss / len(val dataloader)
        
        make ouputs in proper format
        make metrics
        return (..., outputs and metrics)
        '''
        return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)

    def validation_step(self, model, inputs):
        '''
        with torch.no_grad():
            model.eval()
            optim.zero_grad()
            * optional: inputs = self._prepare_inputs(inputs)
            loss = compute_loss(model, inputs)
            optim.step()
            print loss
        return loss
        '''


In [25]:
my_trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [29]:
help(MyTrainer.train)

Help on function train in module __main__:

train(self)
    Main training entry point.
    
    Args:
        resume_from_checkpoint (`str` or `bool`, *optional*):
            If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
            `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
            of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
        trial (`optuna.Trial` or `Dict[str, Any]`, *optional*):
            The trial run or the hyperparameter dictionary for hyperparameter search.
        ignore_keys_for_eval (`List[str]`, *optional*)
            A list of keys in the output of your model (if it is a dictionary) that should be ignored when
            gathering predictions for evaluation during the training.
        kwargs:
            Additional keyword arguments used to hide deprecated arguments



In [None]:
trainer.train()
trainer.evaluate()

***** Running training *****
  Num examples = 264
  Num Epochs = 25
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 225


  0%|          | 0/225 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3632640242576599, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9416267137690302, 'eval_runtime': 183.0875, 'eval_samples_per_second': 0.36, 'eval_steps_per_second': 0.016, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.2578881084918976, 'eval_precision': 0.8688524590163934, 'eval_recall': 0.3081395348837209, 'eval_f1': 0.4549356223175966, 'eval_accuracy': 0.954916309193372, 'eval_runtime': 184.9391, 'eval_samples_per_second': 0.357, 'eval_steps_per_second': 0.016, 'epoch': 2.0}


KeyboardInterrupt: 

In [None]:
trainer.save_model('./saved_model')

Saving model checkpoint to ./saved_model
Configuration saved in ./saved_model/config.json
Model weights saved in ./saved_model/pytorch_model.bin
tokenizer config file saved in ./saved_model/tokenizer_config.json
Special tokens file saved in ./saved_model/special_tokens_map.json


In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "./saved_model"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

loading configuration file ./saved_model/config.json
Model config BertConfig {
  "_name_or_path": "./saved_model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 

In [None]:
print(token_classifier("dropout is set to be 0.4"))

[{'entity_group': 'LABEL_8', 'score': 0.92825025, 'word': 'drop', 'start': 0, 'end': 4}, {'entity_group': 'LABEL_2', 'score': 0.8338615, 'word': '##out', 'start': 4, 'end': 7}, {'entity_group': 'LABEL_6', 'score': 0.9807314, 'word': 'is set to be', 'start': 8, 'end': 20}, {'entity_group': 'LABEL_5', 'score': 0.8858605, 'word': '0', 'start': 21, 'end': 22}, {'entity_group': 'LABEL_8', 'score': 0.17645387, 'word': '.', 'start': 22, 'end': 23}, {'entity_group': 'LABEL_5', 'score': 0.30446973, 'word': '4', 'start': 23, 'end': 24}]


In [None]:
tag2id

{'B-TaskName': 0,
 'I-MetricValue': 1,
 'I-HyperparameterName': 2,
 'B-MetricValue': 3,
 'B-MetricName': 4,
 'B-HyperparameterValue': 5,
 'O': 6,
 'I-MetricName': 7,
 'B-HyperparameterName': 8,
 'B-MethodName': 9,
 'B-DatasetName': 10,
 'I-TaskName': 11,
 'I-DatasetName': 12,
 'I-HyperparameterValue': 13,
 'I-MethodName': 14}

In [None]:
!tar -cvf ner_model.tar saved_model

saved_model/
saved_model/special_tokens_map.json
saved_model/vocab.txt
saved_model/tokenizer.json
saved_model/training_args.bin
saved_model/tokenizer_config.json
saved_model/config.json
saved_model/pytorch_model.bin
