In [1]:
import torch

In [40]:
from datasets import load_dataset

# Load the tner/conll2003 corpus; `dataset` and `conll2003` both refer to the
# same loaded object at this point.
conll2003 = dataset = load_dataset("tner/conll2003")

Found cached dataset conll2003 (/home/artak/.cache/huggingface/datasets/tner___conll2003/conll2003/1.0.0/584600e9fcc12f281243c11ee1532e6cfeb74028655978528ee02b630992dcb1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [27]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [7]:
# Import the AutoTokenizer class from the transformers library
# AutoTokenizer provides access to tokenizers available in the transformers library in a unified way
from transformers import AutoTokenizer

# Name of the pre-trained checkpoint used for both the tokenizer and the model.
# "bert-base-cased" is a BERT model; "cased" means it distinguishes upper- and lower-case text.
model_checkpoint = "bert-base-cased"
# Load the tokenizer that belongs to the checkpoint above.
# from_pretrained returns a ready-to-use tokenizer instance (downloaded and cached on first use).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [9]:
train_dataset = dataset['train']

In [14]:
# Tokenize the first training sentence as one plain string (words joined by spaces).
# NOTE(review): `res` is not referenced again in the visible cells — confirm it is still needed.
res = tokenizer(' '.join(train_dataset[0]['tokens']))

# Show the original word-level tokens of that sentence for comparison.
train_dataset[0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [35]:
# Tokenize the already-split words of the first training sentence;
# is_split_into_words=True tells the tokenizer the input is a list of words, not a raw string.
inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
# Display the resulting tokens (special tokens and sub-word pieces included).
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [37]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [44]:
# Tag name -> integer id, listed in id order (position in the list is the id).
# NOTE(review): presumably this is the id scheme the raw dataset ships with — confirm
# against the dataset card.
label2id = {
    tag: index
    for index, tag in enumerate(
        ["O", "B-ORG", "B-MISC", "B-PER", "I-PER", "B-LOC", "I-ORG", "I-MISC", "I-LOC"]
    )
}

In [56]:
# Old tag id -> new tag id. The new scheme places every B-XXX tag on an odd id
# and its matching I-XXX tag on the following even id.
transformation = dict([
    (0, 0),  # O
    (1, 3),  # B-ORG
    (2, 7),  # B-MISC
    (3, 1),  # B-PER
    (4, 2),  # I-PER
    (5, 5),  # B-LOC
    (6, 4),  # I-ORG
    (7, 8),  # I-MISC
    (8, 6),  # I-LOC
])

# Tag name -> id in the new scheme (position in the list is the id).
new_label2id = {
    tag: index
    for index, tag in enumerate(
        ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    )
}

In [61]:
def transform_tags(example):
    """Remap one example's tag ids through the `transformation` table.

    The example's 'tags' entry is replaced in place with the remapped list,
    and the same example object is returned.
    """
    example['tags'] = [transformation[old_id] for old_id in example['tags']]
    return example

# Remap the tag ids of every example, one example at a time.
# Map from the untouched `dataset` rather than from `conll2003` itself so the cell is
# idempotent: re-running it must not apply the id permutation a second time.
conll2003 = dataset.map(transform_tags, batched=False)

Loading cached processed dataset at /home/artak/.cache/huggingface/datasets/tner___conll2003/conll2003/1.0.0/584600e9fcc12f281243c11ee1532e6cfeb74028655978528ee02b630992dcb1/cache-6e7fbeda2c1f0284.arrow
Loading cached processed dataset at /home/artak/.cache/huggingface/datasets/tner___conll2003/conll2003/1.0.0/584600e9fcc12f281243c11ee1532e6cfeb74028655978528ee02b630992dcb1/cache-d23cfaaafed19696.arrow
Loading cached processed dataset at /home/artak/.cache/huggingface/datasets/tner___conll2003/conll2003/1.0.0/584600e9fcc12f281243c11ee1532e6cfeb74028655978528ee02b630992dcb1/cache-27448911f92ad090.arrow


In [62]:
conll2003['train'][0]

{'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [63]:
# Fix labels according to word_ids
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Label ids, one per word.
        word_ids: For each token, the index of the word it came from, or None
            for tokens that belong to no word (special tokens).

    Returns:
        A list with one label per entry of `word_ids`: -100 where the word id
        is None, the word's label for the first token of a word, and for any
        further token of the same word the word's label with odd ids bumped
        by one (B-XXX -> I-XXX in the odd/even id scheme).
    """
    aligned = []
    previous_word = None

    for wid in word_ids:
        # Tokens without a word get the ignore index.
        if wid is None:
            previous_word = wid
            aligned.append(-100)
            continue

        tag = labels[wid]
        # Continuation piece of the same word: a B- tag (odd id) becomes its I- tag.
        if wid == previous_word and tag % 2 == 1:
            tag += 1

        previous_word = wid
        aligned.append(tag)

    return aligned

In [64]:
# Sanity check on the first training sentence: word-level tags vs. token-aligned labels.
labels = conll2003["train"][0]["tags"]
# `inputs` is the tokenization of the same sentence (train index 0) built in an earlier cell.
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [54]:
word_ids

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [68]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch with a "tokens" entry (list of word lists) and a
            "tags" entry (list of per-word label lists).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding one aligned label list per sentence.
    """
    encoded = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )
    # word_ids(row) gives the token -> word mapping of sentence `row` in the batch.
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(row))
        for row, word_tags in enumerate(examples["tags"])
    ]
    return encoded

In [69]:
# Tokenize every split of conll2003 in batches and attach the token-aligned labels.
# The original columns (names taken from the train split) are removed from the result,
# leaving only what tokenize_and_align_labels returns.
tokenized_datasets = conll2003.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=conll2003["train"].column_names,
)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [70]:
tokenized_datasets['train'][0]

{'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}

In [71]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [129]:
# Collate the first five training examples (passed as a list of per-example dicts) into one batch.
batch = data_collator([tokenized_datasets["train"][i] for i in range(5)])
# Display the batched labels; in the recorded output shorter rows are filled with -100.
batch["labels"]

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100],
        [-100,    5,    6,    6,    6,    0,    0,    0,    0,    0, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100],
        [-100,    0,    3,    4,    0,    0,    0,    0,    0,    0,    7,    0,
            0,    0,    0,    0,    0,    7,    0,    0,    0,    0,    0,    0,
            0,    0,    0

In [78]:
import evaluate
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [85]:
# Quick smoke test of the seqeval metric on a single hand-written sequence.
# NOTE(review): the recorded output shows 0.0 precision/recall/F1 but 0.75 accuracy and
# several warnings — presumably because '0'..'5' are not IOB-style tag names. Confirm, and
# prefer real tags such as 'O' / 'B-PER' for this check.
metric.compute(predictions=[['0','1','2','3']], references=[['0','1','2','5']])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.75}

In [86]:
import numpy as np


label_names = {v:k for k,v in new_label2id.items()}

# Function compute_metrics used to compute the precision, recall
# F1 score, and accuracy of the predictions made by a model.

def compute_metrics(eval_preds):
    """Compute precision, recall, F1 and accuracy for a set of predictions.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class of each
            position is the argmax of `logits` over its last axis, and
            `labels` holds the reference ids with -100 marking positions
            to ignore.

    Returns:
        Dict with keys "precision", "recall", "f1" and "accuracy", taken from
        the "overall_*" entries returned by `metric.compute`.
    """
    logits, labels = eval_preds

    # Highest-scoring class per position.
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, skipping ignored positions (-100).
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    # Predicted tag names at exactly the positions whose reference is not -100.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [label_names[p] for (p, l) in zip(pred_row, label_row) if l != -100]
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [87]:
# Import token classification model to be trained or fine-tuned on tasks such as Named Entity Recognition (NER), Part-of-Speech tagging (POS)
from transformers import AutoModelForTokenClassification

# Build the two label mappings stored in the model configuration.
# id2label maps each integer class id to its tag name (e.g. 1 -> "B-PER");
# label2id is the inverse (tag name -> id).
# label_names is already an {id: name} dict, so it is copied as-is. Iterating it with
# enumerate() would walk over its integer keys and produce {0: 0, 1: 1, ...} instead of names.
id2label = dict(label_names)
label2id = {label: idx for idx, label in id2label.items()}

# Load a pre-trained model for token classification from the checkpoint specified by model_checkpoint,
# configured with the label mappings defined above.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Construct (and display) a CUDA device object.
# This expression alone does not move the model or change where tensors are allocated.
torch.device('cuda')

[2024-05-12 18:41:17,942] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cl

device(type='cuda')

In [107]:
# Inspect the token-classification head of the model.
# NOTE(review): this cell held an unfinished `def train()` stub (a SyntaxError); its recorded
# output is a Linear(768 -> 9) layer, which presumably came from this expression — confirm.
model.classifier

Linear(in_features=768, out_features=9, bias=True)

In [100]:
#  imports the accelerate library, which is a PyTorch utility for easy multi-GPU and TPU training.
import accelerate

# Import the TrainingArguments class from the transformers library.
# Class is used to set various parameters for training a model.
from transformers import TrainingArguments

args = TrainingArguments(

    # Output directory where the model predictions and checkpoints will be written.
    output_dir="bert-finetuned-ner4",

    # Model will be evaluated at the end of each epoch.
    evaluation_strategy="epoch",

    # Checkpoint saving is left at the library default; the training log in this notebook
    # shows checkpoints written every 500 steps. Uncomment to save once per epoch instead.
    #save_strategy="epoch",

    # Learning rate for the optimizer.
    # Controls how much to change the model in response to the estimated error each time the model weights are updated.
    learning_rate=2e-5,

    # Total number of training epochs to perform.
    # An epoch is one complete pass through the entire training dataset.
    num_train_epochs=3,

    # Weight decay to apply (if not zero).
    # Weight decay is a regularization technique that penalizes large weights to reduce overfitting.
    weight_decay=0.01,

    # Do not push the model, tokenizer, and model configuration to the Hugging Face Model Hub.
    push_to_hub=False,

    # Do not report training logs to any experiment-tracking integration.
    report_to = "none",
)

PyTorch: setting up devices


In [101]:
# This line imports the Trainer class from the transformers library.
# This class provides a simple way to train and fine-tune the models.
from transformers import Trainer

# This line creates an instance of the Trainer class with the specified parameters
# Assemble the Trainer from the pieces built in the cells above, then run training.
trainer = Trainer(
    # Training configuration (epochs, learning rate, evaluation schedule, ...).
    args=args,
    # The model to fine-tune.
    model=model,
    # Tokenizer handed to the Trainer alongside the model.
    tokenizer=tokenizer,
    # Collates individual examples into batches.
    data_collator=data_collator,
    # Split used for the optimization steps.
    train_dataset=tokenized_datasets["train"],
    # Split used for evaluation.
    eval_dataset=tokenized_datasets["validation"],
    # Function that turns evaluation predictions into the reported metrics.
    compute_metrics=compute_metrics,
)
trainer.train()

***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5268
  Number of trainable parameters = 107726601


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0852,0.072473,0.903453,0.929148,0.91612,0.981325
2,0.0349,0.067995,0.922875,0.946483,0.93453,0.985532
3,0.0172,0.062807,0.930551,0.949344,0.939853,0.986407


Saving model checkpoint to bert-finetuned-ner4/checkpoint-500
Configuration saved in bert-finetuned-ner4/checkpoint-500/config.json
Model weights saved in bert-finetuned-ner4/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner4/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner4/checkpoint-500/special_tokens_map.json
Saving model checkpoint to bert-finetuned-ner4/checkpoint-1000
Configuration saved in bert-finetuned-ner4/checkpoint-1000/config.json
Model weights saved in bert-finetuned-ner4/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner4/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner4/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to bert-finetuned-ner4/checkpoint-1500
Configuration saved in bert-finetuned-ner4/checkpoint-1500/config.json
Model weights saved in bert-finetuned-ner4/checkpoint-1500/pytorch_model.bin
tokenizer config

TrainOutput(global_step=5268, training_loss=0.06622885272823326, metrics={'train_runtime': 371.7676, 'train_samples_per_second': 113.305, 'train_steps_per_second': 14.17, 'total_flos': 968504132020698.0, 'train_loss': 0.06622885272823326, 'epoch': 3.0})

In [105]:
# NOTE(review): leftover IPython introspection (`??` shows the object's source/repr);
# consider removing it before sharing the notebook.
??model

[0;31mSignature:[0m      [0mmodel[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           BertForTokenClassification
[0;31mString form:[0m   
BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
     

In [104]:
tokenized_datasets['train'][0:2]

{'input_ids': [[101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  [101, 1943, 14428, 102]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100], [-100, 1, 2, -100]]}

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

def train(model, train_loader, criterion, optimizer, device):
    """Run one optimization pass over `train_loader`.

    Args:
        model: Module called as ``model(inputs)``.
        train_loader: Iterable of ``(inputs, targets)`` batches that also
            exposes a sized ``.dataset`` attribute.
        criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        optimizer: Optimizer updating the model's parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Sum of per-batch losses, each weighted by the batch's first dimension,
        divided by ``len(train_loader.dataset)``.
    """
    model.train()  # switch to training mode
    loss_sum = 0.0

    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss.
        batch_loss = criterion(model(batch_inputs), batch_targets)

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by the number of rows in the batch.
        loss_sum += batch_loss.item() * batch_inputs.size(0)

    return loss_sum / len(train_loader.dataset)

def evaluate(model, val_loader, criterion, device):
    """Compute the average loss over `val_loader` without updating the model.

    NOTE(review): this function name shadows the `evaluate` module imported
    earlier in the notebook; after this cell runs, `evaluate.load(...)` no
    longer refers to the library.

    Args:
        model: Module called as ``model(inputs)``.
        val_loader: Iterable of ``(inputs, targets)`` batches that also
            exposes a sized ``.dataset`` attribute.
        criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Sum of per-batch losses, each weighted by the batch's first dimension,
        divided by ``len(val_loader.dataset)``.
    """
    model.eval()  # switch to evaluation mode
    loss_sum = 0.0

    # Gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for batch_inputs, batch_targets in val_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            batch_loss = criterion(model(batch_inputs), batch_targets)

            loss_sum += batch_loss.item() * batch_inputs.size(0)

    return loss_sum / len(val_loader.dataset)


def train_epochs(model, train_loader, val_loader, criterion, optimizer, device, num_epochs):
    """Alternate training and evaluation passes for `num_epochs` epochs.

    After every epoch the training and validation losses are printed on one line.

    Returns:
        Tuple ``(train_losses, val_losses)``: two lists holding one loss
        value per epoch.
    """
    train_losses, val_losses = [], []

    epoch = 0
    while epoch < num_epochs:
        # One optimization pass, then one evaluation pass.
        train_loss = train(model, train_loader, criterion, optimizer, device)
        val_loss = evaluate(model, val_loader, criterion, device)

        train_losses += [train_loss]
        val_losses += [val_loss]

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
        epoch += 1

    return train_losses, val_losses


In [118]:
tokenized_datasets['train'][0]

{'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}

In [126]:
# The collator expects a list of per-example dicts. Slicing the dataset ([0:5]) returns a
# single dict of columns instead, so build the list of examples explicitly.
data_collator([tokenized_datasets['train'][i] for i in range(5)])

In [None]:
from torch.utils.data import Dataset, DataLoader

class CONLDataset(Dataset):
    """Map-style PyTorch dataset that wraps an indexable collection of examples."""

    def __init__(self, dataset, collator):
        """
        Arguments:
            dataset: Sized, indexable collection of examples
                (e.g. one split of the tokenized dataset).
            collator (callable): Batch-collation function. It is only stored on
                the instance and is not applied per item; pass it to the
                DataLoader as ``collate_fn`` to build batches.
        """
        self.dataset = dataset
        self.collator = collator

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the single example stored at position `idx`."""
        return self.dataset[idx]

In [127]:
# NOTE(review): leftover IPython introspection, duplicated from an earlier cell;
# consider removing it before sharing the notebook.
??model

[0;31mSignature:[0m      [0mmodel[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           BertForTokenClassification
[0;31mString form:[0m   
BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
     