# Setup

## Configs

In [1]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
# Forwarded to the tokenizer as `max_length` when the documents are tokenized.
TRAINING_MAX_LENGTH = 1024
# Directory handed to `TrainingArguments(output_dir=...)`.
OUTPUT_DIR = "output"

## Imports

In [2]:
!pip install seqeval evaluate -q

In [3]:
import json
import pandas as pd
import numpy as np
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features

from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

2024-03-29 12:39:02.240985: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 12:39:02.241110: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 12:39:02.370707: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data (pre)processing

## Get data

In [4]:
# Get training data. The context manager closes the file handle as soon as the
# JSON has been parsed (the previous bare `open(...)` left it open), and the
# encoding is set explicitly so the result does not depend on the locale.
with open(
    '/kaggle/input/pii-detection-removal-from-educational-data/train.json',
    encoding='utf-8',
) as train_file:
    train_data = json.load(train_file)

In [5]:
# Number of top-level entries loaded from train.json
len(train_data)

6807

## Mapping

In [6]:
# Label vocabulary: every distinct tag found in the training documents, sorted
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in train_data)))

# Two-way lookup between label strings and integer ids
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Every label except the 'O' tag
target = [label for label in all_labels if label != 'O']

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


Transforming the labels into IDs serves several purposes in token classification tasks:

**Consistency in Representation**: By converting labels into IDs, we create a consistent numerical representation for each label. This makes it easier to handle and process the labels during model training and evaluation. Rather than dealing with strings (e.g., "B-EMAIL", "I-NAME_STUDENT"), which are prone to typos or inconsistencies, we work with integer IDs, which are standardized.

**Input for Model**: Most machine learning models, including neural networks commonly used in token classification tasks, operate on numerical inputs. By converting labels into IDs, we make it easier to feed the data into the model. Neural networks require numerical inputs for computation, so feeding textual labels directly into the model isn't feasible.

**Efficient Memory Usage**: IDs typically take up less memory than string labels. When dealing with large datasets, especially in deep learning where memory efficiency is crucial, using numerical IDs can help reduce memory consumption.

## Tokenization

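The dataset ships with its own tokenization, which differs from the tokenization produced by the model's tokenizer. We therefore need a way to map each token produced by the model's tokenizer back to the corresponding label in the training set.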

In [7]:
# Expand token-level labels to character-level labels so they can be re-aligned after the model's tokenization
def rebuild_text(data):
    """Rebuild a document piece by piece together with one label per character.

    Args:
        data: mapping holding three parallel sequences under the keys
            "tokens", "provided_labels" and "trailing_whitespace".

    Returns:
        A ``(text, labels)`` tuple. ``text`` is a list of string pieces: each
        token, followed by a single " " when its whitespace flag is truthy.
        ``labels`` holds one label per character of those pieces; every
        character of a token carries the token's label and every inserted
        space is labelled "O".
    """
    pieces = []
    char_labels = []

    triples = zip(data["tokens"], data["provided_labels"], data["trailing_whitespace"])
    for token, label, has_space in triples:
        # one label entry per character of the token
        pieces.append(token)
        char_labels += [label] * len(token)

        # the separating space never belongs to an entity
        if has_space:
            pieces += [" "]
            char_labels += ["O"]

    return pieces, char_labels

In [8]:
# Prepare data to be fed to the model & attribute labels to new token format
def tokenize(data, tokenizer, label2id, max_length):
    """Tokenize one document and assign a label id to every model token.

    The document is rebuilt as a single string with one label per character
    (see `rebuild_text`). Each token produced by `tokenizer` then takes the
    label of the first character it covers, skipping one leading whitespace
    character.

    Args:
        data: one dataset row with "tokens", "provided_labels" and
            "trailing_whitespace".
        tokenizer: callable tokenizer that accepts `return_offsets_mapping`,
            `truncation` and `max_length`, and whose result exposes
            `offset_mapping` and `input_ids`.
        label2id: mapping from label string to integer id; must contain "O".
        max_length: maximum number of tokens kept; longer inputs are truncated.

    Returns:
        A dict with every field of the tokenizer output plus "labels" (one
        label id per token) and "length" (number of tokens).
    """
    pieces, char_labels = rebuild_text(data)
    text = "".join(pieces)
    token_labels = []

    # `truncation=True` is passed explicitly: supplying `max_length` alone makes
    # the tokenizer emit a warning and fall back to an implicit strategy.
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )

    for start_idx, end_idx in tokenized.offset_mapping:

        # Tokens with an empty (0, 0) span (e.g. CLS) get the "O" label
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # If the token starts with whitespace, take the label of the next
        # character. The bounds check keeps a whitespace character at the very
        # end of the text from indexing past the label list.
        if text[start_idx].isspace() and start_idx + 1 < len(char_labels):
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}

In [9]:
# Load the tokenizer from the same checkpoint identifier that the model is loaded from
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [10]:
# Column-oriented view of the training documents. The original "labels" field
# is stored as "provided_labels"; `tokenize` later adds its own "labels" column.
ds = Dataset.from_dict(
    dict(
        full_text=[doc["full_text"] for doc in train_data],
        document=[str(doc["document"]) for doc in train_data],
        tokens=[doc["tokens"] for doc in train_data],
        trailing_whitespace=[doc["trailing_whitespace"] for doc in train_data],
        provided_labels=[doc["labels"] for doc in train_data],
    )
)

In [11]:
# Apply `tokenize` to every row of the dataset, using 3 worker processes
ds = ds.map(
    tokenize,
    fn_kwargs=dict(
        tokenizer=tokenizer,
        label2id=label2id,
        max_length=TRAINING_MAX_LENGTH,
    ),
    num_proc=3,
)

    

#0:   0%|          | 0/2269 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 

#1:   0%|          | 0/2269 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 

#2:   0%|          | 0/2269 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Sanity check on the first document: show the non-"O" labels of the original
# tokens, then those of the model tokens after re-alignment
x = ds[0]

for pair in zip(x["tokens"], x["provided_labels"]):
    if pair[1] != "O":
        print(pair)

print("*"*100)

for token, label_id in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    label = id2label[label_id]
    if label != "O":
        print((token, label))

# Modeling

## Metrics

In [13]:
def compute_metrics(p, all_labels, beta=5):
    """Compute recall, precision and an F-beta score for token classification.

    Args:
        p: tuple ``(predictions, labels)``. ``predictions`` holds one score per
            label for each token and is reduced with an argmax over axis 2
            (presumably shaped (batch, seq_len, num_labels) — confirm against
            the caller). ``labels`` holds the true label ids, with -100 marking
            positions to ignore.
        all_labels: sequence mapping a label id to its label string.
        beta: weight of recall relative to precision in the F-beta score.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        Dict with 'recall', 'precision' and 'f1'. The 'f1' key is kept for
        compatibility with `metric_for_best_model="f1"`, but its value is the
        F-beta score, not a plain F1.
    """
    # p is a tuple containing preds and true labels
    predictions, labels = p
    # one score per label for each token => keep the index of the highest one
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (label -100) from both preds and labels in one pass
    true_predictions, true_labels = [], []
    for prediction, label in zip(predictions, labels):
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(prediction, label)
            if label_id != -100
        ]
        true_predictions.append([all_labels[pred_id] for pred_id, _ in kept])
        true_labels.append([all_labels[label_id] for _, label_id in kept])

    # Entity-level recall / precision from seqeval
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R).
    # When precision and recall are both 0 the denominator is 0; report 0.0
    # instead of raising ZeroDivisionError or producing NaN.
    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    f_beta = (1 + beta_sq) * recall * precision / denominator if denominator else 0.0

    # Store metrics and return
    results = {
        'recall': recall,
        'precision': precision,
        'f1': f_beta
    }

    return results

In [14]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per label in `all_labels`; the id<->label mappings are passed along
# so they travel with the model.
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    # NOTE(review): presumably tolerates a size mismatch between checkpoint
    # weights and this model's head — confirm it is needed for this checkpoint
    ignore_mismatched_sizes=True
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Creates the collator object (tailored for token classification tasks) that the
# Trainer receives as `data_collator`.
# NOTE(review): pad_to_multiple_of=16 presumably pads each batch to a length
# that is a multiple of 16 — confirm against the collator documentation.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

## Training

In [16]:
# Define training arguments (grouped by concern)
args = TrainingArguments(
    # --- output / checkpoints ---
    output_dir=OUTPUT_DIR,
    save_total_limit=1,
    # --- logging ---
    report_to="none",
    logging_steps=20,
    # --- batching and precision ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=True,
    # --- optimisation schedule ---
    num_train_epochs=1,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    # --- evaluation (switched off for this run) ---
    evaluation_strategy="no",
    do_eval=False,
    metric_for_best_model="f1",
    greater_is_better=True,
)

Let's go through each argument in the TrainingArguments object:

* **output_dir**: This specifies the directory where the trained model and other outputs (like logs and checkpoints) will be saved.

* **fp16**: If set to True, it enables mixed precision training using FP16 (16-bit floating point precision). This can help speed up training and reduce memory usage.

* **learning_rate**: This sets the initial learning rate for the optimizer.

* **num_train_epochs**: This specifies the number of training epochs (passes through the entire training dataset) during training.

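* **per_device_train_batch_size**: This sets the batch size per device (e.g. per GPU) during training. Combined with `gradient_accumulation_steps=2`, the effective batch size here is 4 × 2 = 8 examples per device for each weight update.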

* **gradient_accumulation_steps**: This specifies the number of gradient accumulation steps. Gradients are accumulated over multiple steps before performing a weight update. This can be useful when the batch size is limited by memory constraints.

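* **report_to**: This specifies which logging integrations (e.g. TensorBoard, Weights & Biases) receive the training metrics. Setting it to "none" disables all of them; the training loss is still displayed in the notebook output.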

* **evaluation_strategy**: This defines when evaluation should be performed during training. Setting it to "no" means no evaluation will be performed.

* **do_eval**: If set to True, evaluation will be performed during training.

* **save_total_limit**: This sets the maximum number of checkpoints to save during training.

* **logging_steps**: This specifies how often (in terms of training steps) logging information (like loss and learning rate) should be printed during training.

* **lr_scheduler_type**: This specifies the type of learning rate scheduler to use. Common options include 'linear', 'cosine', and 'constant'.

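* **metric_for_best_model**: This specifies the metric used to determine the best model checkpoint during training. In this case, it's set to "f1", the key under which `compute_metrics` returns its F-score (an F-beta score with β = 5, which weights recall more heavily than precision). Since evaluation is disabled in this run, no such metric is computed during training.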

* **greater_is_better**: This indicates whether a higher value of the specified metric is better. In this case, it's set to True, indicating that a higher F1 score is better.

* **warmup_ratio**: This specifies the ratio of warmup steps to total training steps. Warmup steps gradually increase the learning rate from zero to its initial value.

* **weight_decay**: This specifies the weight decay (L2 regularization) to apply to model parameters during optimization.

In [17]:
# Define trainer object (responsible for orchestrating the training process).
# Only a training set is supplied: no eval_dataset is passed and `args` sets
# evaluation_strategy="no".
trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=ds,
    data_collator=collator, 
    tokenizer=tokenizer,
    # `all_labels` is bound up front so the callback only needs the prediction tuple
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
%%time
# Run the fine-tuning loop; `%%time` (which must stay on the first line) reports how long the cell took
trainer.train()

Step,Training Loss
20,2.7347
40,1.1778
60,0.0341
80,0.0081
100,0.0069
120,0.0072
140,0.0148
160,0.0116
180,0.0132
200,0.0116


CPU times: user 18min 9s, sys: 4min 21s, total: 22min 31s
Wall time: 22min 31s


TrainOutput(global_step=851, training_loss=0.09749114885160176, metrics={'train_runtime': 1350.9476, 'train_samples_per_second': 5.039, 'train_steps_per_second': 0.63, 'total_flos': 3161498795311008.0, 'train_loss': 0.09749114885160176, 'epoch': 1.0})

## Save model

In [19]:
# Save the fine-tuned model and the tokenizer into the same directory,
# "deberta3base_1024", so both can be reloaded from one path.
trainer.save_model("deberta3base_1024")
tokenizer.save_pretrained("deberta3base_1024")

('deberta3base_1024/tokenizer_config.json',
 'deberta3base_1024/special_tokens_map.json',
 'deberta3base_1024/spm.model',
 'deberta3base_1024/added_tokens.json',
 'deberta3base_1024/tokenizer.json')