# Training Notebook

# Overview
- Update the notebooks to fetch the datasets when running on Kaggle
## Training Notes
- Trained on Kaggle GPU T4 x2
    
## Changes and Iterations


# Imports


In [1]:
!pip install seqeval evaluate -q
!pip download  seqeval -q

In [2]:
from pathlib import Path
import os

import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features
import numpy as np
import pandas as pd

2024-03-08 01:00:19.768433: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 01:00:19.768546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 01:00:19.953456: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Configuration

In [3]:
# ---- Data / model locations ----
# Competition data root (currently disabled):
# DATA_PATH = '../input/pii-detection-removal-from-educational-data'

# JSON file holding the training examples
TRAINING_DATA_PATH = "/kaggle/input/pii-detect-miniset-and-validation-ds/mini_no_overlap.json"
# Pretrained backbone checkpoint to fine-tune
TRAINING_MODEL_PATH = "microsoft/deberta-v3-xsmall"
# Maximum tokenized sequence length used for training
TRAINING_MAX_LENGTH = 1024

# Open question: should the settings below live here or in the notebook body?
# TRAIN_BATCH_SIZE = 2 * 8 # size of the input batch in training, x 2 as two GPUs
# EPOCHS = 6 # number of epochs to train
# LR_MODE = "exp" # lr scheduler mode from one of "cos", "step", "exp"

# ---- Outputs ----
OUTPUT_DIR = "/kaggle/working/"
FINE_TUNED_NAME = "deberta3_xsmall_pii2d_1024_mini_v1"

# ---- Reproducibility ----
NOTEBOOK_SEED = 42

# ---- Label vocabulary (BIO tags) ----
LABEL_SET = [
    "B-EMAIL",
    "B-ID_NUM",
    "B-NAME_STUDENT",
    "B-PHONE_NUM",
    "B-STREET_ADDRESS",
    "B-URL_PERSONAL",
    "B-USERNAME",
    "I-ID_NUM",
    "I-NAME_STUDENT",
    "I-PHONE_NUM",
    "I-STREET_ADDRESS",
    "I-URL_PERSONAL",
    "O",
]

# Optional debugging aid: list every file under DATA_PATH
# for dirname, _, filenames in os.walk(DATA_PATH):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [4]:
# Settings forwarded to the Trainer API.
# A constant that is commented out here is also commented out in the
# TrainingArguments call further down.

# --- Optimisation ---
# LR = 2e-5  # Initial learning rate
NUM_TRAIN_EPOCHS = 2             # Number of training epochs
PER_DEVICE_TRAIN_BATCH_SIZE = 4  # Batch size per GPU
GRADIENT_ACCUMULATION_STEPS = 2  # Batches to accumulate before each optimizer step (helps when GPU memory limits batch size)
# WARMUP_RATIO = 0.1  # Steps to gradually increase learning rate. Can help stabilize training at beginning
# WEIGHT_DECAY = 0.01  # L2 regularization to prevent overfitting

# --- Evaluation ---
DO_EVAL = False             # Whether to run evaluation during training
EVALUATION_STRATEGY = "no"  # When to evaluate during training {no, steps or epoch}
# LOAD_BEST_MODEL_AT_END = True  # Load the best model at the end of training
# METRIC_FOR_BEST_MODEL = "f5"  # Metric to determine the best model ("accuracy", f1...)
# GREATER_IS_BETTER = True  # If higher eval metric is better. True for f1 and acc

# --- Logging / checkpoints ---
REPORT_TO = "none"    # Where to report training progress; "none" prevents wandb login
LOGGING_STEPS = 100   # Log training progress every X steps
SAVE_TOTAL_LIMIT = 1  # How many checkpoints to keep (1 means most recent)
# LOGGING_DIR = OUTPUT_DIR + "/logs"  # Directory to save training logs

## Hardware Selection?

## Data Selection

In [5]:
# Load the training examples from the original training JSON.
# A context manager guarantees the file handle is closed once parsing is done.
with open(TRAINING_DATA_PATH, encoding="utf-8") as training_file:
    data = json.load(training_file)

# Keep a DataFrame view of the raw records; train_df is the frame used for training
org_data_df = pd.DataFrame(data)
train_df = org_data_df

print("Training Data: ", len(data))

Training Data:  1837


# Tokenize Data

In [6]:
#prep data for NER training by tokenize the text and align labels to tokens
def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one example and align its labels with the produced tokens.

    The text is rebuilt from ``example["tokens"]`` (adding a single space after
    every token whose ``trailing_whitespace`` flag is set) together with a
    per-character label list. Each token returned by ``tokenizer`` then gets
    the label of the first non-whitespace character it covers.

    Args:
        example (dict): Must provide the parallel lists "tokens", "labels"
            and "trailing_whitespace".
        tokenizer (Tokenizer): Callable used to tokenize the rebuilt text; it
            is called with ``return_offsets_mapping=True`` and truncation
            enabled, and its result must expose ``offset_mapping`` and
            ``input_ids``.
        label2id (dict): A dictionary mapping label strings to integer ids.
            Must contain the key "O".
        max_length (int): The maximum length of the tokenized text.

    Returns:
        dict: All tokenizer outputs plus "labels" (one label id per token)
        and "length" (number of input ids).

    Reference: credit to https://www.kaggle.com/code/valentinwerner/915-deberta3base-training/notebook
    """
    # Rebuild the text piece by piece while recording one label per character.
    pieces = []
    char_labels = []
    for token, label, has_space in zip(
        example["tokens"], example["labels"], example["trailing_whitespace"]
    ):
        pieces.append(token)
        # every character of the token carries the token's label
        char_labels.extend([label] * len(token))
        if has_space:
            # the separating space never belongs to an entity
            pieces.append(" ")
            char_labels.append("O")

    # Join once and reuse the same string for tokenization and offset lookups.
    text = "".join(pieces)

    # Tokenize and request (start, end) character offsets for every token.
    tokenized = tokenizer(text, return_offsets_mapping=True, max_length=max_length, truncation=True)

    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # A (0, 0) offset marks a special token (e.g. CLS): label it "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # When the token starts with whitespace, use the next character's label.
        if text[start_idx].isspace():
            start_idx += 1

        # A whitespace-only token at the very end of the text would otherwise
        # index one past the last character; clamp to the last valid position.
        if start_idx >= len(char_labels):
            start_idx = len(char_labels) - 1

        token_labels.append(label2id[char_labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}

In [7]:
# Set up labeling for NER. Targets: B- beginning of entity, I- inside entity, O outside entity

# Collect every unique label that occurs in the loaded training data.
# chain.from_iterable flattens the per-document label lists lazily.
all_labels = sorted(set(chain.from_iterable(x["labels"] for x in data)))

# Create dictionary of label to id
label2id = {l: i for i, l in enumerate(all_labels)}

# Create dictionary of id to label
id2label = {v: k for k, v in label2id.items()}

# All possible target labels (B-/I- variant for each entity type, plus "O").
# The I- row previously repeated 'B-USERNAME' where 'I-USERNAME' belongs.
target = [
    'B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM',
    'B-URL_PERSONAL', 'B-STREET_ADDRESS',
    'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM',
    'I-URL_PERSONAL', 'I-STREET_ADDRESS', 'O'
]

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-URL_PERSONAL', 5: 'B-USERNAME', 6: 'I-NAME_STUDENT', 7: 'I-PHONE_NUM', 8: 'O'}


In [8]:
# Tokenizer matching the pretrained backbone
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# Wrap the training frame in a Hugging Face Dataset and run `tokenize` over
# every example, in parallel with 2 worker processes (num_proc=2).
ds = Dataset.from_pandas(train_df).map(
    tokenize,
    # keyword arguments forwarded to `tokenize` for each example
    fn_kwargs=dict(
        tokenizer=tokenizer,
        label2id=label2id,
        max_length=TRAINING_MAX_LENGTH,
    ),
    num_proc=2,
)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



   

#0:   0%|          | 0/919 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/918 [00:00<?, ?ex/s]

# Metrics and Training

In [9]:
#TODO- Review and confirm works
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score


def compute_metrics(p, all_labels):
    """Compute the recall, precision and F5 metrics for a NER task.

    Args:
        p (Tuple[np.ndarray, np.ndarray]): The predictions and labels. The
            predictions are reduced with argmax over axis 2.
        all_labels (List[str]): The list of all possible labels, indexed by
            label id.

    Returns:
        Dict[str, float]: The computed metrics under the keys "recall",
        "precision" and "f5". "f5" is 0.0 when precision and recall are both 0.
    Ref: https://www.kaggle.com/code/valentinwerner/915-deberta3base-training/notebook
    """
    # Note: seqeval is a framework for sequence labeling evaluation like NER

    # Unpack the predictions and labels; pick the highest-scoring class id per token
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop positions whose label is the ignored index (-100) and map ids to label strings
    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(prediction, label)
            if label_id != -100
        ]
        true_predictions.append([all_labels[pred_id] for pred_id, _ in kept])
        true_labels.append([all_labels[label_id] for _, label_id in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta with beta = 5. Guard the denominator: when precision and recall
    # are both 0 the formula would divide by zero, so report 0.0 instead.
    beta_squared = 5 * 5
    denominator = beta_squared * precision + recall
    if denominator > 0:
        f5_score = (1 + beta_squared) * recall * precision / denominator
    else:
        f5_score = 0.0

    results = {
        'recall': recall,
        'precision': precision,
        'f5': f5_score
    }
    return results

In [10]:
# Load the pretrained backbone with a token-classification head
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    # label <-> id lookup tables used for fine-tuning
    label2id=label2id,
    id2label=id2label,
    # size of the classification head: one output per unique label
    num_labels=len(all_labels),
    # the pretrained checkpoint might have been trained with a different number of labels
    ignore_mismatched_sizes=True,
)

# Batches dataset samples together; padding to a multiple of 16 might be
# beneficial for the GPU architecture
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

pytorch_model.bin:   0%|          | 0.00/241M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Configure the training process.
# No validation set is specified, so evaluation is disabled via the config constants.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,  # Directory to save checkpoints and logs
    fp16=True,  # mixed-precision training on 16 bit to reduce memory and speed up training
    seed=NOTEBOOK_SEED,  # apply the seed from the configuration cell so it actually drives the run
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    report_to=REPORT_TO,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    do_eval=DO_EVAL,
    evaluation_strategy=EVALUATION_STRATEGY,
    logging_steps=LOGGING_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    # Uncomment the following lines if you have defined these variables in your config script
    #learning_rate=LR,
    # save_steps=SAVE_STEPS,
    # logging_dir=LOGGING_DIR,
    # load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
    # metric_for_best_model=METRIC_FOR_BEST_MODEL,
    # greater_is_better=GREATER_IS_BETTER,
    # lr_scheduler_type=LR_SCHEDULER_TYPE,
    # warmup_ratio=WARMUP_RATIO,
    # weight_decay=WEIGHT_DECAY,
)

# Initialize the trainer, the interface used for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=collator,
    tokenizer=tokenizer,
    # partial pins the all_labels argument so the callback takes only the predictions
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

In [12]:
%%time

# Fine-tune the model; the %%time cell magic above reports CPU and wall time for this cell
trainer.train()



Step,Training Loss
100,0.1186
200,0.0033


CPU times: user 5min 35s, sys: 1min 42s, total: 7min 17s
Wall time: 5min 8s


TrainOutput(global_step=230, training_loss=0.053812924027442935, metrics={'train_runtime': 308.2324, 'train_samples_per_second': 11.92, 'train_steps_per_second': 0.746, 'total_flos': 458247742628256.0, 'train_loss': 0.053812924027442935, 'epoch': 2.0})

In [13]:
# Save the fine-tuned model into the directory named by FINE_TUNED_NAME
trainer.save_model(FINE_TUNED_NAME)
# Save the tokenizer files into the same directory so both can be loaded together
tokenizer.save_pretrained(FINE_TUNED_NAME)

('deberta3_xsmall_pii2d_1024_mini_v1/tokenizer_config.json',
 'deberta3_xsmall_pii2d_1024_mini_v1/special_tokens_map.json',
 'deberta3_xsmall_pii2d_1024_mini_v1/spm.model',
 'deberta3_xsmall_pii2d_1024_mini_v1/added_tokens.json',
 'deberta3_xsmall_pii2d_1024_mini_v1/tokenizer.json')