## Training Notebook

# Overview
    - Trained on kaggle GPU P100
# To try

https://www.kaggle.com/code/jonathankasprisin/915-deberta3base-training-test/edit ->

- Downsampling negative samples (samples without labels; they possibly still work as examples where names should not be tagged as names)
- Adding @moths external data: https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/469493
- Adding PJMathematician's external data: https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/470921
- However, I used my cleaned version instead (the punctuation was flawed in the original data set at the time of this training): https://www.kaggle.com/code/valentinwerner/fix-punctuation-tokenization-external-dataset

# Config and import

- Try: max length adjust


In [None]:
# Quietly install the seqeval and evaluate packages, which are imported further down in this notebook.
!pip install seqeval evaluate -q

In [None]:
from pathlib import Path
import os

import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features
import numpy as np

In [None]:
!pip install kaggle

# Connect to the Kaggle API so the notebook can run on a local machine.
# Replace both placeholders with the values from your kaggle.json file and
# never commit real credentials to version control.
os.environ['KAGGLE_USERNAME'] = "kaggle_username"  # username from the json file
os.environ['KAGGLE_KEY'] = "kaggle_key"  # key from the json file
# Download the entity-annotated-corpus dataset with the Kaggle CLI and unpack it
# into the current working directory.
# NOTE(review): none of the visible cells read this corpus (DATA_PATH points at
# the competition data) - confirm the download is still needed.
!kaggle datasets download -d abhinavwalia95/entity-annotated-corpus
!unzip entity-annotated-corpus.zip



In [None]:
# Paths and settings shared by the cells below.
DATA_PATH = '../input/pii-detection-removal-from-educational-data'
# Checkpoint to fine-tune (alternative local copy: /kaggle/input/deberta_v3/keras/deberta_v3_base_en/2)
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
TRAINING_MAX_LENGTH = 1024
OUTPUT_DIR = "../output"

# List every file below DATA_PATH together with its full path.
for found in (
    os.path.join(folder, name)
    for folder, _, names in os.walk(DATA_PATH)
    for name in names
):
    print(found)

# Data Selection
To try
- down sample of negative examples
- augment data

In [None]:
# Load the original competition training set from train.json.
# The context manager closes the file handle as soon as parsing is done, and the
# explicit UTF-8 encoding keeps the read independent of the platform default.
with open(DATA_PATH + "/train.json", encoding="utf-8") as train_file:
    data = json.load(train_file)

print("Training Data: ", len(data))

In [None]:
# NER tagging scheme used for the targets: B- marks the beginning of an entity,
# I- a token inside an entity, and O a token outside any entity.

# Flatten the per-document label lists and keep each distinct label once, sorted.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))

# Lookup tables in both directions between label strings and integer ids.
id2label = dict(enumerate(all_labels))
label2id = {name: idx for idx, name in id2label.items()}

# Target labels identified in the training data.
target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM',
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM',
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL',
]

print(id2label)

# Tokenize Data

In [None]:
#prep data for NER training by tokenize the text and align labels to tokens
def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one document and align its word-level labels to the tokenizer's tokens.

    The text is rebuilt from the word tokens and their trailing-whitespace flags
    while a label is recorded for every character. Each token produced by the
    tokenizer then receives the label of its first character, skipping one
    leading whitespace character when the token starts with one.

    Args:
        example (dict): Must provide "tokens", "provided_labels" and
            "trailing_whitespace" as parallel sequences.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, max_length=..., truncation=True)``;
            its result must expose ``offset_mapping`` and ``input_ids`` and be
            unpackable as a mapping.
        label2id (dict): Maps label strings (including "O") to integer ids.
        max_length (int): Maximum tokenized length passed to the tokenizer.

    Returns:
        dict: The tokenizer output extended with "labels" (one label id per
        token) and "length" (number of input ids).

    Reference: credit to https://www.kaggle.com/code/valentinwerner/915-deberta3base-training/notebook
    """
    # Rebuild the text from the word tokens, tracking one label per character.
    pieces = []
    char_labels = []
    for token, label, has_space in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        pieces.append(token)
        char_labels.extend([label] * len(token))
        if has_space:
            # The separating space never belongs to an entity.
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)

    # Offsets give each token's start/end character position in `text`.
    tokenized = tokenizer(
        text, return_offsets_mapping=True, max_length=max_length, truncation=True
    )

    last_char = len(text) - 1
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Tokens reported with a (0, 0) offset (special tokens such as CLS) are labelled "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # A token may start with whitespace; use the label of the following
        # character instead. The bounds check keeps a token that covers only the
        # final trailing space from indexing past the end of the text.
        if text[start_idx].isspace() and start_idx < last_char:
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# Build a Hugging Face Dataset with one row per document (document ids are
# stored as strings), then tokenize every row and align its labels.
ds = Dataset.from_dict(
    dict(
        full_text=[doc["full_text"] for doc in data],
        document=[str(doc["document"]) for doc in data],
        tokens=[doc["tokens"] for doc in data],
        trailing_whitespace=[doc["trailing_whitespace"] for doc in data],
        provided_labels=[doc["labels"] for doc in data],
    )
).map(
    tokenize,
    # extra keyword arguments forwarded to tokenize() for every row
    fn_kwargs=dict(
        tokenizer=tokenizer,
        label2id=label2id,
        max_length=TRAINING_MAX_LENGTH,
    ),
    num_proc=3,  # run the mapping in parallel with 3 processes
)

# TODO: check for UNK tokens due to the fast tokenizer

In [None]:
# TEMP sanity check: compare the entity labels of the first example before and
# after tokenization.

# First example of the dataset.
x = ds[0]

# Before tokenization: every word token whose label is not "O".
for pair in zip(x["tokens"], x["provided_labels"]):
    if pair[1] == "O":
        continue
    print(pair)

# Separator between the two listings.
print("*"*10)

# After tokenization: every tokenizer token whose aligned label is not "O".
for piece, label_id in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    label_name = id2label[label_id]
    if label_name == "O":
        continue
    print((piece, label_name))

# Metrics and Training

In [None]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score


def compute_metrics(p, all_labels):
    """Compute recall, precision and F5 for a NER task with seqeval.

    Args:
        p (Tuple[np.ndarray, np.ndarray]): The predictions and the label ids.
            The predictions are arg-maxed over axis 2, so they are assumed to be
            shaped (batch, sequence, num_labels) - TODO confirm against the caller.
        all_labels (List[str]): Label names, indexed by label id.

    Returns:
        Dict[str, float]: The metrics under the keys "recall", "precision" and
        "f5". F5 is the F-beta score with beta = 5, which weights recall more
        heavily than precision; it is 0.0 when precision and recall are both 0.

    Ref: https://www.kaggle.com/code/valentinwerner/915-deberta3base-training/notebook
    """
    # Note: seqeval is a framework for sequence-labeling evaluation such as NER.

    # Unpack the predictions and labels; keep the highest-scoring class per token.
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop positions whose label is the ignored index (-100) and convert the
    # remaining ids to label names.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([all_labels[pred] for pred, _ in kept])
        true_labels.append([all_labels[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R) with beta = 5.
    # Guard the denominator: when nothing is predicted correctly both precision
    # and recall are 0 and the plain formula would divide by zero.
    beta_squared = 5 * 5
    denominator = beta_squared * precision + recall
    if denominator > 0:
        f5_score = (1 + beta_squared) * recall * precision / denominator
    else:
        f5_score = 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f5': f5_score,
    }

In [None]:
# Load the pretrained checkpoint with a token-classification head sized to our label set.
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    # label <-> id dictionaries used during fine-tuning
    label2id=label2id,
    id2label=id2label,
    # one output class per unique label
    num_labels=len(all_labels),
    # the pretrained model might have been trained with a different number of labels
    ignore_mismatched_sizes=True,
)

# Collates dataset samples into padded batches; padding to a multiple of 16
# might be beneficial for the GPU architecture.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [None]:
#TODO - do we need evaluation set 

# final_ds = ds.train_test_split(test_size=0.2, seed=42) # cannot use stratify_by_column='group'
# final_ds

In [None]:
# Training configuration. Evaluation is switched off, so no validation set is specified.
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir=OUTPUT_DIR,            # directory for checkpoints and logs
    save_total_limit=1,               # keep only the most recent checkpoint
    logging_steps=100,                # log training progress every 100 steps
    report_to="none",                 # "none" prevents the wandb login
    # --- optimisation ---
    num_train_epochs=3,               # number of training epochs
    per_device_train_batch_size=4,    # batch size per GPU
    gradient_accumulation_steps=2,    # accumulate gradients over 2 batches per optimizer step
                                      # (effective batch of 8 per device) when GPU memory limits the batch size
    # --- evaluation (disabled) ---
    do_eval=False,                    # do not evaluate during training
    evaluation_strategy="no",         # when to evaluate: {no, steps, epoch}
    metric_for_best_model="f5",       # metric that would pick the best model ("accuracy", f1, ...)
    # --- options left disabled, kept for later experiments ---
    # fp16=True,                      # mixed-precision (16-bit) training: less memory, faster
    # learning_rate=2e-5,             # initial learning rate
    # save_steps=500,                 # save a checkpoint every X steps
    # eval_steps=100,                 # evaluate every X steps when the strategy is "steps"
    # logging_dir=OUTPUT_DIR+"/logs", # directory for training logs
    # load_best_model_at_end=True,    # load the best model at the end of training
    # greater_is_better=True,         # True when a higher metric is better (f1, accuracy)
    # lr_scheduler_type='cosine',
    # warmup_ratio=0.1,               # gradually increase the learning rate to stabilise early training
    # weight_decay=0.01,              # L2 regularisation against overfitting
)

# Trainer provides the training and evaluation interface.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=ds,
    # partial() fixes the all_labels argument so the callback only receives the predictions
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

In [None]:
# Optional: uncomment the %%time cell magic below to measure how long the
# training run takes (e.g. when running on CPU).
#%%time

# Fine-tune the model using the Trainer configured above.
trainer.train()

In [None]:
# Write the fine-tuned model and the tokenizer into the same directory.
final_model_dir = "deberta3base_pii2d_1024_v1"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)