## Training Notebook
​
# Overview
    - Ran on kaggle CPU
    - Model fine-tuned using https://www.kaggle.com/code/jonathankasprisin/piidetect-training-baseline/edit
# To try
I retrained the model with new data selection and data cleaning
Doing this brought the LB score to .888 - Trained in Kaggle Notebook, no tricks or secrets.
I got .890 by adding the trick described here: https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/470978
Adding more data from PJ Mathematician (https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/470921) increased the LB score to .893

Changing the learning rate from 5e-5 to 2e-5 increased the LB score to .903

# Credit
- https://www.kaggle.com/code/valentinwerner/915-deberta3base-inference

## Config and import

In [25]:
# Submission vs. cross-validation flag.
#   True  -> predictions are made on test.json and submission.csv is written.
#   False -> predictions are made on train.json and no submission file is written.
SUBMISSION = False

In [19]:
from pathlib import Path
import os

import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset, features
import numpy as np
import pandas as pd

In [2]:
# Locations of the competition data, the fine-tuned model, and the output folder.
DATA_PATH = '../input/pii-detection-removal-from-educational-data'
INFERENCE_MODEL_PATH = "/kaggle/input/piidetect-training-baseline/deberta3base_pii2d_1024_v1"
# Maximum sequence length (in model tokens) passed to the tokenizer.
INFERENCE_MAX_LENGTH = 1024
OUTPUT_DIR = "/kaggle/working/"

# List every file found under the competition data directory, one path per line.
for folder, _subfolders, file_names in os.walk(DATA_PATH):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

../input/pii-detection-removal-from-educational-data/sample_submission.csv
../input/pii-detection-removal-from-educational-data/train.json
../input/pii-detection-removal-from-educational-data/test.json


# Tokenizer from Training

In [6]:
def infer_tokenize(example, tokenizer, max_length=None):
    """
    Rebuild the document text from its tokens and tokenize it for NER inference.

    The text is reconstructed by concatenating every token, followed by a single
    space when its "trailing_whitespace" flag is truthy. While doing so a
    character-level map back to the original token indices is recorded.

    Args:
        example (dict): A dictionary containing "tokens" and "trailing_whitespace" lists.
            - "tokens": A list of token strings.
            - "trailing_whitespace": A list of boolean values indicating whether each token has trailing whitespace.
        tokenizer: Callable tokenizer; it is invoked with the reconstructed text and
            ``return_offsets_mapping=True, truncation=True, max_length=...``.
        max_length (int, optional): Maximum sequence length passed to the tokenizer.
            Defaults to the notebook-level ``INFERENCE_MAX_LENGTH`` when omitted.

    Returns:
        dict: Everything the tokenizer returned (for a Hugging Face fast tokenizer this
        includes "input_ids", "attention_mask" and "offset_mapping"), plus:
            - "token_map": one entry per CHARACTER of the reconstructed text, holding
              the index of the original token that character belongs to, or -1 for
              the space inserted after a token.

    Reference: https://www.kaggle.com/code/valentinwerner/893-deberta3base-Inference
    """
    # Resolve the default lazily so the global is only needed when no value is given.
    if max_length is None:
        max_length = INFERENCE_MAX_LENGTH

    # Pieces of the reconstructed text and the per-character token index.
    text = []
    token_map = []

    for idx, (t, ws) in enumerate(zip(example["tokens"], example["trailing_whitespace"])):
        text.append(t)
        # Every character of this token maps back to the token's index.
        token_map.extend([idx] * len(t))
        if ws:
            text.append(" ")
            # The inserted space belongs to no token.
            token_map.append(-1)

    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )

    return {
        **tokenized,
        "token_map": token_map,
    }

# Load Data and Model

In [7]:
# Load the fine-tuned token-classification model and its tokenizer from the same folder.
model = AutoModelForTokenClassification.from_pretrained(INFERENCE_MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(INFERENCE_MODEL_PATH)

# Collator that pads each batch; padded lengths are rounded up to a multiple of 16.
collator = DataCollatorForTokenClassification(
    tokenizer,
    pad_to_multiple_of=16,
)

In [28]:
# Pick the input file: test data for a submission run, otherwise the original training data.
# TODO: make validation set "/validation.json" for the non-submission case.
data_file = "/test.json" if SUBMISSION else "/train.json"

# Use a context manager so the file handle is closed, and read explicitly as UTF-8.
with open(DATA_PATH + data_file, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Convert to a Hugging Face Dataset object (document ids are stored as strings).
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data]
})

# Tokenize every example; infer_tokenize adds the tokenizer output and "token_map".
ds = ds.map(
    infer_tokenize,
    fn_kwargs={      # keyword arguments forwarded to infer_tokenize
        "tokenizer": tokenizer
    },
    num_proc=1   # issue with multiprocessing, so a single process is used
)

  0%|          | 0/6807 [00:00<?, ?ex/s]

In [14]:
#Configure trainer
training_args = TrainingArguments(
    output_dir= OUTPUT_DIR,  # Directory to save checkpoints and logs
    #fp16 =True,               #mix-precision training on 16 bit to reduce memory and speed up training
    #learning_rate=2e-5,       # intial learning rate
    gradient_accumulation_steps=2,  #how many batches to acculumate gradient before optimization if batch size limited by GPU memory
    report_to="none",        #where training report progress, "none" prevents wandb login
    num_train_epochs=3,      # Number of training epochs
    #per_device_train_batch_size=4,  # Batch size based per GPU
    per_device_eval_batch_size=1,
    #save_steps=500,          # Save model checkpoints every X steps
    do_eval = False,          #whether or not to perform eval during training
    evaluation_strategy="no",    # When to evaluate during training {no, steps or epoch}
    #eval_steps=100,          # Evaluate every X steps if stretegy is "steps"
    #logging_dir=OUTPUT_DIR+"/logs",    # Directory to save training logs
    logging_steps=100,       # Log training progress every X steps
    #load_best_model_at_end=True,   # Load the best model at the end of training
    metric_for_best_model="f5",  # Metric to determine the best model ("accuracy", f1...)
    #greater_is_better=True,      # if higher eval metric is better. True for f1 and acc
    save_total_limit=1,      # how many checkpoints to keep at end (1 means most recent)
    #lr_scheduler_type='cosine', #
    #warmup_ratio=0.1,           #steps to gradually increase learning rate. can help stabalize training at begining
    #weight_decay=0.01,          # l2 regularization to prevent overfitting
    
)

#inialize trainer for training and evaluation interface
trainer = Trainer(
    model=model, 
    args=training_args, 
    data_collator=collator, 
    tokenizer=tokenizer,
)

# Prediction

In [15]:
# Get raw model outputs (logits) for every example in the dataset.
predictions = trainer.predict(ds).predictions

# Scale to probabilities for interpretability.
# Softmax over the last (label) axis, shifted by the per-position maximum so that
# np.exp cannot overflow on large logits; the result is mathematically unchanged.
shifted_logits = predictions - predictions.max(axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
pred_softmax = exp_logits / exp_logits.sum(axis=-1, keepdims=True)

# Load the id2label mapping saved with the model; its keys are strings ("0", "1", ...).
# A context manager is used so the file handle is closed after reading.
with open(INFERENCE_MODEL_PATH + "/config.json") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# Choose the label with the highest score at every position.
preds_final = predictions.argmax(-1)

# To try
# NOTE(review): the snippet below assumes the "O" label has id 12 - confirm against id2label.
# #split predictions of entity to outside entity
# preds = predictions.argmax(-1)
# preds_without_O = pred_softmax[:,:,:12].argmax(-1)
# O_preds = pred_softmax[:,:,12]

# #include NER label if O probability is less than threshold
# threshold = 0.9
# preds_final = np.where(O_preds < threshold, preds_without_O , preds)

# Process predictions and submit

In [20]:
# Convert per-model-token predictions into (document, original token index, label) rows.
#
# Outputs (parallel lists, one entry per kept prediction):
#   document, token, label, token_str
# `triplets` keeps the (label, token index, token string) tuples in the same order.
triplets = []
document, token, label, token_str = [], [], [], []

# De-duplication key includes the document: the same (label, token index, token string)
# can legitimately occur in different documents and must not be dropped there.
# A set also makes the membership test O(1) instead of scanning a growing list.
seen = set()

for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
    n_chars = len(token_map)

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        # Skip entries whose offsets are (0, 0); these carry no text span.
        if start_idx + end_idx == 0:
            continue

        # If the span starts on an inserted space, move to the next character.
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n": skip characters that belong to whitespace-only tokens
        while start_idx < n_chars and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # Ran past the end of the text: nothing more to read for this document.
        if start_idx >= n_chars:
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred == "O" or token_id == -1:
            continue

        key = (doc, label_pred, token_id)
        if key in seen:
            continue
        seen.add(key)

        document.append(doc)
        token.append(token_id)
        label.append(label_pred)
        token_str.append(tokens[token_id])
        triplets.append((label_pred, token_id, tokens[token_id]))

In [21]:
# Assemble the collected predictions into a table and number the rows from 0.
prediction_columns = {
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str,
}
df = pd.DataFrame(prediction_columns).assign(
    row_id=lambda frame: list(range(len(frame)))
)

# Preview the first ten predictions.
display(df.head(10))


Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [22]:
# Write the submission file only for submission runs; token_str is not part of it.
if SUBMISSION:
    submission_columns = ["row_id", "document", "token", "label"]
    submission = df[submission_columns]
    submission.to_csv("submission.csv", index=False)