# Training Notebook
​
# Overview
    - Make sure to check input notebooks updated
    - Ran on kaggle GPU
    - deberta_xsmall_Model fined using Mini_no overlap
## To try

## Credit
- https://www.kaggle.com/code/valentinwerner/915-deberta3base-inference

## Imports

In [1]:
#offline installs (need to have from training notebook)
!pip install /kaggle/input/piidetect-training-mini/seqeval-1.2.2.tar.gz -q

In [2]:
from pathlib import Path
import os

import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset, features
import numpy as np
import pandas as pd

2024-03-08 22:59:34.272309: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 22:59:34.272454: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 22:59:34.551953: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Configuration

In [3]:
VAL = True #run validation checks

DATA_PATH = '../input/pii-detection-removal-from-educational-data'
VAL_PATH = '/kaggle/input/pii-detect-miniset-and-validation-ds/val.json'

INFERENCE_MODEL_PATH = "/kaggle/input/piidetect-training-mini/deberta3_xsmall_pii2d_1024_mini_v1"
INFERENCE_MAX_LENGTH = 1024
OUTPUT_DIR = "/kaggle/working/"


#print files with pathname
for dirname, _, filenames in os.walk(DATA_PATH):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/pii-detection-removal-from-educational-data/sample_submission.csv
../input/pii-detection-removal-from-educational-data/train.json
../input/pii-detection-removal-from-educational-data/test.json


## Helper functions

# Tokenizer from Training

In [4]:
def infer_tokenize(example, tokenizer):
    """
    Tokenize an example for NER using the given tokenizer.

    Args:
        example (dict): A dictionary containing "tokens" and "trailing_whitespace" lists.
            - "tokens": A list of token strings.
            - "trailing_whitespace": A list of boolean values indicating whether each token has trailing whitespace.
        tokenizer: The tokenizer to use for tokenization.

    Returns:
        dict: A dictionary containing tokenized output, including offsets mapping and token map.
            - "input_ids": List of token IDs.
            - "attention_mask": List of attention mask values.
            - "offset_mapping": List of character offsets for each token.
            - "token_map": List mapping each input token to its original position in the example.
            
    Reference: https://www.kaggle.com/code/valentinwerner/893-deberta3base-Inference
    """
    #empty list to store text and tokens in respective map
    text = []
    token_map = []
    
    #keep track of tokens
    idx = 0
    
    #for the example go through tokens and whitespace
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        #add token to text
        text.append(t)
        #extend token length number of idx
        token_map.extend([idx]*len(t))
        #for whitespace add a space to text and label -1 in token map
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
    #Tokenize the text and return offset mapping with the token map    
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    
        
    return {
        **tokenized,
        "token_map": token_map,
    }

# Load Data and Model

In [5]:
#helper function for loading to dataset
def load_process_ds_helper(json_path, tokenizer,val = False):
    # Load data from JSON file
    data = json.load(open(json_path))

    if not val:
        # Convert data to Hugging Face Dataset object
        ds = Dataset.from_dict({
            "full_text": [x["full_text"] for x in data],
            "document": [str(x["document"]) for x in data],
            "tokens": [x["tokens"] for x in data],
            "trailing_whitespace": [x["trailing_whitespace"] for x in data]
        })
    #inlcude provided lables if val
    else:
        ds = Dataset.from_dict({
            "full_text": [x["full_text"] for x in data],
            "document": [str(x["document"]) for x in data],
            "tokens": [x["tokens"] for x in data],
            "trailing_whitespace": [x["trailing_whitespace"] for x in data],
            "labels": [x["labels"] for x in data]
        })

    # Map the tokenize function to the Dataset
    ds = ds.map(
        infer_tokenize,
        fn_kwargs={      # pass keyword args
            "tokenizer": tokenizer
        }, 
        num_proc=2   #issue with multithreading so went with 2
    )

    return ds

In [6]:
tokenizer = AutoTokenizer.from_pretrained(INFERENCE_MODEL_PATH)

model = AutoModelForTokenClassification.from_pretrained(INFERENCE_MODEL_PATH)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# Load id2label configuration from model
config = json.load(open(INFERENCE_MODEL_PATH + "/config.json"))
id2label = config["id2label"]

In [7]:
#load and proces (tokenize) data to dataset

sub_ds =load_process_ds_helper((DATA_PATH+ "/test.json"), tokenizer)

if VAL:
    val_gt_ds =load_process_ds_helper(VAL_PATH, tokenizer, True)
    val_labels = val_gt_ds["labels"]
    # Create val_ds with everything except "provided_labels" ->TODO:check if it affects predicting vs training
    val_ds = val_gt_ds.remove_columns("labels")

   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

   

#0:   0%|          | 0/344 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/343 [00:00<?, ?ex/s]

In [8]:
#Configure trainer
training_args = TrainingArguments(
    output_dir= OUTPUT_DIR,  # Directory to save checkpoints and logs
    fp16 =False,               #mix-precision training on 16 bit to reduce memory and speed up training
    report_to="none",        #where training report progress, "none" prevents wandb login
    per_device_eval_batch_size=1 
)

#inialize trainer for training and evaluation interface
trainer = Trainer(
    model=model, 
    args=training_args, 
    data_collator=collator, 
    tokenizer=tokenizer,
)

# Prediction

In [9]:
#prediction helper function
from scipy.special import softmax
import json
import numpy as np

def predict_helper(trainer, ds, model_path, threshold=0.9):
    # Get predictions from model
    predictions = trainer.predict(ds).predictions

    # Scale long last dimension to probabilities for interpretability
    pred_softmax = softmax(predictions, axis=-1)

    #Choose label with max probability
    preds_final = predictions.argmax(-1)

    return preds_final

In [10]:
%%time
#validation
if VAL:
    preds_final_val = predict_helper(
        trainer,
        val_ds,
        INFERENCE_MODEL_PATH, 
        threshold=0.9
)

CPU times: user 43.1 s, sys: 5.21 s, total: 48.3 s
Wall time: 34.5 s


In [11]:
%%time
#compitition
preds_final_sub = predict_helper(
    trainer,
    sub_ds,
    INFERENCE_MODEL_PATH,
    threshold=0.9
)

CPU times: user 598 ms, sys: 44.6 ms, total: 643 ms
Wall time: 476 ms


In [12]:
print(len(sub_ds))
print(len(val_ds))

10
687


# Process preditions and submit

In [13]:
import pandas as pd

def create_submission_df(preds_final_sub, sub_ds, id2label):
    """
    Create a DataFrame of submission information.

    Parameters:
    - preds_final_sub (list): List of predictions.
    - sub_ds (Dataset): Dataset containing the token maps, offset mappings, tokens, and documents.
    - id2label (dict): Dictionary mapping label IDs to labels.

    Returns:
    - DataFrame: DataFrame containing the submission information.
    """
    # Create lists of submission information
    triplets = []
    document, token, label, token_str = [], [], [], []

    for p, token_map, offsets, tokens, doc in zip(preds_final_sub, sub_ds["token_map"], sub_ds["offset_mapping"], sub_ds["tokens"], sub_ds["document"]):
        # Iterate through each prediction and its offset
        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[str(token_pred)]  # Predicted label

            if start_idx + end_idx == 0: continue   # For special token or padding token

            if token_map[start_idx] == -1:  # Label is for whitespace so go to next
                start_idx += 1

            # Ignore leading whitespace token "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1
            
            # Break if index exceeds the length of token mapping
            if start_idx >= len(token_map): break
            
            token_id = token_map[start_idx]  # Token ID at start of index

            # Ignore "O" predictions and whitespace predictions
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                # Add triplet if not in list of triplets
                if triplet not in triplets:
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.append(triplet)

    # Create a DataFrame of submission information
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    return df

In [14]:
df = create_submission_df(preds_final_sub, sub_ds, id2label)

#assign each row a unique row id
df["row_id"] = list(range(len(df)))
display(df.head(10))


Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,9,I-NAME_STUDENT,Nathalie,1
2,7,10,I-NAME_STUDENT,Sylla,2
3,7,482,B-NAME_STUDENT,Nathalie,3
4,7,482,I-NAME_STUDENT,Nathalie,4
5,7,483,B-NAME_STUDENT,Sylla,5
6,7,483,I-NAME_STUDENT,Sylla,6
7,7,741,B-NAME_STUDENT,Nathalie,7
8,7,741,I-NAME_STUDENT,Nathalie,8
9,7,742,I-NAME_STUDENT,Sylla,9


In [15]:
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)

# Validation Checks

In [16]:
#test Metrics
def pii_metrics_score(pred_df, gt_df, beta=5):
    """
    Calculate print the Precision, Recall and Micro F-beta score for predicted PII labels. Determines which were false negatives

    Parameters:
    - pred_df (DataFrame): DataFrame containing predicted PII labels ["row_id", "document", "token", "label"].
    - gt_df (DataFrame): DataFrame containing ground truth PII labels ["row_id", "document", "token", "label"].
    - beta (float): The beta parameter for the F-beta score, controlling the trade-off between precision and recall.

    Returns:
    - results (dict): Dictionary containing the precision, recall, and F-beta score.
    """   
    # Merge the predicted and ground truth DataFrames on 'document' and 'token' columns
    df = pred_df.merge(gt_df, how='outer', on=['document', "token"], suffixes=('_pred', '_gt'))

    # Initialize a new column 'compare' with empty strings
    df['compare'] = ""

    # If the predicted label is missing or doesn't match the ground truth, it's a false negative
    df.loc[df.label_pred.isna() | (df.label_gt != df.label_pred), 'compare'] = "FN"
    
    # If the ground truth label is missing, it's a false positive
    df.loc[df.label_gt.isna(), 'compare'] = "FP"

    # If the predicted label matches the ground truth, it's a true positive
    df.loc[(df.label_pred.notna()) & (df.label_gt == df.label_pred), 'compare'] = "TP"
    
    # Count the number of false positives, false negatives, and true positives
    FP = (df['compare'] == "FP").sum()
    FN = (df['compare'] == "FN").sum()
    TP = (df['compare'] == "TP").sum()


    # Calculate the precision, recall, and F-beta score
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    fbeta_mircro_score = (1 + (beta**2)) * precision * recall / (((beta**2) * precision) + recall) if precision + recall > 0 else 0

    # Get a DataFrame of false negatives
    fn_df = df.loc[df.label_pred.isna() | (df.label_gt != df.label_pred)]
    
    # Print the precision, recall, and F-beta score
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-beta score: {fbeta_mircro_score}")
    
    return fn_df

In [17]:
import pandas as pd

def determine_metrics_preds_gt(preds_final, gt_ds, id2label, beta=5):
    """
    Create a DataFrame of submission information.

    Parameters:
    - preds_final (list): List of predictions.
    - id2label (dict): Dictionary mapping label IDs to labels.
    - gt_ds (Dataset): Dataset containing the token maps, offset mappings, tokens, gt_labels, and documents.

    Returns:
    - DataFrame: DataFrame containing the submission information.
    """
    # Create lists of submission information
    triplets = []
    document, token, p_label, gt_label, token_str, compare = [], [], [], [], [], []

    for p, gt_labels, token_map, offsets, tokens, doc in zip(preds_final, gt_ds["labels"], gt_ds["token_map"], gt_ds["offset_mapping"], gt_ds["tokens"], gt_ds["document"]):
        # Iterate through each label and its offset
        for label_pred, label_gt, (start_idx, end_idx) in zip(p, gt_labels, offsets):
            label_pred = id2label[str(label_pred)]  # Predicted label

            if start_idx + end_idx == 0: continue   # For special token or padding token

            if token_map[start_idx] == -1:  # Label is for whitespace so go to next
                start_idx += 1

            # Ignore leading whitespace token "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1
            
            # Break if index exceeds the length of token mapping
            if start_idx >= len(token_map): break
            
            token_id = token_map[start_idx]  # Token ID at start of index

            # Ignore "O" labels and whitespace labels
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])
                
                # If the ground truth label is missing, it's a false positive
                if label_gt == "O" and label_pred !="O": match = "FP"
                    
                # If the predicted label is missing or doesn't match the ground truth, it's a false negative
                elif (label_pred is None) or (label_gt != label_pred) : match = "FN"
    
                # If the ground truth label is missing, it's a false positive
                elif label_gt == "O" and label_pred !="O": match = "FP"

                # If the predicted label matches the ground truth, it's a true positive
                elif label_gt == label_pred : match = "TP"
                    
                else: match = "?"

                # Add triplet if not in list of triplets
                if triplet not in triplets:
                    document.append(doc)
                    token.append(token_id)
                    p_label.append(label_pred)
                    gt_label.append(label_gt)
                    token_str.append(tokens[token_id])
                    compare.append(match)
                    triplets.append(triplet)

    # Create a DataFrame of submission information
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "pred_label": p_label,
        "gt_label": gt_label,
        "token_str": token_str,
        "compare": compare
    })
    
    # Count the number of false positives, false negatives, and true positives
    FP = (df['compare'] == "FP").sum()
    FN = (df['compare'] == "FN").sum()
    TP = (df['compare'] == "TP").sum()
    
    # Calculate the precision, recall, and F-beta score
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    fbeta_mircro_score = (1 + (beta**2)) * precision * recall / (((beta**2) * precision) + recall) if precision + recall > 0 else 0
    
    # Print the precision, recall, and F-beta score
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-beta score: {fbeta_mircro_score}")
    
    return df

In [18]:
print("submission sample metrics:")
pred_df = pd.read_csv('/kaggle/working/submission.csv')
gt_df= pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')
fn_df = pii_metrics_score(pred_df, gt_df, beta=5)

fn_df.head()

submission sample metrics:
Precision: 0.8387096774193549
Recall: 0.7878787878787878
F-beta score: 0.7897196261682243


Unnamed: 0,row_id_pred,document,token,label_pred,row_id_gt,label_gt,compare
1,1,7,9,I-NAME_STUDENT,0.0,B-NAME_STUDENT,FN
4,4,7,482,I-NAME_STUDENT,2.0,B-NAME_STUDENT,FN
5,5,7,483,B-NAME_STUDENT,3.0,I-NAME_STUDENT,FN
8,8,7,741,I-NAME_STUDENT,4.0,B-NAME_STUDENT,FN
17,17,20,5,I-NAME_STUDENT,12.0,B-NAME_STUDENT,FN


In [19]:
print("validation sample metrics:")
val_compare_df= determine_metrics_preds_gt(preds_final_val, val_gt_ds, id2label)

validation sample metrics:
Precision: 0.16877637130801687
Recall: 0.3883495145631068
F-beta score: 0.36984352773826457


In [20]:
# Count the number of false positives, false negatives, and true positives
print("FP: ", (val_compare_df['compare'] == "FP").sum()) 
print("FN: ", (val_compare_df['compare'] == "FN").sum()) 
print("TP: ", (val_compare_df['compare'] == "TP").sum())
print("?: ", (val_compare_df['compare'] == "?").sum())

FP:  197
FN:  63
TP:  40
?:  0


In [21]:
val_compare_df[val_compare_df['compare'] == "FP"].head()

Unnamed: 0,document,token,pred_label,gt_label,token_str,compare
3,10494,1,I-NAME_STUDENT,O,Elnemr,FP
5,8593,1,I-NAME_STUDENT,O,Flores,FP
6,8593,547,B-NAME_STUDENT,O,Tony,FP
7,8593,548,I-NAME_STUDENT,O,Flores,FP
8,4899,7,B-NAME_STUDENT,O,Maria,FP


In [22]:
val_compare_df[val_compare_df['compare'] == "FN"].head()

Unnamed: 0,document,token,pred_label,gt_label,token_str,compare
2,10494,0,B-NAME_STUDENT,I-NAME_STUDENT,Ahmed,FN
4,8593,0,B-NAME_STUDENT,I-NAME_STUDENT,Tony,FN
12,13131,4,B-NAME_STUDENT,I-NAME_STUDENT,Andreas,FN
14,8612,6,B-NAME_STUDENT,I-NAME_STUDENT,Ana,FN
17,8229,0,B-NAME_STUDENT,I-NAME_STUDENT,Martina,FN


In [23]:
val_compare_df[val_compare_df['compare'] == "TP"].head()

Unnamed: 0,document,token,pred_label,gt_label,token_str,compare
0,11144,5,B-NAME_STUDENT,B-NAME_STUDENT,Paola,TP
1,11144,6,I-NAME_STUDENT,I-NAME_STUDENT,Garcia,TP
23,4936,5,B-NAME_STUDENT,B-NAME_STUDENT,Edjanio,TP
24,4936,5,I-NAME_STUDENT,I-NAME_STUDENT,Edjanio,TP
51,1325,14,I-NAME_STUDENT,I-NAME_STUDENT,Sjoerd,TP
