## Training Notebook
​
# Overview
    - Ran on kaggle GPU
    - Model fined using Mini_no overlap
# To try

# Credit
- https://www.kaggle.com/code/valentinwerner/915-deberta3base-inference

## Config and import

In [1]:
#offline installs (need to have from training notebook)
!pip install /kaggle/input/piidetect-training-mini/seqeval-1.2.2.tar.gz -q

In [2]:
from pathlib import Path
import os

import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset, features
import numpy as np
import pandas as pd

2024-03-07 18:53:26.471238: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-07 18:53:26.471347: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-07 18:53:26.640737: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
DATA_PATH = '../input/pii-detection-removal-from-educational-data'
VAL_PATH = '/kaggle/input/pii-detect-miniset-and-validation-ds/val.json'
INFERENCE_MODEL_PATH = "/kaggle/input/piidetect-training-mini/deberta3base_pii2d_1024_miniv1"
INFERENCE_MAX_LENGTH = 1024
OUTPUT_DIR = "/kaggle/working/"
VAL = True

#print files with pathname
for dirname, _, filenames in os.walk(DATA_PATH):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/pii-detection-removal-from-educational-data/sample_submission.csv
../input/pii-detection-removal-from-educational-data/train.json
../input/pii-detection-removal-from-educational-data/test.json


## Helper functions

# Tokenizer from Training

In [4]:
def infer_tokenize(example, tokenizer):
    """
    Tokenize an example for NER using the given tokenizer.

    Args:
        example (dict): A dictionary containing "tokens" and "trailing_whitespace" lists.
            - "tokens": A list of token strings.
            - "trailing_whitespace": A list of boolean values indicating whether each token has trailing whitespace.
        tokenizer: The tokenizer to use for tokenization.

    Returns:
        dict: A dictionary containing tokenized output, including offsets mapping and token map.
            - "input_ids": List of token IDs.
            - "attention_mask": List of attention mask values.
            - "offset_mapping": List of character offsets for each token.
            - "token_map": List mapping each input token to its original position in the example.
            
    Reference: https://www.kaggle.com/code/valentinwerner/893-deberta3base-Inference
    """
    #empty list to store text and tokens in respective map
    text = []
    token_map = []
    
    #keep track of tokens
    idx = 0
    
    #for the example go through tokens and whitespace
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        #add token to text
        text.append(t)
        #extend token length number of idx
        token_map.extend([idx]*len(t))
        #for whitespace add a space to text and label -1 in token map
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
    #Tokenize the text and return offset mapping with the token map    
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    
        
    return {
        **tokenized,
        "token_map": token_map,
    }

# Load Data and Model

In [5]:
#helper function for loading to dataset
def load_process_ds_helper(json_path, tokenizer,val = False):
    # Load data from JSON file
    data = json.load(open(json_path))

    if not val:
        # Convert data to Hugging Face Dataset object
        ds = Dataset.from_dict({
            "full_text": [x["full_text"] for x in data],
            "document": [str(x["document"]) for x in data],
            "tokens": [x["tokens"] for x in data],
            "trailing_whitespace": [x["trailing_whitespace"] for x in data]
        })
    #inlcude provided lables if val
    else:
        ds = Dataset.from_dict({
            "full_text": [x["full_text"] for x in data],
            "document": [str(x["document"]) for x in data],
            "tokens": [x["tokens"] for x in data],
            "trailing_whitespace": [x["trailing_whitespace"] for x in data],
            "provided_labels": [x["provided_labels"] for x in data]
        })

    # Map the tokenize function to the Dataset
    ds = ds.map(
        infer_tokenize,
        fn_kwargs={      # pass keyword args
            "tokenizer": tokenizer
        }, 
        num_proc=2   #issue with multithreading so went with 2
    )

    return ds

In [6]:
tokenizer = AutoTokenizer.from_pretrained(INFERENCE_MODEL_PATH)

model = AutoModelForTokenClassification.from_pretrained(INFERENCE_MODEL_PATH)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# Load id2label configuration from model
config = json.load(open(INFERENCE_MODEL_PATH + "/config.json"))
id2label = config["id2label"]

In [7]:
#load and proces (tokenize) data to dataset

sub_ds =load_process_ds_helper((DATA_PATH+ "/test.json"), tokenizer)
if VAL:
    val_gt_ds =load_process_ds_helper(VAL_PATH, tokenizer, True)
    val_labels = val_gt_ds["provided_labels"]
    # Create val_ds with everything except "provided_labels" ->TODO:check if it affects predicting vs training
    val_ds = val_gt_ds.remove_columns("provided_labels")

   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

   

#0:   0%|          | 0/344 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/344 [00:00<?, ?ex/s]

In [8]:
#Configure trainer
training_args = TrainingArguments(
    output_dir= OUTPUT_DIR,  # Directory to save checkpoints and logs
    fp16 =False,               #mix-precision training on 16 bit to reduce memory and speed up training
    report_to="none",        #where training report progress, "none" prevents wandb login
    per_device_eval_batch_size=1,
    do_eval = False,          #whether or not to perform eval during training
    evaluation_strategy="no",    # When to evaluate during training {no, steps or epoch}  
)

#inialize trainer for training and evaluation interface
trainer = Trainer(
    model=model, 
    args=training_args, 
    data_collator=collator, 
    tokenizer=tokenizer,
)

# Prediction

In [9]:
#prediction helper function
from scipy.special import softmax
import json
import numpy as np

def predict_helper(trainer, ds, model_path, threshold=0.9):
    # Get predictions from model
    predictions = trainer.predict(ds).predictions

    # Scale long last dimension to probabilities for interpretability
    pred_softmax = softmax(predictions, axis=-1)

    #Choose label with max probability
    preds_final = predictions.argmax(-1)

    return preds_final

In [10]:
%%time
#validation
if VAL:
    preds_final_val = predict_helper(
        trainer,
        val_ds,
        INFERENCE_MODEL_PATH, 
        threshold=0.9
)

CPU times: user 1min 28s, sys: 4.68 s, total: 1min 33s
Wall time: 1min 5s


In [11]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

#validation results
if VAL:
    possible_labels = [
        'B-NAME_STUDENT', 'B-EMAIL','B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM',
        'B-URL_PERSONAL', 'B-STREET_ADDRESS',
        'I-NAME_STUDENT', 'I-EMAIL','B-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM',
        'I-URL_PERSONAL', 'I-STREET_ADDRESS', 'O'
    ]

    
    
preds_str = []
true_labels_str = []

for p, true_labels_seq, token_map, offsets, tokens, doc in zip(preds_final_val, val_labels, val_ds["token_map"], val_ds["offset_mapping"], val_ds["tokens"], val_ds["document"]):
    preds_str_doc = []
    true_labels_str_doc = []

    # iterate through each prediction, true label and its offset
    for token_pred, true_label, (start_idx, end_idx) in zip(p, true_labels_seq, offsets):
        label_pred = id2label[str(token_pred)]  #predicted label

        if start_idx + end_idx == 0: continue   #for special token or padding token

        if token_map[start_idx] == -1:  #label is for ws so go to next
            start_idx += 1

        # ignore leading white space token "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
            
        #break if index exceeds the length of token mapping
        if start_idx >= len(token_map): break
        
        token_id = token_map[start_idx]  #token ID at start of index

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            preds_str_doc.append(label_pred)
            true_labels_str_doc.append(true_label)

    preds_str.append(preds_str_doc)
    true_labels_str.append(true_labels_str_doc)
    

# Calculate precision and recall
precision = precision_score(true_labels_str, preds_str)
recall = recall_score(true_labels_str, preds_str)

# Calculate F5 score
f5_score = (1 + 5*5) * recall * precision / (5*5*precision + recall + 1e-10)  # add a small constant to prevent division by zero

print("Validation scores: ")
print("recall: ", recall)
print('precision: ', precision)    
print('f5: ', f5_score)    


Validation scores: 
recall:  0.26373626373626374
precision:  0.03274215552523874
f5:  0.20744680849147093


In [12]:
%%time
#compitition
preds_final_sub = predict_helper(
    trainer,
    sub_ds,
    INFERENCE_MODEL_PATH,
    threshold=0.9
)

CPU times: user 1.36 s, sys: 36.2 ms, total: 1.39 s
Wall time: 1.01 s


In [13]:
print(len(sub_ds))
print(len(val_ds))

10
688


# Process preditions and submit

In [14]:
import pandas as pd

def create_submission_df(preds_final_sub, sub_ds, id2label):
    """
    Create a DataFrame of submission information.

    Parameters:
    - preds_final_sub (list): List of predictions.
    - sub_ds (Dataset): Dataset containing the token maps, offset mappings, tokens, and documents.
    - id2label (dict): Dictionary mapping label IDs to labels.

    Returns:
    - DataFrame: DataFrame containing the submission information.
    """
    # Create lists of submission information
    triplets = []
    document, token, label, token_str = [], [], [], []

    for p, token_map, offsets, tokens, doc in zip(preds_final_sub, sub_ds["token_map"], sub_ds["offset_mapping"], sub_ds["tokens"], sub_ds["document"]):
        # Iterate through each prediction and its offset
        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[str(token_pred)]  # Predicted label

            if start_idx + end_idx == 0: continue   # For special token or padding token

            if token_map[start_idx] == -1:  # Label is for whitespace so go to next
                start_idx += 1

            # Ignore leading whitespace token "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1
            
            # Break if index exceeds the length of token mapping
            if start_idx >= len(token_map): break
            
            token_id = token_map[start_idx]  # Token ID at start of index

            # Ignore "O" predictions and whitespace predictions
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                # Add triplet if not in list of triplets
                if triplet not in triplets:
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.append(triplet)

    # Create a DataFrame of submission information
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    return df

In [15]:
df = create_submission_df(preds_final_sub, sub_ds, id2label)

#assign each row a unique row id
df["row_id"] = list(range(len(df)))
display(df.head(10))


Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [16]:
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)

In [17]:
#test Metrics
def pii_metrics_score(pred_df, gt_df, beta=5):
    """
    Calculate print the Precision, Recall and Micro F-beta score for predicted PII labels. Determines which were false negatives

    Parameters:
    - pred_df (DataFrame): DataFrame containing predicted PII labels ["row_id", "document", "token", "label"].
    - gt_df (DataFrame): DataFrame containing ground truth PII labels ["row_id", "document", "token", "label"].
    - beta (float): The beta parameter for the F-beta score, controlling the trade-off between precision and recall.

    Returns:
    - results (dict): Dictionary containing the precision, recall, and F-beta score.
    """   
    # Merge the predicted and ground truth DataFrames on 'document' and 'token' columns
    df = pred_df.merge(gt_df, how='outer', on=['document', "token"], suffixes=('_pred', '_gt'))

    # Initialize a new column 'compare' with empty strings
    df['compare'] = ""

    # If the predicted label is missing or doesn't match the ground truth, it's a false negative
    df.loc[df.label_pred.isna() | (df.label_gt != df.label_pred), 'compare'] = "FN"
    
    # If the ground truth label is missing, it's a false positive
    df.loc[df.label_gt.isna(), 'compare'] = "FP"

    # If the predicted label matches the ground truth, it's a true positive
    df.loc[(df.label_pred.notna()) & (df.label_gt == df.label_pred), 'compare'] = "TP"
    
    # Count the number of false positives, false negatives, and true positives
    FP = (df['compare'] == "FP").sum()
    FN = (df['compare'] == "FN").sum()
    TP = (df['compare'] == "TP").sum()


    # Calculate the precision, recall, and F-beta score
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    fbeta_mircro_score = (1 + (beta**2)) * precision * recall / (((beta**2) * precision) + recall) if precision + recall > 0 else 0

    # Get a DataFrame of false negatives
    fn_df = df.loc[df.label_pred.isna() | (df.label_gt != df.label_pred)]
    
    # Print the precision, recall, and F-beta score
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-beta score: {fbeta_mircro_score}")
    
    return fn_df

In [18]:
print("submission sample metrics:")
pred_df = pd.read_csv('/kaggle/working/submission.csv')
gt_df= pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')
fn_df = pii_metrics_score(pred_df, gt_df, beta=5)

fn_df.head()

submission sample metrics:
Precision: 0.9629629629629629
Recall: 1.0
F-beta score: 0.9985228951255539


Unnamed: 0,row_id_pred,document,token,label_pred,row_id_gt,label_gt,compare
14,14,20,8,I-NAME_STUDENT,,,FP


In [19]:
import pandas as pd

def create_sub_format_val_df(ds):
    """
    Create a DataFrame of submission information.

    Parameters:
    - ds (Dataset): Dataset containing the token maps, offset mappings, tokens, and documents.

    Returns:
    - DataFrame: DataFrame containing the submission information.
    """
    # Create lists of submission information
    triplets = []
    document, token, label, token_str = [], [], [], []

    for p, token_map, offsets, tokens, doc in zip(ds["provided_labels"], ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
        # Iterate through each label and its offset
        for label_pred, (start_idx, end_idx) in zip(p, offsets):

            if start_idx + end_idx == 0: continue   # For special token or padding token

            if token_map[start_idx] == -1:  # Label is for whitespace so go to next
                start_idx += 1

            # Ignore leading whitespace token "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1
            
            # Break if index exceeds the length of token mapping
            if start_idx >= len(token_map): break
            
            token_id = token_map[start_idx]  # Token ID at start of index

            # Ignore "O" labels and whitespace labels
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                # Add triplet if not in list of triplets
                if triplet not in triplets:
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.append(triplet)

    # Create a DataFrame of submission information
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    return df

In [20]:
print("submission sample metrics:")
val_pred_df=create_submission_df(preds_final_val, val_ds, id2label)
gt_df= create_sub_format_val_df(val_gt_ds)


submission sample metrics:


In [21]:
val_pred_df.head()


Unnamed: 0,document,token,label,token_str
0,11144,5,B-NAME_STUDENT,Paola
1,11144,6,I-NAME_STUDENT,Garcia
2,10494,0,B-NAME_STUDENT,Ahmed
3,10494,1,I-NAME_STUDENT,Elnemr
4,8593,0,B-NAME_STUDENT,Tony


In [22]:
gt_df.head()

Unnamed: 0,document,token,label,token_str
0,11144,5,B-NAME_STUDENT,Paola
1,11144,6,I-NAME_STUDENT,Garcia
2,10494,0,I-NAME_STUDENT,Ahmed
3,8593,0,I-NAME_STUDENT,Tony
4,8593,569,B-NAME_STUDENT,I


In [23]:
fn_df = pii_metrics_score(val_pred_df, gt_df, beta=5)

fn_df.head()

Precision: 0.17647058823529413
Recall: 0.18972332015810275
F-beta score: 0.18917689859026832


Unnamed: 0,document,token,label_pred,token_str_pred,label_gt,token_str_gt,compare
0,10220,562,B-NAME_STUDENT,Paolina,,,FP
1,10220,736,B-NAME_STUDENT,Maria,,,FP
2,10220,795,B-NAME_STUDENT,Maria,,,FP
4,10418,98,B-URL_PERSONAL,http://beck-calhoun.biz/bloghome.jsp,,,FP
5,10472,0,B-NAME_STUDENT,Cardo,I-NAME_STUDENT,Cardo,FN
