# PII W and B Inference
REference https://www.kaggle.com/code/thedrcat/pii-data-detection-infer-with-w-b

# Config

In [1]:
INFERENCE_MODEL_PATH = '/kaggle/input/pii-wandb-training/pii002'
DATA_PATH = '../input/pii-detection-removal-from-educational-data'
VAL_PATH = '/kaggle/input/pii-wandb-prep/val.json'
INFERENCE_MAX_LENGTH = 1024
OUTPUT_DIR = "/kaggle/working/"
THRESHOLD=0.9

# Imports

In [2]:
from pathlib import Path
import os
import json
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset, features
import numpy as np
import pandas as pd

2024-04-10 17:36:31.455329: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-10 17:36:31.455458: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-10 17:36:31.711401: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Util Functions

# data Functions

In [3]:
def add_token_indices(doc_tokens):
    token_indices = list(range(len(doc_tokens)))
    return token_indices

# helpers

# From Training Helpers


In [4]:
def infer_tokenize(example, tokenizer):
    """
    Tokenize an example for NER using the given tokenizer.

    Args:
        example (dict): A dictionary containing "tokens" and "trailing_whitespace" lists.
            - "tokens": A list of token strings.
            - "trailing_whitespace": A list of boolean values indicating whether each token has trailing whitespace.
        tokenizer: The tokenizer to use for tokenization.
        label2id (dict): A dictionary mapping labels to their corresponding ids.
        max_length (int): The maximum length of the tokenized text.

    Returns:
        dict: A dictionary containing tokenized output, including offsets mapping and token map.
            - "input_ids": List of token IDs.
            - "attention_mask": List of attention mask values.
            - "offset_mapping": List of character offsets for each token.
            - "token_map": List mapping each input token to its original position in the example.
            
    Reference: https://www.kaggle.com/code/valentinwerner/893-deberta3base-Inference
    """
    #empty list to store text and tokens in respective map
    text = []
    token_map = []
    
    #keep track of tokens
    idx = 0
    
    #for the example go through tokens and whitespace
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        #add token to text
        text.append(t)
        #extend token length number of idx
        token_map.extend([idx]*len(t))
        #for whitespace add a space to text and label -1 in token map
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
    #Tokenize the text and return offset mapping with the token map    
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    length = len(tokenized.input_ids)
        
    return {
        **tokenized,
        "length": length,
        "token_map": token_map,
    }

def create_dataset(data, tokenizer, max_length):
    ds = Dataset.from_dict({
        "full_text": data.full_text.tolist(),
        "document": data.document.tolist(),
        "tokens": data.tokens.tolist(),
        "trailing_whitespace": data.trailing_whitespace.tolist(),
        "token_indices": data.token_indices.tolist(),
    })
    ds = ds.map(
        infer_tokenize,
        fn_kwargs={"tokenizer": tokenizer,
                   # "max_length": max_length #CHECK
                  }, 
        num_proc=3
    )
    return ds

# Load and predict

In [5]:
tokenizer = AutoTokenizer.from_pretrained(INFERENCE_MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(INFERENCE_MODEL_PATH)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# Load id2label configuration from model
config = json.load(open(INFERENCE_MODEL_PATH + "/config.json"))
id2label = config["id2label"]

In [6]:
import json
import pandas as pd

test_data = json.load(open(DATA_PATH + "/test.json"))
sub_df = pd.DataFrame(test_data)

sub_df['token_indices'] = sub_df['tokens'].apply(add_token_indices)
sub_ds = create_dataset(sub_df, tokenizer, INFERENCE_MAX_LENGTH)

Map (num_proc=3):   0%|          | 0/10 [00:00<?, ? examples/s]

In [7]:
from transformers import Trainer

#CHECK no training_args
trainer = Trainer(
    model=model, 
    data_collator=collator, 
    tokenizer=tokenizer,
)

preds = trainer.predict(sub_ds)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


# Post Process

In [8]:
#helper
#note need to update softmax
import numpy as np
import pandas as pd
from scipy.special import softmax
def parse_predictions(predictions, id2label, ds, threshold=0.9):
    
    pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis = 2).reshape(predictions.shape[0],predictions.shape[1],1)
    preds = predictions.argmax(-1)
    preds_without_O = pred_softmax[:,:,:12].argmax(-1)
    O_preds = pred_softmax[:,:,12]
    preds_final = np.where(O_preds < threshold, preds_without_O , preds)

    triplets = set()
    row, document, token, label, token_str = [], [], [], [], []
    for i, (p, token_map, offsets, tokens, doc, indices) in enumerate(zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"], ds["token_indices"])):

        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[str(token_pred)]

            if start_idx + end_idx == 0: continue

            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map): break

            #CHECK
            token_id = token_map[start_idx] #token ID at the start of the index
#             original_token_id = token_map[start_idx]
#             token_id = indices[original_token_id]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                if triplet not in triplets:
                    row.append(i)
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.add(triplet)

    df = pd.DataFrame({
        "eval_row": row,
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    df = df.drop_duplicates().reset_index(drop=True)

    df["row_id"] = list(range(len(df)))
    return df

In [9]:
id2label.keys()

dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'])

In [10]:
#CHECK why preds.predictions
preds_df = parse_predictions(preds.predictions, id2label, sub_ds, threshold=THRESHOLD)

#look at to see
display(preds_df.head(5))

Unnamed: 0,eval_row,document,token,label,token_str,row_id
0,0,7,60,B-ID_NUM,Dessine,0
1,1,10,356,B-NAME_STUDENT,…,1
2,2,16,5,B-ID_NUM,Gamboa,2
3,2,16,527,B-ID_NUM,’s,3
4,3,20,5,B-ID_NUM,Sindy,4


In [11]:
preds_df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)

# Test other F5 validation checks
- move helper functions to seperate script

In [12]:
import pandas as pd

def determine_metrics_preds_gt(preds_final, gt_ds, id2label, beta=5):
    """
    Create a DataFrame of submission information.

    Parameters:
    - preds_final (list): List of predictions.
    - id2label (dict): Dictionary mapping label IDs to labels.
    - gt_ds (Dataset): Dataset containing the token maps, offset mappings, tokens, gt_labels, and documents.

    Returns:
    - DataFrame: DataFrame containing the submission information.
    """
    # Create lists of submission information
    triplets = []
    document, token, p_label, gt_label, token_str, compare = [], [], [], [], [], []

    for p, gt_labels, token_map, offsets, tokens, doc in zip(preds_final, gt_ds["labels"], gt_ds["token_map"], gt_ds["offset_mapping"], gt_ds["tokens"], gt_ds["document"]):
        # Iterate through each label and its offset
        for label_pred, label_gt, (start_idx, end_idx) in zip(p, gt_labels, offsets):
            label_pred = id2label[str(label_pred)]  # Predicted label

            if start_idx + end_idx == 0: continue   # For special token or padding token

            if token_map[start_idx] == -1:  # Label is for whitespace so go to next
                start_idx += 1

            # Ignore leading whitespace token "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1
            
            # Break if index exceeds the length of token mapping
            if start_idx >= len(token_map): break
            
            token_id = token_map[start_idx]  # Token ID at start of index

            # Ignore "O" labels and whitespace labels
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])
                
                # If the ground truth label is missing, it's a false positive
                if label_gt == "O" and label_pred !="O": match = "FP"
                    
                # If the predicted label is missing or doesn't match the ground truth, it's a false negative
                elif (label_pred is None) or (label_gt != label_pred) : match = "FN"
    
                # If the ground truth label is missing, it's a false positive
                elif label_gt == "O" and label_pred !="O": match = "FP"

                # If the predicted label matches the ground truth, it's a true positive
                elif label_gt == label_pred : match = "TP"
                    
                else: match = "?"

                # Add triplet if not in list of triplets
                if triplet not in triplets:
                    document.append(doc)
                    token.append(token_id)
                    p_label.append(label_pred)
                    gt_label.append(label_gt)
                    token_str.append(tokens[token_id])
                    compare.append(match)
                    triplets.append(triplet)

    # Create a DataFrame of submission information
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "pred_label": p_label,
        "gt_label": gt_label,
        "token_str": token_str,
        "compare": compare
    })
    
    # Count the number of false positives, false negatives, and true positives
    FP = (df['compare'] == "FP").sum()
    FN = (df['compare'] == "FN").sum()
    TP = (df['compare'] == "TP").sum()
    
    # Calculate the precision, recall, and F-beta score
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    fbeta_mircro_score = (1 + (beta**2)) * precision * recall / (((beta**2) * precision) + recall) if precision + recall > 0 else 0
    
    # Print the precision, recall, and F-beta score
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-beta score: {fbeta_mircro_score}")
    
    return df

In [13]:
#test Metrics
def pii_metrics_score(pred_df, gt_df, beta=5):
    """
    Calculate print the Precision, Recall and Micro F-beta score for predicted PII labels. Determines which were false negatives

    Parameters:
    - pred_df (DataFrame): DataFrame containing predicted PII labels ["row_id", "document", "token", "label"].
    - gt_df (DataFrame): DataFrame containing ground truth PII labels ["row_id", "document", "token", "label"].
    - beta (float): The beta parameter for the F-beta score, controlling the trade-off between precision and recall.

    Returns:
    - results (dict): Dictionary containing the precision, recall, and F-beta score.
    """   
    # Merge the predicted and ground truth DataFrames on 'document' and 'token' columns
    df = pred_df.merge(gt_df, how='outer', on=['document', "token"], suffixes=('_pred', '_gt'))

    # Initialize a new column 'compare' with empty strings
    df['compare'] = ""

    # If the predicted label is missing or doesn't match the ground truth, it's a false negative
    df.loc[df.label_pred.isna() | (df.label_gt != df.label_pred), 'compare'] = "FN"
    
    # If the ground truth label is missing, it's a false positive
    df.loc[df.label_gt.isna(), 'compare'] = "FP"

    # If the predicted label matches the ground truth, it's a true positive
    df.loc[(df.label_pred.notna()) & (df.label_gt == df.label_pred), 'compare'] = "TP"
    
    # Count the number of false positives, false negatives, and true positives
    FP = (df['compare'] == "FP").sum()
    FN = (df['compare'] == "FN").sum()
    TP = (df['compare'] == "TP").sum()


    # Calculate the precision, recall, and F-beta score
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    fbeta_mircro_score = (1 + (beta**2)) * precision * recall / (((beta**2) * precision) + recall) if precision + recall > 0 else 0

    # Get a DataFrame of false negatives
    fn_df = df.loc[df.label_pred.isna() | (df.label_gt != df.label_pred)]
    
    # Print the precision, recall, and F-beta score
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-beta score: {fbeta_mircro_score}")
    
    return fn_df

In [14]:
print("submission sample metrics:")
pred_df = pd.read_csv('/kaggle/working/submission.csv')
gt_df= pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')
fn_df = pii_metrics_score(pred_df, gt_df, beta=5)

fn_df.head()

submission sample metrics:
Precision: 0.08333333333333333
Recall: 0.07142857142857142
F-beta score: 0.0718232044198895


Unnamed: 0,row_id_pred,document,token,label_pred,row_id_gt,label_gt,compare
0,,7,9,,0.0,B-NAME_STUDENT,FN
1,,7,10,,1.0,I-NAME_STUDENT,FN
2,0.0,7,60,B-ID_NUM,,,FP
3,,7,482,,2.0,B-NAME_STUDENT,FN
4,,7,483,,3.0,I-NAME_STUDENT,FN


In [19]:
val_data = json.load(open(VAL_PATH))
val_df = pd.DataFrame(val_data)

print(val_df.columns)

val_df['token_indices'] = val_df['tokens'].apply(add_token_indices)

ds = Dataset.from_pandas(val_df)
val_gt_ds = ds.map(
        infer_tokenize,
        fn_kwargs={"tokenizer": tokenizer,}, 
        num_proc=3
    )

val_ds = val_gt_ds.remove_columns('labels')

preds = trainer.predict(sub_ds)

predictions = preds.predictions
threshold =THRESHOLD
pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis = 2).reshape(predictions.shape[0],predictions.shape[1],1)
preds = predictions.argmax(-1)
preds_without_O = pred_softmax[:,:,:12].argmax(-1)
O_preds = pred_softmax[:,:,12]
preds_final = np.where(O_preds < threshold, preds_without_O , preds)

print("validation sample metrics:")

val_compare_df= determine_metrics_preds_gt(preds_final, val_gt_ds, id2label)

Index(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels',
       'token_indices'],
      dtype='object')


Map (num_proc=3):   0%|          | 0/688 [00:00<?, ? examples/s]

validation sample metrics:
Precision: 0.0005970149253731343
Recall: 0.3333333333333333
F-beta score: 0.014857142857142857


In [20]:
# Count the number of false positives, false negatives, and true positives
print("FP: ", (val_compare_df['compare'] == "FP").sum()) 
print("FN: ", (val_compare_df['compare'] == "FN").sum()) 
print("TP: ", (val_compare_df['compare'] == "TP").sum())
print("?: ", (val_compare_df['compare'] == "?").sum())

FP:  1674
FN:  2
TP:  1
?:  0


In [None]:
val_compare_df[val_compare_df['compare'] == "FP"].head()