# Setup

## Configs

In [1]:
# Upper bound on the tokenized sequence length; passed as `max_length` (with
# truncation enabled) when documents are tokenized for inference.
INFERENCE_MAX_LENGTH = 2048

# Checkpoint directory handed to `from_pretrained` for both the tokenizer and
# the model; its config.json is also read to obtain the label map.
# NOTE(review): the directory name ends in "1024" while INFERENCE_MAX_LENGTH is
# 2048 — confirm the mismatch is intentional.
model_path = "/kaggle/input/3-deberta-fine-tuned-training/deberta3base_1024"

## Imports

In [2]:
import json
import pandas as pd
import numpy as np
from itertools import chain
from pathlib import Path

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset

2024-03-29 07:57:44.259821: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 07:57:44.259925: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 07:57:44.385956: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data (pre)processing

In [3]:
# Load the test documents. The context manager closes the file handle as soon
# as parsing finishes (the bare `open(...)` previously leaked it), and the
# explicit encoding avoids depending on the platform's locale default.
with open(
    "/kaggle/input/pii-detection-removal-from-educational-data/test.json",
    encoding="utf-8",
) as test_file:
    test_data = json.load(test_file)

## Tokenization

In [4]:
def tokenize(data, tokenizer):
    """Rebuild a document's text from its word tokens and tokenize it.

    Parameters
    ----------
    data : mapping
        Must provide ``"tokens"`` (sequence of strings) and
        ``"trailing_whitespace"`` (parallel sequence of truthy/falsy flags).
    tokenizer : callable
        Called as ``tokenizer(text, return_offsets_mapping=True,
        truncation=True, max_length=INFERENCE_MAX_LENGTH)``; its result must
        be unpackable with ``**``.

    Returns
    -------
    dict
        Every entry returned by ``tokenizer`` plus ``"token_map"``: a list with
        one entry per character of the rebuilt text, holding the index of the
        word token that character came from, or ``-1`` for a space that was
        inserted because the preceding token had trailing whitespace.
    """
    pieces = []
    token_map = []

    for position, (word, has_space) in enumerate(
        zip(data["tokens"], data["trailing_whitespace"])
    ):
        pieces.append(word)
        # One map entry per character of the word, all pointing at its index.
        token_map += [position] * len(word)

        if has_space:
            pieces.append(" ")
            token_map.append(-1)

    encoded = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    return {**encoded, "token_map": token_map}

In [5]:
# Collect the per-document fields needed for inference into a Dataset,
# one row per entry of `test_data`, keeping the column order below.
ds_columns = ("full_text", "document", "tokens", "trailing_whitespace")
ds = Dataset.from_dict(
    {column: [record[column] for record in test_data] for column in ds_columns}
)

In [6]:
# Load the tokenizer stored alongside the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path,
)

In [7]:
# Apply `tokenize` to every row using two worker processes; the returned
# entries (tokenizer outputs plus "token_map") are added to `ds`.
ds = ds.map(
    function=tokenize,
    fn_kwargs=dict(tokenizer=tokenizer),
    num_proc=2,
)

   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

# Modeling (based on trained model)

## Trainer setup

In [8]:
# Load the fine-tuned token-classification model from the checkpoint directory.
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=model_path,
)

In [9]:
# Batch collator for token classification; pads sequence lengths up to a
# multiple of 16.
collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    pad_to_multiple_of=16,
)

In [10]:
# Inference-only settings: write any Trainer artefacts to the working
# directory, predict one document per device at a time, and disable
# experiment-tracker reporting.
args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=1,
    report_to="none",
)

In [11]:
# The Trainer is used here purely as a prediction harness around the loaded
# model, collator and tokenizer; no training dataset is supplied.
trainer = Trainer(model=model, args=args, data_collator=collator, tokenizer=tokenizer)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Prediction post processing

In [12]:
# Raw per-token class logits for every document in `ds`; indexed downstream as
# a 3-D array with the class scores on the last axis.
predictions = trainer.predict(ds).predictions

# Numerically stable softmax over the class axis. Subtracting the per-token
# maximum before exponentiating leaves the result mathematically unchanged but
# keeps np.exp from overflowing to inf (and the division from yielding NaN)
# when logits are large. The exponentials are computed once and reused.
shifted_logits = predictions - predictions.max(axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
pred_softmax = exp_logits / exp_logits.sum(axis=-1, keepdims=True)

In [13]:
# Read the checkpoint's config.json to recover the class-id -> label mapping.
# The context manager closes the file handle once parsing is done (the bare
# `open(...)` previously leaked it).
with open(Path(model_path) / "config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Keys are strings because JSON object keys are always strings; lookups must
# therefore use str(class_id).
id2label = config["id2label"]

In [14]:
# Class index treated as the "O" (outside / non-entity) label.
# NOTE(review): assumes index 12 is "O" and that it is the last class, so the
# slice below covers every other label — confirm against id2label.
O_LABEL_ID = 12

# Plain argmax over all classes.
preds = np.argmax(predictions, axis=-1)
# Probability assigned to the "O" class for each token.
O_preds = pred_softmax[..., O_LABEL_ID]
# Best class when the "O" column is excluded from consideration.
preds_without_O = np.argmax(pred_softmax[..., :O_LABEL_ID], axis=-1)

In [15]:
# Minimum "O" probability required to keep the plain argmax prediction.
threshold = 0.9

# Where the model is not confident enough that a token is "O", fall back to
# the best non-"O" class; otherwise keep the ordinary argmax.
low_confidence_O = O_preds < threshold
preds_final = np.where(low_confidence_O, preds_without_O, preds)

In [16]:
# Accumulators filled by the post-processing loop: one entry per emitted
# prediction in each of the four parallel lists, plus the list of
# (label, token index, token string) tuples already emitted.
triplets = []
document = []
token = []
label = []
token_str = []

In [17]:
# Map sub-word predictions back to the original word-level tokens.
#
# The accumulators are (re)initialised here so that re-running this cell
# rebuilds the results from scratch instead of appending to stale lists.
triplets = []
document, token, label, token_str = [], [], [], []

# De-duplication keys already emitted. The document id is part of the key:
# keying only on (label, token index, token string) would let a prediction in
# one document suppress an identical-looking prediction in a later document.
# A set also makes the membership test O(1) instead of scanning a list.
seen = set()

for p, token_map, offsets, tokens, doc in zip(
    preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]
):

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        # Offsets of (0, 0) cover no text (presumably special tokens) — skip.
        if start_idx + end_idx == 0:
            continue

        # The sub-word starts on an inserted space; step onto the next character.
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n" and other whitespace-only word tokens
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # Ran off the end of the document text: nothing left to label.
        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred == "O" or token_id == -1:
            continue

        key = (doc, label_pred, token_id)
        if key in seen:
            continue
        seen.add(key)

        document.append(doc)
        token.append(token_id)
        label.append(label_pred)
        token_str.append(tokens[token_id])
        triplets.append((label_pred, token_id, tokens[token_id]))

# Submission

In [None]:
# Assemble the word-level predictions into a frame and number the rows
# sequentially; `row_id` is appended as the last column.
df = pd.DataFrame(
    dict(document=document, token=token, label=label, token_str=token_str)
).assign(row_id=lambda frame: list(range(len(frame))))

# Preview the first 100 predictions.
display(df.head(100))

In [19]:
# Write only the columns included in the submission file (token_str is left
# out), without the DataFrame index.
submission_columns = ["row_id", "document", "token", "label"]
df.loc[:, submission_columns].to_csv("submission.csv", index=False)