In [1]:
#Kaggle notes:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#kaggle file organization
#/kaggle/input/deberta_v3/keras/deberta_v3_base_en/2
#/kaggle/input/pii-detection-removal-from-educational-data
#/kaggle/working/
#/kaggle/temp/


If not running on Kaggle, will need to install Kaggle API with !pip install kaggle

In [2]:
from pathlib import Path
import os

# set global flags
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [3]:
from pathlib import Path
import os

def download_data():
    """
    Downloads the required dataset based on the environment.
    
    If running on Kaggle, the dataset is downloaded from the input folder.
    If running locally, the dataset is downloaded from Kaggle competition.
    Reference: https://www.kaggle.com/code/jhoward/iterate-like-a-grandmaster
    """
    if IS_KAGGLE:
        path = Path('../input/pii-detection-removal-from-educational-data')
#        ! pip install -q []
    else:
        import zipfile, kaggle
        datadir_path = Path('../data/external')
        competition_data_str = 'pii-detection-removal-from-educational-data'
        path = datadir_path/Path(competition_data_str)
        if not os.path.exists(path):
            #download the dataset from kaggle given competition name
            kaggle.api.competition_download_files(competition_data_str, path=datadir_path)
            #move the zip file to the data directory
            #os.rename(f'{competition_data_str}.zip', path.with_suffix('.zip'))
            zipfile.ZipFile(f'{path}.zip').extractall(path)

    return path

In [4]:
#Config, import and download data

#Get Data and print files in path from download data
DATA_PATH = download_data()
for dirname, _, filenames in os.walk(DATA_PATH):
    for filename in filenames:
        print(os.path.join(dirname, filename))

#Inference model         
if IS_KAGGLE:       
    INFERENCE_MODEL_PATH = "/kaggle/input/deberta_v3/keras/deberta_v3_base_en/2"   
    OUTPUT_DIR = 'output'  # your output path
else:
    INFERENCE_MODEL_PATH = "microsoft/deberta-v3-base"
    OUTPUT_DIR = 'output'  # your output path
    
INFERENCE_MAX_LENGTH = 2048



..\data\external\pii-detection-removal-from-educational-data\sample_submission.csv
..\data\external\pii-detection-removal-from-educational-data\test.json
..\data\external\pii-detection-removal-from-educational-data\train.json


In [5]:
#suppress warnings for notebook remove to Trouble shoot
import warnings
#warnings.filterwarnings('ignore')

In [6]:
#imports
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np

# Load pretrained model


In [7]:
def infer_tokenize(example, tokenizer):
    # TODO why this set up?
    text = []
    token_map = []
    
    idx = 0
    
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        text.append(t)
        token_map.extend([idx]*len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
        
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    
        
    return {
        **tokenized,
        "token_map": token_map,
    }

In [8]:

#Instantiate (object what is an instance of the class) tokenizer associated with pretrained model
tokenizer = AutoTokenizer.from_pretrained(INFERENCE_MODEL_PATH)

#Instantiate model for classification task
model = AutoModelForTokenClassification.from_pretrained(INFERENCE_MODEL_PATH)

#TODO if local review symlinks and caching for warning message is letting you know that the caching system will still work,
# but it might require more disk space because it can't use symlinks



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Data Loading

In [9]:
#load json file to dictionary
data = json.load(open(DATA_PATH/'test.json'))

#Create a dataset from dictionary 
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})

#Tokenize the dataset. Apply tokenize function to each element of ds
#pass specific tokenizer to tokenize function
#num_proc is the number of processes to use. Select based on CPU cores
#output is a new dataset with tokenized text
ds = ds.map(infer_tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2)

Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

NameError: name 'INFERENCE_MAX_LENGTH' is not defined

# Set up Trainer API
- defines training loop for NER pipeline
- handles training, validation and evalutiaon 

References:
- https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/
- https://www.kaggle.com/code/valentinwerner/915-deberta3base-inference?scriptVersionId=161126788

In [None]:
#Collect samples into batches for training or evaluation.
#padding to make each batch the same length. 8 or 16 can be more efficient for GPU use
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

#TODO review args for API
#TrainingArguments is a class that contains all the attributes needed to initiate a training.
args = TrainingArguments(
    ".", 
    per_device_eval_batch_size=1, 
    report_to="none",
)

#initialize the trainer object
trainer = Trainer(
    model=model, 
    args=args, 
    data_collator=collator, 
    tokenizer=tokenizer,
)

# Predicitions and processing

In [None]:
# from  /valentinwerner/915-deberta3base-inference?scriptVersionId=161126788
predictions = trainer.predict(ds).predictions
pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis = 2).reshape(predictions.shape[0],predictions.shape[1],1)

config = json.load(open(Path(model_path) / "config.json"))
id2label = config["id2label"]
preds = predictions.argmax(-1)
preds_without_O = pred_softmax[:,:,:12].argmax(-1)
O_preds = pred_softmax[:,:,12]

threshold = 0.9
preds_final = np.where(O_preds < threshold, preds_without_O , preds)

In [None]:
# from https://www.kaggle.com/code/valentinwerner/915-deberta3base-inference?scriptVersionId=161126788

triplets = []
document, token, label, token_str = [], [], [], []
for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        if start_idx + end_idx == 0: continue

        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map): break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            triplet = (label_pred, token_id, tokens[token_id])

            if triplet not in triplets:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)

# Submission

df = pd.DataFrame({
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str
})

df["row_id"] = list(range(len(df)))
display(df.head(20))

df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)