### V19: Fine-Tune DeBERTa-V3-Small 
- Train Data
- Downsampling of the 'O' label (0.30)
- External dataset
- Evaluation metric: F-beta (β = 5)
- Cross Validation
- Max Length 1

Trained on GPU T4 x2

Inference
- Max Length 2048
- Stride 256

In [1]:
import gc
import json
import numpy as np
import os
import pandas as pd
import torch

from datasets import Dataset
from pathlib import Path
from scipy.special import softmax
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

In [2]:
class Setting:
    """Central place for the values this notebook reads at run time."""

    # seed handed to NumPy and PyTorch
    seed = 42

    # input: JSON file loaded with pandas
    data = "./data/pii-detection-removal-from-educational-data/train.json"

    # fine-tuned checkpoint directory (tokenizer, model weights and config.json are read from it)
    model_final = "./model/v9/final"

    # tokenization window settings forwarded to the tokenizer
    max_length = 2048
    stride = 256

    # tokens whose 'O' probability falls below this value are assigned a PII label
    # (a later cell overrides this default before post-processing)
    non_pii_label_threshold = 0.98

    # per-device batch size used for prediction
    batch_size = 8

In [3]:
def clean_memory():
    """Run Python garbage collection, then ask PyTorch to empty its CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()

In [4]:
# Seed NumPy's global RNG and PyTorch's default generator with the configured seed.
np.random.seed(Setting.seed)
torch.manual_seed(Setting.seed)

<torch._C.Generator at 0x21f5f571cd0>

In [5]:
# Read the JSON file and keep only its first 1000 rows, then print a schema summary.
df = pd.read_json(Setting.data).head(1000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   document             1000 non-null   int64 
 1   full_text            1000 non-null   object
 2   tokens               1000 non-null   object
 3   trailing_whitespace  1000 non-null   object
 4   labels               1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [6]:
# Only the document id and the word list are needed for inference.
keep_columns = ['document', 'tokens']
df = df[keep_columns]
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['document', 'tokens'],
    num_rows: 1000
})

In [7]:
def tokenize(example, tokenizer, max_length, stride):
    """Tokenize one pre-split document and record which sub-token starts each word.

    Args:
        example: Mapping whose "tokens" entry holds the document's words.
        tokenizer: Callable tokenizer. Its result must expose
            ``overflow_to_sample_mapping``, ``word_ids(batch_index=...)`` and
            support item assignment.
        max_length: Forwarded to the tokenizer as ``max_length``.
        stride: Forwarded to the tokenizer as ``stride``.

    Returns:
        The tokenizer result with an extra "token_maps" entry: one list per
        returned window, holding the word index at the first sub-token of each
        word and -1 everywhere else (special tokens and continuation pieces).
    """
    encoded = tokenizer(
        example["tokens"],
        truncation=True,
        max_length=max_length,
        stride=stride,
        is_split_into_words=True,
        return_overflowing_tokens=True,
    )

    def _first_subtoken_map(word_ids):
        # Pair every position with the word index of the position before it;
        # the first position has no predecessor, hence the leading None.
        preceding = [None, *word_ids[:-1]]
        return [
            current if (current is not None and current != previous) else -1
            for previous, current in zip(preceding, word_ids)
        ]

    window_count = len(encoded.overflow_to_sample_mapping)
    encoded["token_maps"] = [
        _first_subtoken_map(encoded.word_ids(batch_index=window))
        for window in range(window_count)
    ]
    return encoded

In [8]:
# Load the tokenizer saved with the fine-tuned model and tokenize every document in parallel.
tokenizer = AutoTokenizer.from_pretrained(Setting.model_final)

tokenize_kwargs = dict(
    tokenizer=tokenizer,
    max_length=Setting.max_length,
    stride=Setting.stride,
)
ds = ds.map(tokenize, fn_kwargs=tokenize_kwargs, num_proc=os.cpu_count())
ds

Map (num_proc=12):   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['document', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping', 'token_maps'],
    num_rows: 1000
})

In [9]:
# Collect garbage and empty the CUDA cache before the model is loaded in the next cell.
clean_memory()

In [10]:
# Load the fine-tuned checkpoint and wrap it in a Trainer that is used for prediction only.
model = AutoModelForTokenClassification.from_pretrained(Setting.model_final)
collator = DataCollatorForTokenClassification(tokenizer)

args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=Setting.batch_size,
    report_to="none",
)
trainer = Trainer(model=model, args=args, data_collator=collator, tokenizer=tokenizer)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [11]:
# One entry per document in every list; chunk-level results are concatenated per document.
ds_dict = {key: [] for key in ("document", "tokens", "token_maps", "input_ids", "preds")}

with torch.no_grad():
    for doc_row in ds:
        doc_preds, doc_input_ids, doc_token_maps = [], [], []

        doc_chunks = zip(doc_row["input_ids"], doc_row["attention_mask"], doc_row["token_maps"])
        for chunk_ids, chunk_mask, chunk_map in doc_chunks:
            # create a one-row dataset for each of the splits of the document
            chunk_ds = Dataset.from_dict({
                "input_ids": [chunk_ids],
                "attention_mask": [chunk_mask],
            })
            # predict for that split and append its per-position outputs
            chunk_pred = trainer.predict(chunk_ds).predictions
            doc_preds.extend(chunk_pred[0])
            doc_input_ids.extend(chunk_ids)
            doc_token_maps.extend(chunk_map)

        # finalize the document
        ds_dict["document"].append(doc_row["document"])
        ds_dict["tokens"].append(doc_row["tokens"])
        ds_dict["token_maps"].append(doc_token_maps)
        ds_dict["input_ids"].append(doc_input_ids)
        ds_dict["preds"].append(np.asarray(doc_preds))

In [12]:
# Drop the tokenized dataset and the Trainer wrapper, then reclaim memory.
# NOTE(review): `model` is still bound by the cell that created it, so this may not
# release the model's memory - confirm whether `model` should be deleted as well.
del ds, trainer
clean_memory()

In [13]:
# Override the non-PII threshold configured in Setting (0.98) for the post-processing that follows.
Setting.non_pii_label_threshold = 0.90

In [14]:
# Load the label mapping stored next to the fine-tuned weights.
# A context manager closes the file handle instead of leaving it to the garbage collector.
with open(Path(Setting.model_final) / "config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# Look up the column of the non-PII ('O') class in the config instead of hard-coding 12,
# so the post-processing stays correct if the label order of the checkpoint changes.
o_label_ids = [int(label_id) for label_id, name in id2label.items() if name == "O"]
if len(o_label_ids) != 1:
    raise ValueError(f"expected exactly one 'O' label in id2label, found {len(o_label_ids)}")
o_label_id = o_label_ids[0]

preds_final = []
for predictions in ds_dict["preds"]:
    # plain argmax over all classes (used when the 'O' probability is high enough)
    predicted_labels = predictions.argmax(-1)
    predictions_score = softmax(predictions, axis=1)
    non_pii_labels_score = predictions_score[:, o_label_id]
    # best PII class: argmax with the 'O' column excluded
    pii_only_score = predictions_score.copy()
    pii_only_score[:, o_label_id] = -np.inf
    predicted_pii_labels = pii_only_score.argmax(-1)
    # take from predicted_pii_labels if non_pii_labels_score is less than non_pii_label_threshold
    preds_final.append(
        np.where(
            non_pii_labels_score < Setting.non_pii_label_threshold,
            predicted_pii_labels,
            predicted_labels,
        )
    )

In [15]:
# Convert per-sub-token predictions into one output row per (document, word) pair.
ds = Dataset.from_dict(ds_dict)

# (document, word index) pairs already emitted. Positions of one document can map to the
# same word more than once, and only the first prediction is kept. A set keeps the
# membership test O(1); the previous list made this check quadratic in the row count.
pairs = set()
document, token, label, token_str = [], [], [], []

for pred, token_map, tokens, doc in zip(preds_final, ds["token_maps"], ds["tokens"], ds["document"]):
    # highest word index that appears in this document's token map
    token_length = max(token_map)

    for label_id, token_id in zip(pred, token_map):
        label_pred = id2label[str(label_id)]

        # token_id == -1 marks special tokens and non-first sub-tokens; 'O' is non-PII
        if token_id != -1 and label_pred != 'O':
            pair = (doc, token_id)
            if pair not in pairs:
                pairs.add(pair)
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])

        # stop once the position mapped to the highest word index has been handled
        if token_id == token_length:
            break

In [16]:
# Assemble the collected rows; row_id is a running 0..n-1 counter appended as the last column.
submission_columns = {
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str,
    "row_id": list(range(len(document))),
}
df = pd.DataFrame(submission_columns)
display(df.head(1000))

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
...,...,...,...,...,...
995,5613,0,B-NAME_STUDENT,Amit,995
996,5613,1,I-NAME_STUDENT,Sharma,996
997,5621,564,B-NAME_STUDENT,Salman,997
998,5621,565,I-NAME_STUDENT,Kumar,998


In [17]:
# Write only the four submission columns, in this order, without the index.
df.to_csv("submission.csv", columns=["row_id", "document", "token", "label"], index=False)

In [18]:
# Also save the full frame, including the token_str column, without the index.
df.to_csv("submission2.csv", index=False)

![image.png](attachment:30095b20-1253-47d4-8543-e0130d21e47f.png)