### V2: Fine-Tune DistilBERT 
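- Data: competition `train.json` only (75% used for training, 25% held out for evaluation)
- Tokenizer stride: 8 (tokens of overlap between consecutive 512-token windows)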

Trained on a local machine with a single RTX 3050 GPU.

Leaderboard 
- Public Score: 0.85732
- Private Score: 0.86914

In [1]:
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from functools import partial
from seqeval.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from tqdm import tqdm

In [2]:
class Setting:
    """Central configuration for the DistilBERT PII token-classification run."""

    # Single seed reused wherever this notebook needs randomness.
    seed = 42

    # --- data ---
    data = './data/pii-detection-removal-from-educational-data/train.json'

    # --- model locations and tokenization window ---
    model_checkpoint = "./model/distilbert/distilbert-base-uncased"
    model_train = './model/v2_8/train'
    model_final = './model/v2_8/final'
    max_length = 512
    stride = 8

    # --- optimisation hyperparameters ---
    epochs = 5
    learning_rate = 3e-5
    warmup_ratio = 0.1
    lr_scheduler_type = 'cosine'
    weight_decay = 0.01
    grad_steps = 2
    batch_size = 8

    # --- PII (NER) tags in BIO format; the list position is the integer class id ---
    labels = [
        # B- tags
        "B-EMAIL",
        "B-ID_NUM",
        "B-NAME_STUDENT",
        "B-PHONE_NUM",
        "B-STREET_ADDRESS",
        "B-URL_PERSONAL",
        "B-USERNAME",
        # I- tags
        "I-ID_NUM",
        "I-NAME_STUDENT",
        "I-PHONE_NUM",
        "I-STREET_ADDRESS",
        "I-URL_PERSONAL",
        # everything that is not PII
        "O",
    ]
    # integer class id -> BIO tag
    id2label = {class_id: tag for class_id, tag in enumerate(labels)}
    # BIO tag -> integer class id
    label2id = {tag: class_id for class_id, tag in enumerate(labels)}
    # number of PII (NER) tags
    num_labels = len(labels)

In [3]:
# Seed NumPy's and PyTorch's global RNGs from the shared configuration value.
np.random.seed(Setting.seed)
torch.manual_seed(Setting.seed)

<torch._C.Generator at 0x19171e21cb0>

In [4]:
# Read the JSON file at Setting.data into a DataFrame and preview the first rows.
df = pd.read_json(Setting.data)
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [5]:
# Keep only the columns needed for training and rename the target column to
# "pii_labels". The rename runs first (it does nothing when "labels" is
# already gone) and the selection uses the new name, so the cell can be re-run
# without a KeyError and no in-place mutation is needed.
df = df.rename(columns={"labels": "pii_labels"})[["document", "tokens", "pii_labels"]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6807 entries, 0 to 6806
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   document    6807 non-null   int64 
 1   tokens      6807 non-null   object
 2   pii_labels  6807 non-null   object
dtypes: int64(1), object(2)
memory usage: 159.7+ KB


In [6]:
# Wrap the DataFrame in a `datasets.Dataset`; the trailing expression displays its summary.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['document', 'tokens', 'pii_labels'],
    num_rows: 6807
})

In [7]:
# Hold out 25% of the documents as the "test" split, which the Trainer later
# uses as its evaluation set; the split is seeded from Setting.seed.
# NOTE(review): `ds` is rebound from the single Dataset to the split result, so
# re-running this cell on its own presumably fails — confirm, and re-run the
# previous cell first if needed.
ds = ds.train_test_split(test_size=0.25, seed=Setting.seed)
ds

DatasetDict({
    train: Dataset({
        features: ['document', 'tokens', 'pii_labels'],
        num_rows: 5105
    })
    test: Dataset({
        features: ['document', 'tokens', 'pii_labels'],
        num_rows: 1702
    })
})

In [8]:
def tokenize_and_align_labels(example, tokenizer, label2id, max_length, stride):
    """Tokenize one pre-split document into windows and align its word labels.

    The document's words are tokenized with truncation to ``max_length``,
    padding to ``max_length`` and ``return_overflowing_tokens=True``, so one
    document may yield several windows. For every window a list of label ids
    is built with one entry per token position.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"pii_labels"``
            (one BIO tag per word).
        tokenizer: Callable tokenizer whose result exposes
            ``overflow_to_sample_mapping`` and ``word_ids(batch_index=...)``
            and supports item assignment.
        label2id: Mapping from BIO tag to integer class id.
        max_length: Window length passed to the tokenizer.
        stride: Stride passed to the tokenizer.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per window. Positions for which the tokenizer reports no
        word (``None``, e.g. special tokens) and every non-first token of a
        word are set to -100; the first token of a word carries that word's
        label id.
    """
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        max_length=max_length,
        stride=stride,
        padding="max_length",
        is_split_into_words=True,
        return_overflowing_tokens=True,
    )

    # All windows come from this one document, so the word-level labels are
    # looked up once instead of once per window.
    word_labels = example["pii_labels"]

    labels = []
    # Only the window index is needed; the mapping's values are not used.
    for window_idx, _ in enumerate(tokenized_inputs.overflow_to_sample_mapping):
        label_ids = []
        previous_word_idx = None
        # word_ids maps each token position of this window to its source word.
        for word_idx in tokenized_inputs.word_ids(batch_index=window_idx):
            if word_idx is None or word_idx == previous_word_idx:
                # No source word, or a continuation token of the same word.
                label_ids.append(-100)
            else:
                # First token seen for this word: it carries the word's label.
                label_ids.append(label2id[word_labels[word_idx]])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
# Tokenizer loaded from the same local checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(Setting.model_checkpoint)

# Tokenize every document of both splits with 4 worker processes, then drop the
# raw columns and the overflow mapping so only input_ids, attention_mask and
# labels remain.
tokenized_ds = ds.map(
    tokenize_and_align_labels,
    num_proc=4,
    fn_kwargs=dict(
        tokenizer=tokenizer,
        label2id=Setting.label2id,
        max_length=Setting.max_length,
        stride=Setting.stride,
    ),
).remove_columns(
    ['document', 'tokens', 'pii_labels', 'overflow_to_sample_mapping']
)
tokenized_ds

Map (num_proc=4):   0%|          | 0/5105 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1702 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5105
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1702
    })
})

In [10]:
def rearrange_tokenized_input(dataset):
    """Flatten per-document window lists into one entry per window.

    Each row of ``dataset`` holds, under "input_ids", "labels" and
    "attention_mask", a list with one element per window. The result is a dict
    with the same three keys whose lists contain every window of every row, in
    row order.
    """
    column_names = ("input_ids", "labels", "attention_mask")
    flattened = {name: [] for name in column_names}
    for row_idx in tqdm(range(len(dataset))):
        row = dataset[row_idx]
        # The number of windows is taken from "input_ids" for all columns.
        window_count = len(row["input_ids"])
        for name in column_names:
            flattened[name].extend(row[name][w] for w in range(window_count))
    return flattened

In [11]:
# Rebuild each split with one row per tokenized window (the model input format).
for split_name in ("train", "test"):
    tokenized_ds[split_name] = Dataset.from_dict(
        rearrange_tokenized_input(tokenized_ds[split_name])
    )
tokenized_ds

100%|████████████████████████████████████████████████████████████████████████████████████| 5105/5105 [00:07<00:00, 723.17it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1702/1702 [00:02<00:00, 764.97it/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9904
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 3314
    })
})

In [12]:
def compute_metrics(p, id2label):
    """Compute micro-averaged precision/recall/F1 and accuracy for the Trainer.

    Args:
        p: Pair ``(predictions, labels)``; the class id is taken as the argmax
            over axis 2 of ``predictions``.
        id2label: Mapping from integer class id to BIO tag.

    Returns:
        Dict with "precision", "recall" and "f1" taken from the "micro avg"
        entry of the classification report, plus "accuracy". Positions whose
        gold label is -100 are excluded from both sequences.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (-100 marks ignored ones).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([id2label[pred] for pred, _ in kept])
        true_labels.append([id2label[gold] for _, gold in kept])

    report = classification_report(y_true=true_labels, y_pred=true_predictions, output_dict=True)
    micro = report["micro avg"]
    token_accuracy = accuracy_score(y_true=true_labels, y_pred=true_predictions)

    metrics = {}
    metrics["precision"] = micro["precision"]
    metrics["recall"] = micro["recall"]
    metrics["f1"] = micro["f1-score"]
    metrics["accuracy"] = token_accuracy
    return metrics

In [13]:
# Collator handed to the Trainer for assembling batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Token-classification model loaded from the local checkpoint, configured with
# the PII tag set and both label maps.
model = AutoModelForTokenClassification.from_pretrained(
    Setting.model_checkpoint,
    label2id=Setting.label2id,
    id2label=Setting.id2label,
    num_labels=Setting.num_labels,
)

training_args = TrainingArguments(
    # checkpoint directory and seed
    output_dir=Setting.model_train,
    seed=Setting.seed,
    # optimisation schedule
    num_train_epochs=Setting.epochs,
    per_device_train_batch_size=Setting.batch_size,
    gradient_accumulation_steps=Setting.grad_steps,
    learning_rate=Setting.learning_rate,
    weight_decay=Setting.weight_decay,
    lr_scheduler_type=Setting.lr_scheduler_type,
    warmup_ratio=Setting.warmup_ratio,
    # evaluate and checkpoint once per epoch; the best model is the one with the highest F1
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=partial(compute_metrics, id2label=Setting.id2label),
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at ./model/distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
# Run fine-tuning with the Trainer configured above.
trainer.train()
# A bare expression in the middle of a cell is evaluated and discarded, so the
# best-checkpoint path is printed explicitly to appear in the cell output.
print(trainer.state.best_model_checkpoint)
# Write the trainer's model and the tokenizer to Setting.model_final.
trainer.save_model(Setting.model_final)
tokenizer.save_pretrained(Setting.model_final)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2088,0.001879,0.439453,0.472689,0.455466,0.99958
2,0.0012,0.00115,0.778157,0.478992,0.592978,0.999712
3,0.0006,0.000973,0.777215,0.644958,0.704937,0.999767
4,0.0004,0.000966,0.771689,0.710084,0.739606,0.999788
5,0.0002,0.000998,0.75737,0.701681,0.728462,0.999783


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./model/v2_8/final\\tokenizer_config.json',
 './model/v2_8/final\\special_tokens_map.json',
 './model/v2_8/final\\vocab.txt',
 './model/v2_8/final\\added_tokens.json',
 './model/v2_8/final\\tokenizer.json')

In [15]:
# NOTE(review): second `trainer.train()` call on the same Trainer object in this
# notebook; it appears to continue from the weights left by the previous run
# rather than from the base checkpoint — confirm this repeated training is intended.
trainer.train()
# A bare expression in the middle of a cell is evaluated and discarded, so the
# best-checkpoint path is printed explicitly to appear in the cell output.
print(trainer.state.best_model_checkpoint)
# Write the trainer's model and the tokenizer to Setting.model_final, the same
# directory the previous training cell saved to.
trainer.save_model(Setting.model_final)
tokenizer.save_pretrained(Setting.model_final)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0003,0.001199,0.636197,0.731092,0.680352,0.999683
2,0.0002,0.001093,0.740492,0.695378,0.717226,0.999773
3,0.0002,0.000902,0.833333,0.72479,0.775281,0.999815
4,0.0001,0.000955,0.831409,0.756303,0.792079,0.99982
5,0.0,0.000979,0.82783,0.737395,0.78,0.999814


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./model/v2_8/final\\tokenizer_config.json',
 './model/v2_8/final\\special_tokens_map.json',
 './model/v2_8/final\\vocab.txt',
 './model/v2_8/final\\added_tokens.json',
 './model/v2_8/final\\tokenizer.json')

In [17]:
# Show the checkpoint path the Trainer recorded as best (metric_for_best_model is "f1").
print(trainer.state.best_model_checkpoint)

./model/v2_8/train\checkpoint-2476


In [18]:
# NOTE(review): third `trainer.train()` call on the same Trainer object in this
# notebook; it appears to continue from the weights left by the previous run
# rather than from the base checkpoint — confirm this repeated training is intended.
trainer.train()
print(trainer.state.best_model_checkpoint)
# Write the trainer's model and the tokenizer to Setting.model_final, the same
# directory the earlier training cells saved to.
trainer.save_model(Setting.model_final)
tokenizer.save_pretrained(Setting.model_final)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0001,0.001064,0.8075,0.678571,0.737443,0.999781
2,0.0001,0.001008,0.748428,0.75,0.749213,0.999777
3,0.0001,0.001004,0.804651,0.726891,0.763797,0.999797
4,0.0,0.0011,0.849515,0.735294,0.788288,0.999816
5,0.0,0.001091,0.836879,0.743697,0.787542,0.99981


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


./model/v2_8/train\checkpoint-2476


('./model/v2_8/final\\tokenizer_config.json',
 './model/v2_8/final\\special_tokens_map.json',
 './model/v2_8/final\\vocab.txt',
 './model/v2_8/final\\added_tokens.json',
 './model/v2_8/final\\tokenizer.json')