### V1: Fine-Tune DistilBERT 
- Train Data

Trained on local machine with RTX 3050 x1

Leaderboard 
- Public Score: 0.85292
- Private Score: 0.87363

In [1]:
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from functools import partial
from seqeval.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

In [12]:
class Setting:
    """Single place for every tunable value used by this notebook.

    All values are plain class attributes, so they are read as
    ``Setting.<name>`` without instantiating the class.
    """

    seed = 42  # shared by the RNG seeding, the train/test split and the Trainer

    # ---- data ----
    data = './data/pii-detection-removal-from-educational-data/train.json'

    # ---- model ----
    model_checkpoint = "./model/distilbert/distilbert-base-uncased"  # base checkpoint to fine-tune
    model_train = './model/v1/train'  # Trainer output_dir
    model_final = './model/v1/final'  # destination of save_model / save_pretrained
    max_length = 512  # tokenizer max_length (inputs are truncated to it)

    # ---- hyperparameters ----
    epochs = 2
    learning_rate = 3e-5
    warmup_ratio = 0.1
    lr_scheduler_type = 'cosine'
    weight_decay = 0.01
    grad_steps = 2  # gradient_accumulation_steps
    batch_size = 8  # per_device_train_batch_size

    # ---- PII (NER) tags in BIO format ----
    labels = [
        "B-EMAIL", "B-ID_NUM", "B-NAME_STUDENT", "B-PHONE_NUM",
        "B-STREET_ADDRESS", "B-URL_PERSONAL", "B-USERNAME",
        "I-ID_NUM", "I-NAME_STUDENT", "I-PHONE_NUM",
        "I-STREET_ADDRESS", "I-URL_PERSONAL", "O",
    ]
    # integer id -> tag; ids follow the position of each tag in `labels`
    id2label = {idx: tag for idx, tag in enumerate(labels)}
    # tag -> integer id (inverse of id2label)
    label2id = {tag: idx for idx, tag in enumerate(labels)}
    # size of the tag vocabulary
    num_labels = len(labels)

In [3]:
# Seed the global NumPy and PyTorch generators with the shared notebook seed.
np.random.seed(seed=Setting.seed)
torch.manual_seed(seed=Setting.seed)

<torch._C.Generator at 0x1391b221c50>

In [4]:
# Read the training JSON referenced by Setting.data and preview the first rows.
df = pd.read_json(path_or_buf=Setting.data)
df.head(n=5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [5]:
# Keep only the columns used for training and rename the string tags to
# `pii_labels`; the tokenization step later writes its integer ids to a
# column named `labels`.
# rename() is applied first and without inplace=True: it returns a new frame and,
# by default, ignores a missing "labels" key, so this cell can be re-run safely
# (the original select-then-rename raised KeyError on a second run).
df = df.rename(columns={"labels": "pii_labels"})[['document', 'tokens', 'pii_labels']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6807 entries, 0 to 6806
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   document    6807 non-null   int64 
 1   tokens      6807 non-null   object
 2   pii_labels  6807 non-null   object
dtypes: int64(1), object(2)
memory usage: 159.7+ KB


In [6]:
# Wrap the DataFrame in a Hugging Face Dataset so it can be split and mapped.
ds = Dataset.from_pandas(df=df)
ds

Dataset({
    features: ['document', 'tokens', 'pii_labels'],
    num_rows: 6807
})

In [7]:
# Hold out 25% of the documents as the evaluation ("test") split; the split is
# seeded so it is reproducible.
# NOTE: `ds` is rebound from a Dataset to a DatasetDict here, so this cell
# cannot be re-run without re-running the previous one first.
ds = ds.train_test_split(
    seed=Setting.seed,
    test_size=0.25,
)
ds

DatasetDict({
    train: Dataset({
        features: ['document', 'tokens', 'pii_labels'],
        num_rows: 5105
    })
    test: Dataset({
        features: ['document', 'tokens', 'pii_labels'],
        num_rows: 1702
    })
})

In [8]:
def tokenize_and_align_labels(examples, tokenizer, label2id, max_length):
    """Tokenize pre-split words and align the word-level tags to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per example)
            and "pii_labels" (one list of tag strings per example).
        tokenizer: callable tokenizer; its result must expose
            ``word_ids(batch_index=...)`` and support item assignment.
        label2id: mapping from tag string to integer id.
        max_length: maximum sequence length; longer inputs are truncated.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        integer ids per example. Only the first sub-token of each word carries
        the word's tag id; special tokens and the remaining sub-tokens of a
        word are set to -100.
    """
    ignore_id = -100

    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    aligned = []
    for row, word_tags in enumerate(examples["pii_labels"]):
        # word_ids maps every sub-token to the index of its source word
        # (None for special tokens).
        word_ids = list(encoded.word_ids(batch_index=row))
        # Pair each entry with its predecessor to spot the first sub-token of a word.
        preceding = [None, *word_ids[:-1]]
        aligned.append([
            ignore_id if (current is None or current == before) else label2id[word_tags[current]]
            for before, current in zip(preceding, word_ids)
        ])

    encoded["labels"] = aligned
    return encoded


In [9]:
# Tokenizer belonging to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(Setting.model_checkpoint)

# Tokenize both splits in batches and attach the aligned integer labels.
tokenized_ds = ds.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs=dict(
        tokenizer=tokenizer,
        label2id=Setting.label2id,
        max_length=Setting.max_length,
    ),
)
tokenized_ds

Map:   0%|          | 0/5105 [00:00<?, ? examples/s]

Map:   0%|          | 0/1702 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'tokens', 'pii_labels', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5105
    })
    test: Dataset({
        features: ['document', 'tokens', 'pii_labels', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1702
    })
})

In [10]:
def compute_metrics(p, id2label):
    """Compute micro-averaged precision/recall/F1 and accuracy with seqeval.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is arg-maxed over axis 2; ``labels`` holds integer tag
            ids with -100 marking positions to ignore.
        id2label: mapping from integer id to tag string.

    Returns:
        dict with "precision", "recall" and "f1" taken from seqeval's
        "micro avg" report row, plus "accuracy" from seqeval's accuracy_score.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (-100) once and convert the remaining ids to tag
    # strings for both predictions and references.
    true_predictions, true_labels = [], []
    for pred_row, label_row in zip(predictions, labels):
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([id2label[pred] for pred, _ in kept])
        true_labels.append([id2label[gold] for _, gold in kept])

    # zero_division=0 yields the same values as the default "warn" (undefined
    # scores become 0) but without emitting a warning on every evaluation.
    report = classification_report(
        y_true=true_labels,
        y_pred=true_predictions,
        output_dict=True,
        zero_division=0,
    )
    micro_avg = report["micro avg"]
    accuracy = accuracy_score(y_true=true_labels, y_pred=true_predictions)

    return {
        "precision": micro_avg["precision"],
        "recall": micro_avg["recall"],
        "f1": micro_avg["f1-score"],
        "accuracy": accuracy
    }

In [11]:
# Collator that builds token-classification batches from the tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Base checkpoint with a token-classification head sized for the PII tag set.
model = AutoModelForTokenClassification.from_pretrained(
    Setting.model_checkpoint,
    num_labels=Setting.num_labels,
    id2label=Setting.id2label,
    label2id=Setting.label2id,
)

training_args = TrainingArguments(
    output_dir=Setting.model_train,
    seed=Setting.seed,
    # optimisation schedule
    num_train_epochs=Setting.epochs,
    per_device_train_batch_size=Setting.batch_size,
    gradient_accumulation_steps=Setting.grad_steps,
    learning_rate=Setting.learning_rate,
    lr_scheduler_type=Setting.lr_scheduler_type,
    warmup_ratio=Setting.warmup_ratio,
    weight_decay=Setting.weight_decay,
    # evaluate and checkpoint once per epoch; keep the checkpoint with the best f1
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=partial(compute_metrics, id2label=Setting.id2label),
)

trainer.train()

# Persist the trained model and its tokenizer side by side.
trainer.save_model(Setting.model_final)
tokenizer.save_pretrained(Setting.model_final)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at ./model/distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,No log,0.002535,0.535865,0.384848,0.447972,0.999566
1,0.101900,0.001925,0.561873,0.509091,0.534181,0.999668


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./model/final\\tokenizer_config.json',
 './model/final\\special_tokens_map.json',
 './model/final\\vocab.txt',
 './model/final\\added_tokens.json',
 './model/final\\tokenizer.json')

- Accuracy is high because the dataset is heavily imbalanced: the non-PII label 'O' is far more frequent than any other tag.
- Precision, recall and F1 score are still improving at the 2nd epoch, so let's train for two more epochs.

In [13]:
# Continue fine-tuning: reload tokenizer and weights saved in Setting.model_final.
tokenizer = AutoTokenizer.from_pretrained(Setting.model_final)

model = AutoModelForTokenClassification.from_pretrained(
    Setting.model_final,
    num_labels=Setting.num_labels,
    id2label=Setting.id2label,
    label2id=Setting.label2id,
)

# A new TrainingArguments/Trainer pair is built (no resume_from_checkpoint).
# NOTE(review): presumably the LR schedule, including warmup, starts over for this run - confirm.
training_args = TrainingArguments(
    output_dir=Setting.model_train,
    seed=Setting.seed,
    # optimisation schedule
    num_train_epochs=Setting.epochs,
    per_device_train_batch_size=Setting.batch_size,
    gradient_accumulation_steps=Setting.grad_steps,
    learning_rate=Setting.learning_rate,
    lr_scheduler_type=Setting.lr_scheduler_type,
    warmup_ratio=Setting.warmup_ratio,
    weight_decay=Setting.weight_decay,
    # evaluate and checkpoint once per epoch; keep the checkpoint with the best f1
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=partial(compute_metrics, id2label=Setting.id2label),
)

trainer.train()

# Overwrite Setting.model_final with the further-trained model and tokenizer.
trainer.save_model(Setting.model_final)
tokenizer.save_pretrained(Setting.model_final)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,No log,0.000948,0.790441,0.651515,0.714286,0.999751
1,0.000900,0.000902,0.766154,0.754545,0.760305,0.999775


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./model/v1/final\\tokenizer_config.json',
 './model/v1/final\\special_tokens_map.json',
 './model/v1/final\\vocab.txt',
 './model/v1/final\\added_tokens.json',
 './model/v1/final\\tokenizer.json')

- Precision, recall and F1 score are still improving, so let's try training for more epochs.

In [14]:
# Third round: reload tokenizer and weights saved in Setting.model_final.
tokenizer = AutoTokenizer.from_pretrained(Setting.model_final)

model = AutoModelForTokenClassification.from_pretrained(
    Setting.model_final,
    num_labels=Setting.num_labels,
    id2label=Setting.id2label,
    label2id=Setting.label2id,
)

# Same configuration as the earlier runs except for the epoch count, which is
# set to 5 here instead of Setting.epochs.
training_args = TrainingArguments(
    output_dir=Setting.model_train,
    seed=Setting.seed,
    # optimisation schedule
    num_train_epochs=5,
    per_device_train_batch_size=Setting.batch_size,
    gradient_accumulation_steps=Setting.grad_steps,
    learning_rate=Setting.learning_rate,
    lr_scheduler_type=Setting.lr_scheduler_type,
    warmup_ratio=Setting.warmup_ratio,
    weight_decay=Setting.weight_decay,
    # evaluate and checkpoint once per epoch; keep the checkpoint with the best f1
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=partial(compute_metrics, id2label=Setting.id2label),
)

trainer.train()

# Overwrite Setting.model_final with the further-trained model and tokenizer.
trainer.save_model(Setting.model_final)
tokenizer.save_pretrained(Setting.model_final)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,No log,0.000932,0.785235,0.709091,0.745223,0.999778
2,0.000300,0.000743,0.784703,0.839394,0.811127,0.999795
4,0.000100,0.000769,0.814241,0.79697,0.805513,0.999824


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./model/v1/final\\tokenizer_config.json',
 './model/v1/final\\special_tokens_map.json',
 './model/v1/final\\vocab.txt',
 './model/v1/final\\added_tokens.json',
 './model/v1/final\\tokenizer.json')