# Import dataset

In [57]:
# source https://huggingface.co/course/chapter7/2?fw=tf
import datasets
from datasets import load_dataset

classes = ["O", "Quantity", "UnitPriceAmount", "GoodsDescription",
            "Incoterms", "GoodsOrigin", "Tolerance", "HSCode"]

dataset = load_dataset("json", data_files={'train':'data/dataset_bert_train.json', 'test':'data/dataset_bert_test.json', 'validation':'data/dataset_bert_validation.json'}, features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "tags": datasets.Sequence(datasets.features.ClassLabel(names=classes))
        }))
dataset

Using custom data configuration default-96ff6abefd701e5d
Found cached dataset json (/Users/claudiufilip/.cache/huggingface/datasets/json/default-96ff6abefd701e5d/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 3/3 [00:00<00:00, 1086.14it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 1162
    })
    test: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 249
    })
    validation: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 249
    })
})

## Example

In [58]:
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = classes[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)


CFR       KARACHI   SEAPORT   QTY [QUANTITY] TONS     SELF             ADHESIVE         STICKER          PAPER            SIZE [INCOTERMS] X [INCOTERMS] INCHES AT USD             [QUANTITY]      USD             PER              TON. AS PER              BENEFICIARY'S PROFORMA INVOICE NO. JSDDD12112A DATED [INCOTERMS] 
Incoterms Incoterms Incoterms O   Quantity   Quantity GoodsDescription GoodsDescription GoodsDescription GoodsDescription O    O           O O           O      O  UnitPriceAmount UnitPriceAmount UnitPriceAmount GoodsDescription O    O  GoodsDescription O             O        O       O   O           O     O           


In [59]:
print(dataset["train"][0]["tokens"])

['CFR', 'KARACHI', 'SEAPORT', 'QTY', '[QUANTITY]', 'TONS', 'SELF', 'ADHESIVE', 'STICKER', 'PAPER', 'SIZE', '[INCOTERMS]', 'X', '[INCOTERMS]', 'INCHES', 'AT', 'USD', '[QUANTITY]', 'USD', 'PER', 'TON.', 'AS', 'PER', "BENEFICIARY'S", 'PROFORMA', 'INVOICE', 'NO.', 'JSDDD12112A', 'DATED', '[INCOTERMS]']


# Load Tokenizer

In [60]:
# LOAD TOKENIZER
from transformers import PreTrainedTokenizerFast, BertTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer/tokenizer.json",
    bos_token="[S]",
    eos_token="[/S]",
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    padding_side="right",
    model_max_len=512
)

inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
print(inputs.tokens())

['[CLS]', 'cfr', 'karachi', 'seaport', 'qty', '[QUANTITY]', 'tons', 'self', 'adhesive', 'sticker', 'paper', 'size', '[INCOTERMS]', 'x', '[INCOTERMS]', 'inches', 'at', 'usd', '[QUANTITY]', 'usd', 'per', 'ton.', 'as', 'per', "beneficiary's", 'proforma', 'invoice', 'no.', 'jsddd12112a', 'dated', '[INCOTERMS]', '[SEP]']


In [61]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 None]

# Aling Tokens with Values

In [62]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [63]:
labels = dataset["train"][0]["tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[4, 4, 4, 0, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]
[-100, 4, 4, 4, 0, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, -100]


In [64]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        is_split_into_words=True,
        max_length=300,
    )
    all_labels = examples["tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [65]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Loading cached processed dataset at /Users/claudiufilip/.cache/huggingface/datasets/json/default-96ff6abefd701e5d/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-1b7537ab5ae824c1.arrow
Loading cached processed dataset at /Users/claudiufilip/.cache/huggingface/datasets/json/default-96ff6abefd701e5d/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-4a5fafb86c9abd18.arrow
  0%|          | 0/1 [00:00<?, ?ba/s]


## Add Padding

In [66]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf", padding='max_length', max_length=300
)

In [67]:
tokenized_datasets['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [68]:
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='', vocab_size=14195, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[S]', 'eos_token': '[/S]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding='max_length', max_length=300, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='tf')

In [69]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<tf.Tensor: shape=(2, 300), dtype=int64, numpy=
array([[-100,    4,    4,    4,    0,    1,    1,    3,    3,    3,    3,
           0,    0,    0,    0,    0,    0,    2,    2,    2,    3,    0,
           0,    3,    0,    0,    0,    0,    0,    0,    0, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -1

In [70]:
features = tokenized_datasets['train'].features
# label_name = "label" if "label" in features[0].keys() else "labels"
# labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
features.keys()
# tokenizer.pad(tokenized_datasets['train'].features, padding=tokenizer.padding_side)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [71]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 4, 4, 4, 0, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 4, 4, 4, 4, 4, 0, 1, 2, 1, 3, 3, 3, 3, 0, 2, 2, 2, 2, 2, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [72]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [73]:
id2label = {i: label for i, label in enumerate(classes)}
label2id = {v: k for k, v in id2label.items()}

# Train

In [74]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
)

Downloading: 100%|██████████| 536M/536M [00:12<00:00, 41.3MB/s]
All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [76]:
model.summary()

Model: "tf_bert_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  6152      
                                                                 
Total params: 108,897,800
Trainable params: 108,897,800
Non-trainable params: 0
_________________________________________________________________
