# Import dataset

In [1]:
# source https://huggingface.co/course/chapter7/2?fw=tf
import datasets
from datasets import load_dataset

classes = ["O", "Quantity", "UnitPriceAmount", "GoodsDescription",
            "Incoterms", "GoodsOrigin", "Tolerance", "HSCode"]

dataset = load_dataset("json", data_files={'train':'data/dataset_bert_train.json', 'test':'data/dataset_bert_test.json', 'validation':'data/dataset_bert_validation.json'}, features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "tags": datasets.Sequence(datasets.features.ClassLabel(names=classes))
        }))
dataset

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-64b88a506175b741


Downloading and preparing dataset json/default to /home/azureuser/.cache/huggingface/datasets/json/default-64b88a506175b741/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 6168.09it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1370.24it/s]
                                

Dataset json downloaded and prepared to /home/azureuser/.cache/huggingface/datasets/json/default-64b88a506175b741/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 677.70it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 1162
    })
    test: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 249
    })
    validation: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 249
    })
})

## Example

In [2]:
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = classes[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)


CFR       KARACHI   SEAPORT   QTY [QUANTITY] TONS     SELF             ADHESIVE         STICKER          PAPER            SIZE [INCOTERMS] X [INCOTERMS] INCHES AT USD             [QUANTITY]      USD             PER              TON. AS PER              BENEFICIARY'S PROFORMA INVOICE NO. JSDDD12112A DATED [INCOTERMS] 
Incoterms Incoterms Incoterms O   Quantity   Quantity GoodsDescription GoodsDescription GoodsDescription GoodsDescription O    O           O O           O      O  UnitPriceAmount UnitPriceAmount UnitPriceAmount GoodsDescription O    O  GoodsDescription O             O        O       O   O           O     O           


In [3]:
print(dataset["train"][0]["tokens"])

['CFR', 'KARACHI', 'SEAPORT', 'QTY', '[QUANTITY]', 'TONS', 'SELF', 'ADHESIVE', 'STICKER', 'PAPER', 'SIZE', '[INCOTERMS]', 'X', '[INCOTERMS]', 'INCHES', 'AT', 'USD', '[QUANTITY]', 'USD', 'PER', 'TON.', 'AS', 'PER', "BENEFICIARY'S", 'PROFORMA', 'INVOICE', 'NO.', 'JSDDD12112A', 'DATED', '[INCOTERMS]']


# Load Tokenizer

In [4]:
# LOAD TOKENIZER
from transformers import PreTrainedTokenizerFast, BertTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer/tokenizer.json",
    bos_token="[S]",
    eos_token="[/S]",
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    padding_side="right",
    model_max_len=512
)

inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
print(inputs.tokens())

2022-11-14 14:42:11.774810: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-14 14:42:11.932235: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-14 14:42:11.932261: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-14 14:42:11.966317: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-14 14:42:12.737581: W tensorflow/stream_executor/platform/de

['[CLS]', 'cfr', 'karachi', 'seaport', 'qty', '[QUANTITY]', 'tons', 'self', 'adhesive', 'sticker', 'paper', 'size', '[INCOTERMS]', 'x', '[INCOTERMS]', 'inches', 'at', 'usd', '[QUANTITY]', 'usd', 'per', 'ton.', 'as', 'per', "beneficiary's", 'proforma', 'invoice', 'no.', 'jsddd12112a', 'dated', '[INCOTERMS]', '[SEP]']


In [6]:
print(inputs.word_ids())

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, None]


# Aling Tokens with Values

In [7]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [8]:
labels = dataset["train"][0]["tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[4, 4, 4, 0, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]
[-100, 4, 4, 4, 0, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, -100]


In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        is_split_into_words=True,
        max_length=300,
    )
    all_labels = examples["tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [10]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

 50%|█████     | 1/2 [00:00<00:00,  5.13ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]


## Add Padding

In [11]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf", padding='max_length', max_length=300
)

In [12]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
2022-11-14 14:42:36.343220: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-11-14 14:42:36.343256: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-14 14:42:36.343280: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (TudorMLTest1): /proc/driver/nvidia/version does not exist
2022-11-14 14:42:36.343633: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-c

<tf.Tensor: shape=(2, 300), dtype=int64, numpy=
array([[-100,    4,    4,    4,    0,    1,    1,    3,    3,    3,    3,
           0,    0,    0,    0,    0,    0,    2,    2,    2,    3,    0,
           0,    3,    0,    0,    0,    0,    0,    0,    0, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -1

In [13]:
features = tokenized_datasets['train'].features
# label_name = "label" if "label" in features[0].keys() else "labels"
# labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
features.keys()
# tokenizer.pad(tokenized_datasets['train'].features, padding=tokenizer.padding_side)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [14]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 4, 4, 4, 0, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 4, 4, 4, 4, 4, 0, 1, 2, 1, 3, 3, 3, 3, 0, 2, 2, 2, 2, 2, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [15]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

  tensor = as_tensor(value)


In [16]:
id2label = {i: label for i, label in enumerate(classes)}
label2id = {v: k for k, v in id2label.items()}

# Train

In [17]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
)

Downloading: 100%|██████████| 570/570 [00:00<00:00, 365kB/s]
Downloading: 100%|██████████| 536M/536M [00:08<00:00, 63.0MB/s] 
All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
model.summary()

Model: "tf_bert_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  6152      
                                                                 
Total params: 108,897,800
Trainable params: 108,897,800
Non-trainable params: 0
_________________________________________________________________


In [20]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 5
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [21]:
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

Epoch 1/5


  tensor = as_tensor(value)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f32b6422430>

In [29]:
model.save_weights('/proiecte/TRAF2-11215/bert/model/model', save_format='tf')

In [33]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script: 100%|██████████| 6.34k/6.34k [00:00<00:00, 3.35MB/s]


In [37]:
# EXAMPLE

labels = dataset["train"][0]["tags"]
labels = [classes[i] for i in labels]

predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'ncoterms': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'nitPriceAmount': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'oodsDescription': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 3},
 'uantity': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.8333333333333334,
 'overall_recall': 0.8333333333333334,
 'overall_f1': 0.8333333333333334,
 'overall_accuracy': 0.9666666666666667}

In [38]:
import numpy as np

all_predictions = []
all_labels = []
for batch in tf_eval_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(classes[predicted_idx])
            all_labels.append(classes[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

  tensor = as_tensor(value)
  _warn_prf(average, modifier, msg_start, len(result))


{'SCode': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3},
 'ncoterms': {'precision': 0.8032128514056225,
  'recall': 0.5633802816901409,
  'f1': 0.6622516556291391,
  'number': 355},
 'nitPriceAmount': {'precision': 0.6652542372881356,
  'recall': 0.6168958742632613,
  'f1': 0.6401630988786952,
  'number': 509},
 'olerance': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44},
 'oodsDescription': {'precision': 0.28450704225352114,
  'recall': 0.2229580573951435,
  'f1': 0.25,
  'number': 453},
 'oodsOrigin': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 8},
 'uantity': {'precision': 0.3968253968253968,
  'recall': 0.2853881278538813,
  'f1': 0.33200531208499334,
  'number': 438},
 'overall_precision': 0.5319913731128685,
 'overall_recall': 0.4088397790055249,
 'overall_f1': 0.46235551390190566,
 'overall_accuracy': 0.8422386250963975}