# Import dataset

In [1]:
# source https://huggingface.co/course/chapter7/2?fw=tf
import datasets
from datasets import load_dataset

classes = ["O", "Quantity", "UnitPriceAmount", "GoodsDescription",
            "Incoterms", "GoodsOrigin", "Tolerance", "HSCode"]

dataset = load_dataset("json", data_files={'train':'data/dataset_bert_train_v2.json', 'test':'data/dataset_bert_test_v2.json', 'validation':'data/dataset_bert_validation_v2.json'}, features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "tags": datasets.Sequence(datasets.features.ClassLabel(names=classes))
        }))
dataset

Using custom data configuration default-e4e5f0259bd80fba
Found cached dataset json (/home/azureuser/.cache/huggingface/datasets/json/default-e4e5f0259bd80fba/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 1162
    })
    test: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 249
    })
    validation: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 249
    })
})

## Example

In [2]:
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = classes[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)


EXW       QTY 20000.00 KGS      STEARIC          ACID             TP 800 BAGS X         25 KGS      AT PKR             190.00          PER             KG              PLUS 17 PCT SALES TAX PKR             646000.00 AS PER             BENEFICIARY'S PROFORMA INVOICE NO. NICL/CTPI/DEC/2019 DATED 10-DEC-2019 
Incoterms O   Quantity Quantity GoodsDescription GoodsDescription O  O   O    Incoterms O  Quantity O  UnitPriceAmount UnitPriceAmount UnitPriceAmount UnitPriceAmount O    O  O   O     O   UnitPriceAmount O         O  UnitPriceAmount O             O        O       O   O                  O     O           


In [3]:
print(dataset["train"][0]["tokens"])

['EXW', 'QTY', '20000.00', 'KGS', 'STEARIC', 'ACID', 'TP', '800', 'BAGS', 'X', '25', 'KGS', 'AT', 'PKR', '190.00', 'PER', 'KG', 'PLUS', '17', 'PCT', 'SALES', 'TAX', 'PKR', '646000.00', 'AS', 'PER', "BENEFICIARY'S", 'PROFORMA', 'INVOICE', 'NO.', 'NICL/CTPI/DEC/2019', 'DATED', '10-DEC-2019']


# Load Tokenizer

In [4]:
# LOAD TOKENIZER
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer/tokenizer_v2.json",
    bos_token="[S]",
    eos_token="[/S]",
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    padding_side="right",
    model_max_len=300
)

inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
print(inputs.tokens())

2022-12-16 06:48:14.214692: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-16 06:48:14.553784: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-16 06:48:14.553830: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-16 06:48:14.605237: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-16 06:48:15.926264: W tensorflow/stream_executor/platform/de

['[CLS]', 'exw', 'qty', '20000.00', 'kgs', 'stearic', 'acid', 'tp', '800', 'bags', 'x', '25', 'kgs', 'at', 'pkr', '190.00', 'per', 'kg', 'plus', '17', 'pct', 'sales', 'tax', 'pkr', '646000.00', 'as', 'per', "beneficiary's", 'proforma', 'invoice', 'no.', 'nicl/ctpi/dec/2019', 'dated', '10-dec-2019', '[SEP]']


In [5]:
print(inputs.word_ids())

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, None]


# Aling Tokens with Values

In [6]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]

    return new_labels

In [7]:
labels = dataset["train"][0]["tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[4, 0, 1, 1, 3, 3, 0, 0, 0, 4, 0, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]
[-100, 4, 0, 1, 1, 3, 3, 0, 0, 0, 4, 0, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, -100]


In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        is_split_into_words=True,
        max_length=300,
    )
    all_labels = examples["tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [9]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Loading cached processed dataset at /home/azureuser/.cache/huggingface/datasets/json/default-e4e5f0259bd80fba/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-7ab0cd779b2b9504.arrow
Loading cached processed dataset at /home/azureuser/.cache/huggingface/datasets/json/default-e4e5f0259bd80fba/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-5d190b33b06afd49.arrow
Loading cached processed dataset at /home/azureuser/.cache/huggingface/datasets/json/default-e4e5f0259bd80fba/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-29755301345a1998.arrow


## Add Padding

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf", padding='max_length', max_length=300
)

In [11]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
2022-12-16 06:48:19.416709: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-16 06:48:19.416747: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-16 06:48:19.416776: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (TudorMLTest1): /proc/driver/nvidia/version does not exist
2022-12-16 06:48:19.418311: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-c

<tf.Tensor: shape=(2, 300), dtype=int64, numpy=
array([[-100,    4,    0,    1,    1,    3,    3,    0,    0,    0,    4,
           0,    1,    0,    2,    2,    2,    2,    0,    0,    0,    0,
           0,    2,    0,    0,    2,    0,    0,    0,    0,    0,    0,
           0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -1

In [12]:
features = tokenized_datasets['train'].features
# label_name = "label" if "label" in features[0].keys() else "labels"
# labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
features.keys()
# tokenizer.pad(tokenized_datasets['train'].features, padding=tokenizer.padding_side)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [13]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 4, 0, 1, 1, 3, 3, 0, 0, 0, 4, 0, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 4, 4, 4, 4, 0, 1, 1, 3, 3, 3, 3, 0, 2, 2, 2, 2, 4, 2, 0, 0, 0, 0, 0, 0, 0, -100]


In [14]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

  tensor = as_tensor(value)


In [15]:
id2label = {i: label for i, label in enumerate(classes)}
label2id = {v: k for k, v in id2label.items()}

# Train

In [16]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
    max_length=300
)

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model.summary()

Model: "tf_bert_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  6152      
                                                                 
Total params: 108,897,800
Trainable params: 108,897,800
Non-trainable params: 0
_________________________________________________________________


In [18]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 10
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [19]:
from huggingface_hub import notebook_login

notebook_login()
# token: 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [21]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(output_dir="bert-ner-conpend-v4", tokenizer=tokenizer)

history = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

/proiecte/TRAF2-11215/bert/bert-ner-conpend-v4 is already a clone of https://huggingface.co/ClaudiuFilip1100/bert-ner-conpend-v4. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/10


  tensor = as_tensor(value)


Epoch 2/10


  tensor = as_tensor(value)


Epoch 3/10


  tensor = as_tensor(value)


Epoch 4/10


  tensor = as_tensor(value)


Epoch 5/10


  tensor = as_tensor(value)


Epoch 6/10


  tensor = as_tensor(value)


Epoch 7/10


  tensor = as_tensor(value)


Epoch 8/10


  tensor = as_tensor(value)


Epoch 9/10


  tensor = as_tensor(value)


Epoch 10/10


  tensor = as_tensor(value)




In [22]:
import evaluate

metric = evaluate.load("seqeval")

In [23]:
import numpy as np

all_predictions = []
all_labels = []
for batch in tf_eval_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(classes[predicted_idx])
            all_labels.append(classes[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

  tensor = as_tensor(value)
  _warn_prf(average, modifier, msg_start, len(result))


{'SCode': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 6},
 'ncoterms': {'precision': 0.7165354330708661,
  'recall': 0.7711864406779662,
  'f1': 0.7428571428571428,
  'number': 354},
 'nitPriceAmount': {'precision': 0.7106382978723405,
  'recall': 0.6720321931589537,
  'f1': 0.6907962771458118,
  'number': 497},
 'olerance': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 19},
 'oodsDescription': {'precision': 0.4713375796178344,
  'recall': 0.3425925925925926,
  'f1': 0.3967828418230563,
  'number': 432},
 'oodsOrigin': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 15},
 'uantity': {'precision': 0.6342857142857142,
  'recall': 0.5842105263157895,
  'f1': 0.6082191780821918,
  'number': 380},
 'overall_precision': 0.6440342781806196,
 'overall_recall': 0.5736934820904287,
 'overall_f1': 0.606832298136646,
 'overall_accuracy': 0.9046435431024743}

In [24]:
model.push_to_hub('bert-ner-conpend-v4')
tokenizer.push_to_hub('bert-ner-conpend-v4')

CommitInfo(commit_url='https://huggingface.co/ClaudiuFilip1100/bert-ner-conpend-v4/commit/6fbf2c5c701e2929a13407d5265791681a191440', commit_message='Upload tokenizer', commit_description='', oid='6fbf2c5c701e2929a13407d5265791681a191440', pr_url=None, pr_revision=None, pr_num=None)