In [2]:
from datasets import load_dataset

ds = load_dataset("thainq107/abte-restaurants")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 3602
    })
    test: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 1119
    })
})

In [4]:
ds['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}

### Tokenizer

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def token_and_align_labels(examples):
    tokenized_inputs = []
    labels = []
    for tokens, tags in zip(examples['Tokens'], examples['Tags']):
        bert_tokens = []
        bert_tags = []
        for i in range(len(tokens)):
            t = tokenizer.tokenize(tokens[i])
            bert_tokens += t
            bert_tags += [int(tags[i])]*len(t)

        bert_ids = tokenizer.convert_tokens_to_ids(bert_tokens)

        tokenized_inputs.append(bert_ids)
        labels.append(bert_tags)

    return {
        'input_ids': tokenized_inputs,
        'labels': labels
    }

In [7]:
preprocessed_ds = ds.map(token_and_align_labels, batched=True)

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [8]:
preprocessed_ds['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1'],
 'input_ids': [2021, 1996, 3095, 2001, 2061, 9202, 2000, 2149, 1012],
 'labels': [0, 0, 1, 0, 0, 0, 0, 0, 0]}

In [9]:
len(tokenizer)

30522

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

### Evaluate

In [11]:
!pip install -q seqeval==1.2.2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [12]:
id2label = {
    0: '0',
    1: 'B-Term',
    2: 'I-Term'
}
label2id = {
    '0': 0,
    'B-Term': 1,
    'I-Term': 2
}

In [13]:
import numpy as np
from seqeval.metrics import accuracy_score, f1_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [id2label[int(p)] for (p, l) in zip(prediction, label) if int(l) != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[int(l)] for (p, l) in zip(prediction, label) if int(l) != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = f1_score(true_predictions, true_labels)
    return {"F1-score": results}

### Model

In [14]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training

In [15]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='abte-restaurants-distilbert-base-uncased',
    logging_dir = 'logs',
    learning_rate = 2e-5,
    per_device_train_batch_size = 256,
    per_device_eval_batch_size = 256,
    num_train_epochs = 100,
    weight_decay = 0.01,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    logging_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = 'F1-score',
    save_total_limit = 1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds['train'],
    eval_dataset=preprocessed_ds['test'],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1-score
1,0.6806,0.571775,0.0
2,0.4023,0.320112,0.552487
3,0.2453,0.266782,0.643237
4,0.185,0.245055,0.691131
5,0.149,0.223996,0.72144
6,0.1091,0.211193,0.7585
7,0.0817,0.206261,0.778502
8,0.0658,0.209906,0.78923
9,0.0506,0.221575,0.786795
10,0.0421,0.222839,0.79683




TrainOutput(global_step=1500, training_loss=0.025967598711450896, metrics={'train_runtime': 3023.7361, 'train_samples_per_second': 119.124, 'train_steps_per_second': 0.496, 'total_flos': 8188381977768000.0, 'train_loss': 0.025967598711450896, 'epoch': 100.0})

### Inference

In [19]:
from transformers import pipeline

model_path = 'abte-restaurants-distilbert-base-uncased/checkpoint-344'
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

token_classifier = pipeline(
    task='token-classification',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy='simple',
    device=0
)

Device set to use cuda:0


In [20]:
test_sentence = 'The service at the restaurant was excellent but the food was mediocre.'
results = token_classifier(test_sentence)
results

[{'entity_group': '0',
  'score': 0.9972197,
  'word': 'the',
  'start': 0,
  'end': 3},
 {'entity_group': 'Term',
  'score': 0.9883996,
  'word': 'service',
  'start': 4,
  'end': 11},
 {'entity_group': '0',
  'score': 0.96973133,
  'word': 'at the',
  'start': 12,
  'end': 18},
 {'entity_group': 'Term',
  'score': 0.5198199,
  'word': 'restaurant',
  'start': 19,
  'end': 29},
 {'entity_group': '0',
  'score': 0.9977423,
  'word': 'was excellent but the',
  'start': 30,
  'end': 51},
 {'entity_group': 'Term',
  'score': 0.9926347,
  'word': 'food',
  'start': 52,
  'end': 56},
 {'entity_group': '0',
  'score': 0.95703,
  'word': 'was mediocre.',
  'start': 57,
  'end': 70}]