# **BERT PARA CLASIFICACIÓN DE TOKENS**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification, DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline
from datasets import load_metric, Dataset

In [None]:
# We use a cased tokenizer, which may be necessary for this task
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [None]:
# Read the raw SNIPS training file as a list of byte strings (one per line).
# The context manager guarantees the file handle is closed once the lines are
# loaded, instead of leaving it open for the lifetime of the kernel.
with open('/content/drive/MyDrive/snips.train.txt', 'rb') as snips_file:
    snips_rows = snips_file.readlines()

# Preview the first rows to check the layout
snips_rows[:20]

In [None]:
# Parse the raw rows into four parallel lists, one entry per utterance:
#   utterances           - the utterance as a single space-joined string
#   tokenized_utterances - the utterance as a list of tokens
#   labels_for_tokens    - the per-token label strings
#   sequence_labels      - the label of the whole utterance
# Rows containing a space are "<token> <token_label>" pairs; a non-empty row
# without a space is the sequence label and closes the current utterance.
utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    # Decode once and drop the line terminator and surrounding whitespace.
    row_text = snip_row.decode().strip()
    if not row_text:
        # Skip rows without data. Testing for emptiness (rather than a byte
        # length of 2) works for both "\n" and "\r\n" line endings.
        continue
    if ' ' not in row_text:  # we've hit a sequence label
        sequence_labels.append(row_text)
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        # Start accumulating the next utterance from scratch.
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    # Exactly two space-separated fields are expected; anything else raises
    # ValueError so malformed rows are not silently mis-parsed.
    token, token_label = row_text.split(' ')
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

In [None]:
from functools import reduce

# Build the token-label vocabulary in sorted order so the label -> id mapping is
# identical on every kernel run (iterating a set of strings is not stable across
# interpreter sessions), and collect it in a single pass over the data.
unique_token_labels = sorted({label for row in labels_for_tokens for label in row})

# Dictionary lookup instead of list.index(), which scans the list for every token.
label_to_id = {label: idx for idx, label in enumerate(unique_token_labels)}

# Replace the string labels with their integer ids.
# NOTE: this rebinds `labels_for_tokens`, so this cell is meant to run once per parse.
labels_for_tokens = [[label_to_id[label] for label in row] for row in labels_for_tokens]

print(f'Hay {len(unique_token_labels)} etiquetas de token únicas')

Hay 72 etiquetas de token únicas


In [None]:
# Assemble the parsed lists into a Hugging Face Dataset, one row per utterance.
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances,
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
# Hold out 20% for evaluation. The fixed seed makes the split (and therefore
# the reported losses) reproducible across kernel restarts.
snips_dataset = snips_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Inspect the first training example (utterance, sequence label, tokens, token label ids)
snips_dataset['train'][0]

{'utterance': 'play music year 2016 by artist michiru yamane',
 'label': 'PlayMusic',
 'tokens': ['play',
  'music',
  'year',
  '2016',
  'by',
  'artist',
  'michiru',
  'yamane'],
 'token_labels': [0, 0, 0, 31, 0, 0, 62, 34]}

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split utterances and align word-level labels to sub-tokens.

    The given ``token_labels`` are per word, but the tokenizer may split a word
    into several sub-tokens. This function produces one label per sub-token:

    * special tokens (those with no word id, e.g. CLS / SEP) get ``-100``;
    * the first sub-token of each word gets that word's label;
    * the remaining sub-tokens of the same word get ``-100``.

    ``-100`` is the reserved value for positions on which no loss should be
    computed.

    Args:
        examples: a batch with ``"tokens"`` (list of word lists) and
            ``"token_labels"`` (list of per-word label id lists).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding the aligned label ids.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(batch_idx, word_labels):
        # Walk the sub-tokens of one example, remembering the previous word id
        # so that only the first piece of each word receives the real label.
        aligned = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            starts_new_word = word is not None and word != last_word
            aligned.append(word_labels[word] if starts_new_word else -100)
            last_word = word
        return aligned

    encoded["labels"] = [
        _align(batch_idx, word_labels)
        for batch_idx, word_labels in enumerate(examples["token_labels"])
    ]
    return encoded

In [None]:
# Map our dataset into token-classification form: tokenize and align labels, processing examples in batches
tok_clf_tokenized_snips = snips_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]

Map:   0%|          | 0/2617 [00:00<?, ? examples/s]

In [None]:
# Inspect the first tokenized training example, including the aligned `labels`
tok_clf_tokenized_snips['train'][0]

{'utterance': 'play music year 2016 by artist michiru yamane',
 'label': 'PlayMusic',
 'tokens': ['play',
  'music',
  'year',
  '2016',
  'by',
  'artist',
  'michiru',
  'yamane'],
 'token_labels': [0, 0, 0, 31, 0, 0, 62, 34],
 'input_ids': [101,
  1505,
  1390,
  1214,
  1446,
  1118,
  2360,
  1940,
  4313,
  5082,
  11078,
  1399,
  1162,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 0, 0, 31, 0, 0, 62, -100, -100, 34, -100, -100, -100]}

In [None]:
# Keep only the columns the model consumes (input_ids, attention_mask, labels)
# by dropping the raw text/label columns from every split.
columns_to_drop = ['utterance', 'label', 'tokens', 'token_labels']

for split_name in ('train', 'test'):
    split = tok_clf_tokenized_snips[split_name]
    # Only drop what is still present so the cell can be re-run safely;
    # asking to remove an already-removed column raises an error.
    present = [name for name in columns_to_drop if name in split.column_names]
    if present:
        tok_clf_tokenized_snips[split_name] = split.remove_columns(present)

tok_clf_tokenized_snips

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2617
    })
})

In [None]:
# Batch collator for token classification, built around the same tokenizer used above
tok_data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Load the pretrained encoder with a token-classification head sized to our label set
tok_clf_model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased', num_labels=len(unique_token_labels)
)

# Set our label dictionary in BOTH directions. Updating only `id2label` leaves
# `label2id` with its generic placeholder keys, so the saved config would carry
# two mappings that disagree with each other.
tok_clf_model.config.id2label = {i: l for i, l in enumerate(unique_token_labels)}
tok_clf_model.config.label2id = {l: i for i, l in enumerate(unique_token_labels)}

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Sanity-check the first two entries of the id -> label mapping
tok_clf_model.config.id2label[0], tok_clf_model.config.id2label[1]

('O', 'B-city')

In [None]:
import os
# Create the training output directory on Drive; exist_ok avoids an error if it already exists
os.makedirs("/content/drive/MyDrive/snips_clf/resultstokens", exist_ok=True)

In [None]:
# Number of full passes over the training split
epochs = 2

# Training configuration for the token-classification fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir="/content/drive/MyDrive/snips_clf/resultstokens",

    # Schedule and batch sizes
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,

    # Logging
    log_level='info',
    logging_steps=10,
)

# Define the trainer: model, arguments, both dataset splits and the batch collator
trainer = Trainer(
    model=tok_clf_model,
    args=training_args,
    data_collator=tok_data_collator,
    train_dataset=tok_clf_tokenized_snips['train'],
    eval_dataset=tok_clf_tokenized_snips['test'],
)

In [None]:
# Baseline: evaluate on the test split before any fine-tuning
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 4.399007797241211,
 'eval_runtime': 118.2107,
 'eval_samples_per_second': 22.138,
 'eval_steps_per_second': 0.694}

In [None]:
# Fine-tune the model using the arguments configured above
trainer.train()

***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 65,246,280


Epoch,Training Loss,Validation Loss
1,0.1721,0.174668
2,0.1001,0.129533


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


Saving model checkpoint to /content/drive/MyDrive/snips_clf/results/checkpoint-328
Configuration saved in /content/drive/MyDrive/snips_clf/results/checkpoint-328/config.json
Model weights saved in /content/drive/MyDrive/snips_clf/results/checkpoint-328/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to /content/drive/MyDrive/snips_clf/results/checkpoint-656
Configuration saved in /content/drive/MyDrive/snips_clf/results/checkpoint-656/config.json
Model weights saved in /content/drive/MyDrive/snips_clf/results/checkpoint-656/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /content/drive/MyDrive/snips_clf/results/checkpoint-656 (score: 0.12953303754329681).


TrainOutput(global_step=656, training_loss=0.39019599075360994, metrics={'train_runtime': 3436.725, 'train_samples_per_second': 6.091, 'train_steps_per_second': 0.191, 'total_flos': 129160239536160.0, 'train_loss': 0.39019599075360994, 'epoch': 2.0})

In [None]:
# Evaluate again on the test split after training, for comparison with the baseline
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.12953303754329681,
 'eval_runtime': 124.6129,
 'eval_samples_per_second': 21.001,
 'eval_steps_per_second': 0.658,
 'epoch': 2.0}

In [None]:
# Build an inference pipeline around the fine-tuned model.
# During training only the FIRST sub-token of each word carries a label (the
# remaining pieces are masked with -100), so the model's predictions on
# continuation pieces such as "##sp" / "##atch" were never supervised.
# aggregation_strategy="first" reports one entity per word using the first
# sub-token's prediction, which matches how the labels were aligned.
pipe = pipeline(
    "token-classification",
    tok_clf_model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
)
pipe('Please add Here We Go by Dispatch to my road trip playlist')

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'entity': 'B-entity_name',
  'score': 0.90811116,
  'index': 3,
  'word': 'Here',
  'start': 11,
  'end': 15},
 {'entity': 'I-entity_name',
  'score': 0.8904896,
  'index': 4,
  'word': 'We',
  'start': 16,
  'end': 18},
 {'entity': 'I-entity_name',
  'score': 0.9222006,
  'index': 5,
  'word': 'Go',
  'start': 19,
  'end': 21},
 {'entity': 'B-artist',
  'score': 0.6795641,
  'index': 7,
  'word': 'Di',
  'start': 25,
  'end': 27},
 {'entity': 'I-artist',
  'score': 0.4951065,
  'index': 8,
  'word': '##sp',
  'start': 27,
  'end': 29},
 {'entity': 'I-entity_name',
  'score': 0.5134353,
  'index': 9,
  'word': '##atch',
  'start': 29,
  'end': 33},
 {'entity': 'B-playlist_owner',
  'score': 0.9935958,
  'index': 11,
  'word': 'my',
  'start': 37,
  'end': 39},
 {'entity': 'B-playlist',
  'score': 0.9923908,
  'index': 12,
  'word': 'road',
  'start': 40,
  'end': 44},
 {'entity': 'I-playlist',
  'score': 0.9944571,
  'index': 13,
  'word': 'trip',
  'start': 45,
  'end': 49}]

In [None]:
# Second example: reuse the `pipe` object created for the previous example
# instead of constructing an identical pipeline a second time.
pipe('Rate the doog food 5 out of 5')

[{'entity': 'B-object_name',
  'score': 0.62313217,
  'index': 2,
  'word': 'the',
  'start': 5,
  'end': 8},
 {'entity': 'I-object_name',
  'score': 0.9711656,
  'index': 3,
  'word': 'do',
  'start': 9,
  'end': 11},
 {'entity': 'I-object_name',
  'score': 0.9886669,
  'index': 4,
  'word': '##og',
  'start': 11,
  'end': 13},
 {'entity': 'I-object_name',
  'score': 0.9863257,
  'index': 5,
  'word': 'food',
  'start': 14,
  'end': 18},
 {'entity': 'B-rating_value',
  'score': 0.99513066,
  'index': 6,
  'word': '5',
  'start': 19,
  'end': 20},
 {'entity': 'B-best_rating',
  'score': 0.82789963,
  'index': 9,
  'word': '5',
  'start': 28,
  'end': 29}]