# **BERT PARA CLASIFICACIÓN DE TOKENS**

In [None]:
#Montar Colab en Drive
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#!pip install transformers
#!pip install datasets
#!pip install accelerate


In [1]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification, DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline
from datasets import load_metric, Dataset

In [2]:
# Usamos un tokenizador cased que para esta tarea puede ser necesario
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [4]:
#snips_file = open('/content/drive/MyDrive/snips.train.txt', 'rb')
snips_file = open('./snips.train.txt', 'rb')
snips_rows = snips_file.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [5]:
utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # saltarse las filas sin datos
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

In [6]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f'Hay {len(unique_token_labels)} etiquetas de token únicas')

Hay 72 etiquetas de token únicas


In [7]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances,
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [8]:
snips_dataset['train'][0]

{'utterance': 'find an album called burnout: the ultimate drag race challenge',
 'label': 'SearchCreativeWork',
 'tokens': ['find',
  'an',
  'album',
  'called',
  'burnout:',
  'the',
  'ultimate',
  'drag',
  'race',
  'challenge'],
 'token_labels': [2, 2, 64, 2, 62, 57, 57, 57, 57, 57]}

In [9]:
# Es posible que las "token_labels" dados no coincidan con la tokenización de  BERT, por lo que
# esta función los asignará a la tokenización que usa BERT
# -100 es un reservado para etiquetas en las que no queremos calcular pérdidas
# tratando de predecir tokens como CLS o SEP o de palabras divididas en subtokens

def tokenize_and_align_labels(examples):
    """ 
    Devuelve versiones tokenizadas "alineadas" con BERT, 
    Pero solo para los tokens utiles 
    Tokens especiales y subtokens de una palabra = -100
    Ej Googleplex :
    Google  B-Service
    ##plex  I-Services
    ##plex = -100 Para que este subtoken sea obviado durante el entrenamiento y evaluacion
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"token_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Asigna tokens a su palabra respectiva.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Configura los tokens especiales a -100.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Solo etiqueta el primer token de una palabra sin son varios.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # CLS y SEP son etiquetados como -100
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
# asigna nuestro dataset  para ser de clasificación de tokens
tok_clf_tokenized_snips = snips_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]

Map:   0%|          | 0/2617 [00:00<?, ? examples/s]

In [11]:
tok_clf_tokenized_snips['train'][0]

{'utterance': 'find an album called burnout: the ultimate drag race challenge',
 'label': 'SearchCreativeWork',
 'tokens': ['find',
  'an',
  'album',
  'called',
  'burnout:',
  'the',
  'ultimate',
  'drag',
  'race',
  'challenge'],
 'token_labels': [2, 2, 64, 2, 62, 57, 57, 57, 57, 57],
 'input_ids': [101,
  1525,
  1126,
  1312,
  1270,
  6790,
  3554,
  131,
  1103,
  10010,
  8194,
  1886,
  4506,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 2, 2, 64, 2, 62, -100, -100, 57, 57, 57, 57, 57, -100]}

In [12]:
# Borramos columnas que no nos interesan ahora. Solo queremos los 3 nuevos : input_ids,attention_mask y labels
tok_clf_tokenized_snips['train'] = tok_clf_tokenized_snips['train'].remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

tok_clf_tokenized_snips['test'] = tok_clf_tokenized_snips['test'].remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

tok_clf_tokenized_snips

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2617
    })
})

In [13]:
tok_data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [14]:
tok_clf_model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased', num_labels=len(unique_token_labels)
)

# Set our label dictionary
tok_clf_model.config.id2label = {i: l for i, l in enumerate(unique_token_labels)}

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
tok_clf_model.config.id2label[0], tok_clf_model.config.id2label[1]

('B-timeRange', 'B-location_name')

In [16]:
import os
# os.makedirs("/content/drive/MyDrive/snips_clf/resultstokens", exist_ok=True)
os.makedirs("./snips_clf/resultstokens", exist_ok=True)

In [17]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/resultstokens",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,

    logging_steps=10,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    report_to="all",
)

# Define the trainer:

trainer = Trainer(
    model=tok_clf_model,
    args=training_args,
    train_dataset=tok_clf_tokenized_snips['train'],
    eval_dataset=tok_clf_tokenized_snips['test'],
    data_collator=tok_data_collator
)

In [18]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 4.142727375030518,
 'eval_runtime': 4.7545,
 'eval_samples_per_second': 550.428,
 'eval_steps_per_second': 17.247}

In [19]:
trainer.train()

***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 65,246,280


Epoch,Training Loss,Validation Loss
1,0.2131,0.178138
2,0.0788,0.132381


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/resultstokens/checkpoint-328
Configuration saved in ./snips_clf/resultstokens/checkpoint-328/config.json
Model weights saved in ./snips_clf/resultstokens/checkpoint-328/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/resultstokens/checkpoint-656
Configuration saved in ./snips_clf/resultstokens/checkpoint-656/config.json
Model weights saved in ./snips_clf/resultstokens/checkpoint-656/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_clf/resultstokens/checkpoint-656 (score: 0.13238124549388885).


TrainOutput(global_step=656, training_loss=0.38308826561381176, metrics={'train_runtime': 105.3456, 'train_samples_per_second': 198.717, 'train_steps_per_second': 6.227, 'total_flos': 128941016404032.0, 'train_loss': 0.38308826561381176, 'epoch': 2.0})

In [20]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.13238124549388885,
 'eval_runtime': 3.708,
 'eval_samples_per_second': 705.777,
 'eval_steps_per_second': 22.115,
 'epoch': 2.0}

In [21]:
import torch

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = pipeline("token-classification", tok_clf_model, tokenizer=tokenizer,device=device)
pipe('Please add Here We Go by Dispatch to my road trip playlist')

Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'entity': 'B-entity_name',
  'score': 0.9344614,
  'index': 3,
  'word': 'Here',
  'start': 11,
  'end': 15},
 {'entity': 'I-entity_name',
  'score': 0.9448137,
  'index': 4,
  'word': 'We',
  'start': 16,
  'end': 18},
 {'entity': 'I-entity_name',
  'score': 0.9374171,
  'index': 5,
  'word': 'Go',
  'start': 19,
  'end': 21},
 {'entity': 'B-artist',
  'score': 0.87148046,
  'index': 7,
  'word': 'Di',
  'start': 25,
  'end': 27},
 {'entity': 'I-artist',
  'score': 0.74669,
  'index': 8,
  'word': '##sp',
  'start': 27,
  'end': 29},
 {'entity': 'I-entity_name',
  'score': 0.47101295,
  'index': 9,
  'word': '##atch',
  'start': 29,
  'end': 33},
 {'entity': 'B-playlist_owner',
  'score': 0.9956552,
  'index': 11,
  'word': 'my',
  'start': 37,
  'end': 39},
 {'entity': 'B-playlist',
  'score': 0.99673647,
  'index': 12,
  'word': 'road',
  'start': 40,
  'end': 44},
 {'entity': 'I-playlist',
  'score': 0.9954615,
  'index': 13,
  'word': 'trip',
  'start': 45,
  'end': 49},
 {'enti

In [23]:
pipe = pipeline("token-classification", tok_clf_model, tokenizer=tokenizer, device=device)
pipe('Rate the doog food 5 out of 5')

[{'entity': 'B-object_name',
  'score': 0.8927987,
  'index': 2,
  'word': 'the',
  'start': 5,
  'end': 8},
 {'entity': 'I-object_name',
  'score': 0.975715,
  'index': 3,
  'word': 'do',
  'start': 9,
  'end': 11},
 {'entity': 'I-object_name',
  'score': 0.9771392,
  'index': 4,
  'word': '##og',
  'start': 11,
  'end': 13},
 {'entity': 'I-object_name',
  'score': 0.984244,
  'index': 5,
  'word': 'food',
  'start': 14,
  'end': 18},
 {'entity': 'B-rating_value',
  'score': 0.9961979,
  'index': 6,
  'word': '5',
  'start': 19,
  'end': 20},
 {'entity': 'B-best_rating',
  'score': 0.7379682,
  'index': 9,
  'word': '5',
  'start': 28,
  'end': 29}]