In [1]:
!pip install transformers
!wget https://groups.csail.mit.edu/sls/downloads/movie/trivia10k13test.bio
!wget https://groups.csail.mit.edu/sls/downloads/movie/trivia10k13train.bio

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.7 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [2]:
from pathlib import Path
import re
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, TrainingArguments, Trainer
import torch

In [3]:
def read_bio(path, encoding='utf-8'):
  """Parse a tab-separated BIO file into parallel token and label sequences.

  Every non-blank line must hold ``label<TAB>token``. Examples are separated by
  one or more blank lines; a line containing only whitespace counts as blank.

  Args:
    path: Location of the ``.bio`` file (string or ``Path``).
    encoding: Text encoding used to read the file.

  Returns:
    A ``(texts, labels)`` tuple. ``texts[i]`` is the list of tokens of example
    ``i`` and ``labels[i]`` the list of labels at the same positions. Both
    lists are empty when the file contains no examples.

  Raises:
    ValueError: If a non-blank line does not consist of exactly two
      tab-separated fields.
  """
  raw_text = Path(path).read_text(encoding=encoding).strip()
  texts = []
  labels = []
  texts_in_example = []
  labels_in_example = []
  for line in raw_text.split('\n'):
    if not line.strip():
      # Blank line: close the current example. Repeated blank lines are
      # collapsed instead of producing an empty (unparseable) example.
      if texts_in_example:
        texts.append(texts_in_example)
        labels.append(labels_in_example)
        texts_in_example = []
        labels_in_example = []
      continue
    fields = line.split('\t')
    if len(fields) != 2:
      raise ValueError(
        'expected "label<TAB>token" but found %d field(s) in line %r' % (len(fields), line)
      )
    label, text = fields
    texts_in_example.append(text)
    labels_in_example.append(label)
  # The stripped text has no trailing blank line, so flush the last example here.
  if texts_in_example:
    texts.append(texts_in_example)
    labels.append(labels_in_example)
  return texts, labels
# Load the training file: `texts` holds one token list per example and `tags` the matching label lists.
texts, tags = read_bio('trivia10k13train.bio')
# Number of examples parsed from the file.
len(texts)

7816

In [4]:
# Hold out 20% of the examples for validation. Fixing random_state makes the split
# (and therefore every downstream output) repeatable across kernel restarts.
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=0.2, random_state=42)
# The examples have different lengths, so wrapping them in np.array builds a ragged array,
# which NumPy deprecated and recent releases reject; len() reports the same example count.
len(train_texts)

  


(6252,)

In [5]:
# Join the tokens of the second training example back into a sentence for inspection.
' '.join(train_texts[1])

'ava gardner seduces burt lancaster in this 1946 crime drama based on a story by ernest hemingway'

In [15]:
# Assign a consecutive integer id to every distinct tag found in the corpus and
# build the lookup tables for both directions.
# NOTE(review): the ids follow the iteration order of a set of strings, which is not
# guaranteed to be the same in another interpreter session -- the hard-coded id2tag in
# the prediction cell assumes the order printed below; confirm before reusing it.
unique_tags = {tag for example in tags for tag in example}
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: index for index, tag in enumerate(unique_tags)}
print(id2tag)

{0: 'O', 1: 'I-Award', 2: 'B-Director', 3: 'I-Origin', 4: 'I-Plot', 5: 'B-Relationship', 6: 'B-Character_Name', 7: 'I-Director', 8: 'B-Opinion', 9: 'B-Origin', 10: 'B-Soundtrack', 11: 'I-Opinion', 12: 'B-Plot', 13: 'I-Genre', 14: 'I-Year', 15: 'I-Relationship', 16: 'I-Soundtrack', 17: 'I-Quote', 18: 'B-Actor', 19: 'I-Character_Name', 20: 'B-Award', 21: 'B-Quote', 22: 'B-Genre', 23: 'I-Actor', 24: 'B-Year'}


In [7]:
# Load the cased DistilBERT fast tokenizer, then encode both splits with identical settings.
# The inputs are already split into words; the offset mapping is requested because the
# label-alignment step reads it.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings, val_encodings = (
  tokenizer(split_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
  for split_texts in (train_texts, val_texts)
)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [8]:
# Show which fields the tokenizer returned for the training split.
print(train_encodings.keys())
# Decode the first seven token ids of the second training example back to text ...
print(tokenizer.decode(train_encodings['input_ids'][1][0:7]))
# ... next to the raw ids themselves ...
print(train_encodings['input_ids'][1][0:7])
# ... and the offset pair recorded for each of those seven tokens.
print(train_encodings['offset_mapping'][1][0:7])

dict_keys(['input_ids', 'attention_mask', 'offset_mapping'])
[CLS] ava gardner seduce
[101, 170, 2497, 176, 2881, 2511, 26317]
[(0, 0), (0, 1), (1, 3), (0, 1), (1, 4), (4, 7), (0, 6)]


In [9]:
def encode_tags(tags, encodings):
  """Align word-level tags with the sub-word tokens of every encoded example.

  A token is treated as the first piece of a word when its offset pair starts
  at 0 and has a non-zero end. Those tokens receive the id of the word's tag
  (looked up in the module-level ``tag2id``); every other position is set to
  -100.

  Args:
    tags: One list of tag strings per example.
    encodings: Mapping with an ``'offset_mapping'`` entry holding, per example,
      one ``(start, end)`` pair per token.

  Returns:
    A list with exactly one label row per example, each row as long as that
    example's offset list. When the number of first-piece tokens does not match
    the number of tags, the row is filled with -100 and a warning is issued, so
    the labels always stay index-aligned with ``encodings``.

  Raises:
    KeyError: If a tag is missing from ``tag2id``.
  """
  import warnings

  labels = [[tag2id[tag] for tag in example] for example in tags]
  encoded_labels = []
  for position, (example_labels, encoding_offsets) in enumerate(zip(labels, encodings['offset_mapping'])):
    # reshape keeps the array two-dimensional even when the offset list is empty
    offsets = np.array(encoding_offsets).reshape(-1, 2)
    encoded_label = np.full(len(offsets), -100, dtype=int)
    first_subtoken = (offsets[:, 0] == 0) & (offsets[:, 1] != 0)
    if int(first_subtoken.sum()) == len(example_labels):
      encoded_label[first_subtoken] = example_labels
    else:
      # Dropping the example here would shift every later label row onto the
      # wrong encoding, so keep the slot and leave it fully masked instead.
      warnings.warn(
        'example %d: %d word-initial tokens but %d tags; labels for this example are masked'
        % (position, int(first_subtoken.sum()), len(example_labels))
      )
    encoded_labels.append(encoded_label.tolist())
  return encoded_labels
# Align the tag ids with the tokenized training and validation splits.
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
# Sanity check: number of encoded training examples, then the first seven label ids of the second one.
print(len(train_labels))
print(train_labels[1][0:7])

6252
[-100, 18, -100, 23, -100, -100, 12]


In [10]:
class BioDataset(torch.utils.data.Dataset):
  """Torch dataset over tokenizer encodings with optional per-token labels.

  Args:
    encodings: Mapping from field name (it must contain ``'input_ids'``) to a
      sequence holding one entry per example. Both a plain ``dict`` and the
      tokenizer's own return value work.
    labels: Optional sequence with one label row per example. Leave it out (or
      pass an empty sequence) for inference-only data.
  """
  def __init__(self, encodings, labels=None):
    self.encodings = encodings
    self.labels = labels
  def __getitem__(self, idx):
    """Return example ``idx`` as a dict of tensors, plus ``'labels'`` when labels were given."""
    item = {key: torch.tensor(value[idx]) for key, value in self.encodings.items()}
    # Explicit None/length test: plain truthiness raises for array-like labels.
    if self.labels is not None and len(self.labels) > 0:
      item['labels'] = torch.tensor(self.labels[idx])
    return item
  def __len__(self):
    """Number of examples, taken from the ``'input_ids'`` field."""
    # Key access works for dicts and tokenizer outputs alike; attribute access does not.
    return len(self.encodings['input_ids'])
# BioDataset.__getitem__ turns every remaining key into a tensor, so remove the offsets
# once the labels have been aligned. pop() with a default handles each split on its own
# and makes rerunning this cell harmless.
train_encodings.pop('offset_mapping', None)
val_encodings.pop('offset_mapping', None)
train_dataset = BioDataset(train_encodings, train_labels)
val_dataset = BioDataset(val_encodings, val_labels)
# Show the second training item as a quick check of the tensors handed to the trainer.
print(train_dataset[1])

{'input_ids': tensor([  101,   170,  2497,   176,  2881,  2511, 26317,  1116,   171, 12549,
         2495, 26405, 20517,  1107,  1142,  3064,  3755,  3362,  1359,  1113,
          170,  1642,  1118, 14044,  3965,  1204, 23123,  1158,  2787,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [11]:
# Start from the pretrained checkpoint with a fresh token-classification head sized to the tag set.
# Passing the tag tables records the label names in the model config, so a saved model
# carries its own id <-> tag mapping instead of relying on a hand-copied table.
model = DistilBertForTokenClassification.from_pretrained(
  'distilbert-base-cased',
  num_labels=len(unique_tags),
  id2label=id2tag,
  label2id=tag2id,
)

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

In [12]:
# Settings for the fine-tuning run; checkpoints are written under ./results and logs under ./logs.
training_args = TrainingArguments(
  output_dir='./results',
  num_train_epochs=5,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=64,
  warmup_steps=500,
  weight_decay=0.01,
  logging_dir='./logs',
  # the training loss is reported every 10 steps
  logging_steps=10
)
# The Trainer ties the model to the training and validation datasets built earlier.
# NOTE(review): no evaluation schedule or metrics function is configured here, so
# eval_dataset is presumably only used if evaluate() is called explicitly -- confirm.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
)

In [13]:
# Fine-tune the model, then write the final weights and config to ./final_model,
# the directory the prediction cell loads from.
trainer.train()
trainer.save_model('./final_model')

***** Running training *****
  Num examples = 6252
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1955


Step,Training Loss
10,3.1723
20,3.1301
30,3.027
40,2.9003
50,2.6385
60,2.2647
70,1.8608
80,1.496
90,1.2082
100,1.1209


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./final_model
Configuration saved in ./final_model/config.json
Model weights saved in ./final_model/pytorch_model.bin


In [None]:
# setup for prediction
from transformers import DistilBertTokenizerFast,DistilBertForTokenClassification,Trainer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
# Reload the fine-tuned weights saved by the training cell.
model = DistilBertForTokenClassification.from_pretrained('./final_model')
# Wrap the reloaded model so predict() can run batched inference through it.
trainer = Trainer(model)
# Hand-copied id -> tag table. It must be the mapping that was printed when the model in
# ./final_model was trained; with any other mapping the predicted labels are mislabelled.
id2tag = {0: 'O', 1: 'I-Award', 2: 'B-Director', 3: 'I-Origin', 4: 'I-Plot', 5: 'B-Relationship', 6: 'B-Character_Name', 7: 'I-Director', 8: 'B-Opinion', 9: 'B-Origin', 10: 'B-Soundtrack', 11: 'I-Opinion', 12: 'B-Plot', 13: 'I-Genre', 14: 'I-Year', 15: 'I-Relationship', 16: 'I-Soundtrack', 17: 'I-Quote', 18: 'B-Actor', 19: 'I-Character_Name', 20: 'B-Award', 21: 'B-Quote', 22: 'B-Genre', 23: 'I-Actor', 24: 'B-Year'}

def predict(text):
  """Tag every word of ``text`` with the label predicted by the fine-tuned model.

  The text is tokenized, scored through the module-level ``trainer``, and the
  highest-scoring label of each token is looked up in ``id2tag``. Word pieces
  that start with ``##`` are glued onto the preceding piece, and the rebuilt
  word keeps the label of its first piece.

  Args:
    text: The raw sentence to tag.

  Returns:
    A list of ``(word, label)`` tuples in sentence order. The tokenizer's
    special tokens (``[CLS]``, ``[SEP]``) are included as entries of their own.
  """
  encoded_text = tokenizer([text], padding=True, truncation=True)
  text_ds = BioDataset(encoded_text)
  preds = trainer.predict(text_ds)
  # One row of label scores per token of the single input sentence.
  pred = preds.predictions[0]
  # argmax returns the first position of the maximum, the same pick as list.index(max).
  predicted_labels = [id2tag[int(label_pred.argmax())] for label_pred in pred]

  # Work on the word pieces directly rather than on a decoded-and-split sentence:
  # decoding can merge punctuation into neighbouring words, which would leave
  # fewer words than labels and pair them up wrongly.
  tokens = tokenizer.convert_ids_to_tokens(encoded_text['input_ids'][0])
  pairedWords = []
  for token, label in zip(tokens, predicted_labels):
    if token.startswith('##') and pairedWords:
      word, word_label = pairedWords[-1]
      pairedWords[-1] = (word + token[2:], word_label)
    else:
      pairedWords.append((token, label))
  return pairedWords

In [30]:
# Run the tagger on a sample question and show the resulting (word, label) pairs.
prediction = predict('what 2012 film reunited director martin mcdonagh with the star of his previous film in bruges colin farrell')
print(prediction)

***** Running Prediction *****
  Num examples = 1
  Batch size = 8


[('[CLS]', 'O'), ('what', 'O'), ('2012', 'B-Year'), ('film', 'O'), ('reunited', 'O'), ('director', 'O'), ('martin', 'B-Director'), ('mcdonagh', 'I-Director'), ('with', 'O'), ('the', 'O'), ('star', 'O'), ('of', 'O'), ('his', 'I-Relationship'), ('previous', 'I-Relationship'), ('film', 'I-Relationship'), ('in', 'I-Relationship'), ('bruges', 'I-Relationship'), ('colin', 'B-Actor'), ('farrell', 'I-Actor'), ('[SEP]', 'O')]
