In [None]:
!pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import pandas as pd

In [None]:
data = load_dataset("tner/bc5cdr")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(data)



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5865
    })
})


In [None]:
def convert_broke_dataframe_to_fixed(broken_frame):
    # print(broken_frame)
    count = 0
    new_store = []
    while count < len(broken_frame):
        try:
            token = eval(broken_frame['tokens'][count])
            tag = eval(broken_frame['tags'][count])
            new_store.append({'tokens': token, 'tags': tag})
        except Exception as err:
            print(f"errored on: {err}")
        finally:
            count += 1
    return pd.DataFrame(new_store)

In [None]:

data
pt_data = pd.read_csv('/content/pt_transformed_data.csv', names=['tokens', 'tags'])

pt_df_train = pt_data[0:5455]
pt_df_test = pt_data[5456:10711]
pt_df_validation = pt_data[10712:]

print(f"pt_df_train: {len(pt_df_train)}")
print(f"pt_df_test: {len(pt_df_test)}")
print(f"pt_df_validation: {len(pt_df_validation)}")

pt_df_train = pt_df_train.reset_index()
pt_df_test = pt_df_test.reset_index()
pt_df_validation = pt_df_validation.reset_index()


df_train = pd.DataFrame(data['train'])
df_validation = pd.DataFrame(data['validation'])
df_test = pd.DataFrame(data['test'])

print(f"df_train size: {len(df_train)}")
print(f"df_validation size: {len(df_validation)}")
print(f"df_test size: {len(df_test)}")

df_train = df_train.append(convert_broke_dataframe_to_fixed(pt_df_train))
df_test = df_test.append(convert_broke_dataframe_to_fixed(pt_df_test))
df_validation = df_validation.append(convert_broke_dataframe_to_fixed(pt_df_validation))

df_train = df_train.reset_index()
df_test = df_test.reset_index()
df_validation = df_validation.reset_index()

print(len(df_train))
print(len(df_validation))
print(len(df_test))


data = DatasetDict(
    {
        "train": Dataset.from_pandas(df_train),
        "test": Dataset.from_pandas(df_test),
        "validation": Dataset.from_pandas(df_validation)
    }
)

pt_df_train: 5455
pt_df_test: 5255
pt_df_validation: 5653
df_train size: 5228
df_validation size: 5330
df_test size: 5865
10683
10983
11120


In [None]:
def shift_label(label):
    if label % 2 == 1:
        label += 1
    return label
  
def label_lookup(label_index):
    table = [
      "O",
      "B-Chem",
      "B-Dis",
      "I-Dis",
      "I-Chem",
      "B-Prior",
      "I-Prior"
    ]
    return table[label_index]

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        elif word_id != current_word:
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            new_labels.append(shift_label(labels[word_id]))
    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    new_labels = []
    for i, labels in enumerate(examples["tags"]):
        if 5 in labels:
          print(f"i: {i}, labels: {labels}")
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
tokenized_data = data.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

i: 228, labels: [0, 0, 0, 0, 0, 5, 0, 0, 5, 0, 5, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 229, labels: [5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 230, labels: [5, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 231, labels: [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 232, labels: [5, 0, 0, 0, 0, 5, 0, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 233, labels: [0, 0, 0, 5, 0, 5, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 234, labels: [0, 0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0, 5, 6, 0, 0, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 235, labels: [0, 0, 0, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 236, labels: [5, 6, 0, 0, 0, 0, 0, 0

  0%|          | 0/12 [00:00<?, ?ba/s]

i: 865, labels: [5, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0]
i: 866, labels: [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 868, labels: [5, 0, 0, 0, 0, 0, 0, 5, 0]
i: 869, labels: [0, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 873, labels: [5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 874, labels: [0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0]
i: 875, labels: [5, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 877, labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 878, labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 879, labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 880, lab

  0%|          | 0/11 [00:00<?, ?ba/s]

i: 331, labels: [0, 5, 0, 5, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0]
i: 332, labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 333, labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 334, labels: [5, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 335, labels: [5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 337, labels: [0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 338, labels: [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 339, labels: [5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 340, labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 341, labels: [0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
i: 342, labels: [0, 0, 0, 0, 0, 0, 0, 

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
tf_train_set = tokenized_data["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)
tf_validation_set = tokenized_data["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  tensor = as_tensor(value)


In [None]:
from transformers import create_optimizer


In [None]:
batch_size = 16
num_train_epochs = 3
num_train_steps = (len(tokenized_data["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [None]:
from transformers import TFAutoModelForTokenClassification
model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=7)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForTokenClassification: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc

In [None]:
import tensorflow as tf
model.compile(optimizer=optimizer)
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f97d2457b90>

In [None]:
!mkdir -p model
model.save_pretrained('model/')
# model.save('saved_model/my_model')


In [None]:
model.summary()

Model: "tf_distil_bert_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  5383      
                                                                 
Total params: 66,368,263
Trainable params: 66,368,263
Non-trainable params: 0
_________________________________________________________________


In [None]:
!tar -czvf /content/pt_nci_evs_biobert.tar.gz /content/model
from google.colab import files
files.download('/content/pt_nci_evs_biobert.tar.gz') 

tar: Removing leading `/' from member names
/content/model/
/content/model/config.json
/content/model/tf_model.h5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import pipeline

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "chemotherapy"
example = "immunotherapy drugs"
example = "ibrutinib and radiation therapy may reduce"
# example = "hydroxychloroquine is an autophagy inhibitor"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'LABEL_5', 'score': 0.99598926, 'index': 1, 'word': 'ib', 'start': 0, 'end': 2}, {'entity': 'LABEL_6', 'score': 0.9971138, 'index': 2, 'word': '##rut', 'start': 2, 'end': 5}, {'entity': 'LABEL_6', 'score': 0.9977344, 'index': 3, 'word': '##ini', 'start': 5, 'end': 8}, {'entity': 'LABEL_6', 'score': 0.9975877, 'index': 4, 'word': '##b', 'start': 8, 'end': 9}, {'entity': 'LABEL_0', 'score': 0.99338746, 'index': 5, 'word': 'and', 'start': 10, 'end': 13}, {'entity': 'LABEL_5', 'score': 0.99495536, 'index': 6, 'word': 'radiation', 'start': 14, 'end': 23}, {'entity': 'LABEL_6', 'score': 0.9946302, 'index': 7, 'word': 'therapy', 'start': 24, 'end': 31}, {'entity': 'LABEL_0', 'score': 0.99899095, 'index': 8, 'word': 'may', 'start': 32, 'end': 35}, {'entity': 'LABEL_0', 'score': 0.9988029, 'index': 9, 'word': 'reduce', 'start': 36, 'end': 42}]
