In [1]:
from datasets import load_dataset,Dataset
from glob import glob
import os,torch,random
from shutil import copyfile
cache_dir = '/Users/yr255/Downloads/kiwi_development/data/'
model_name = "emilyalsentzer/Bio_ClinicalBERT"

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
template_dataset = load_dataset("wnut_17",cache_dir=cache_dir)
label_list = ['O','B-problem','I-problem','B-treatment','I-treatment','B-test','I-test','B-drug','I-drug']

In [4]:
def data_loader(files):
    data_dict = {'id':[],'tokens':[],'ner_tags':[]}
    i=0
    for file in files:
        with open(file,'r') as f:
            lines = f.read().splitlines()
            tokens = []
            tags = []
            for line in lines:
                if line != '':
                    token,tag = line.split('\t')
                    tokens.append(token)
                    tags.append(label_list.index(tag.replace('B-temporal_expression','O').replace('I-temporal_expression','O').replace('B-social_circumstance','O').replace('I-social_circumstance','O')))
                else:
                    data_dict['id'].append(str(i))
                    data_dict['tokens'].append(tokens)
                    data_dict['ner_tags'].append(tags)
                    tokens = []
                    tags = []
                    i+=1
            data_dict['id'].append(str(i))
            data_dict['tokens'].append(tokens)
            data_dict['ner_tags'].append(tags)
            tokens = []
            tags = []
            i+=1
    return data_dict

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [7]:
id2label = {}
for i,label in enumerate(label_list):
    id2label.update({i:label})
label2id = {}
for i,label in enumerate(label_list):
    label2id.update({label:i})

In [8]:
train_files = glob('/Users/yr255/Downloads/kiwi_development/data/NER_main_BERT_train.bio')
valid_files = glob('/Users/yr255/Downloads/kiwi_development/data/NER_main_BERT_dev.bio')
train = Dataset.from_dict(data_loader(train_files))
valid = Dataset.from_dict(data_loader(valid_files))

template_dataset['train'] = train
template_dataset['validation'] = valid

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,cache_dir=cache_dir)
tokenized_dataset = template_dataset.map(tokenize_and_align_labels, batched=True)
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id,cache_dir=cache_dir
)
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir=f"./models/main_NER/",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    do_train = True,
    do_eval = True,
    do_predict = True,
    metric_for_best_model = 'f1'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

#trainer.train()
#trainer.save_model()

Map:   0%|          | 0/14502 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/14502 [00:00<?, ? examples/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### individual test sets

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,cache_dir=cache_dir)
tokenized_dataset = template_dataset.map(tokenize_and_align_labels, batched=True)
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback
model_dir = "/Users/yr255/Downloads/kiwi_development/BERT/models/main_NER/"
model = AutoModelForTokenClassification.from_pretrained(
    model_dir, num_labels=len(label_list), id2label=id2label, label2id=label2id,cache_dir=cache_dir
)
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir=model_dir,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    do_train = False,
    do_eval = False,
    do_predict = True,
    metric_for_best_model = 'f1'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

Map:   0%|          | 0/14502 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/14502 [00:00<?, ? examples/s]

  trainer = Trainer(


In [10]:
for dataset in ['i2b2']:

    test_files = glob(f'/Users/yr255/Downloads/kiwi_development/data/NER_main_BERT_test_{dataset}.bio')
    test = Dataset.from_dict(data_loader(test_files))
    template_dataset['test'] = test
    tokenized_dataset = template_dataset.map(tokenize_and_align_labels, batched=True)
    predictions, labels, metrics = trainer.predict(tokenized_dataset['test'], metric_key_prefix="predict")
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    !mkdir ./output/
    with open(f'./output/BERT_main_output_test_prediction_{dataset}.bio','w') as f:
        for sentence, predictions,golds in zip(template_dataset['test']['tokens'], true_predictions, template_dataset['test']['ner_tags']):
            for token,tag,gold in zip(sentence,predictions,golds):
                f.write(f'{token}\t{label_list[gold]}\t{tag}\n')
            f.write('\n')
    !python ../evaluate_jianfu_original.py -lf ./output/BERT_main_output_test_prediction_{dataset}.bio

Map:   0%|          | 0/14502 [00:00<?, ? examples/s]

Map:   0%|          | 0/14502 [00:00<?, ? examples/s]

Map:   0%|          | 0/5040 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/yr255/.netrc
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


mkdir: ./output/: File exists


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


evaluate: ./output/BERT_main_output_test_prediction_i2b2.bio /Users/yr255/Downloads/kiwi_development/merged_gold_pred.eval_score.txt tab ner False 




P(exact)	R(exact)	F1(exact)	P(relax)	R(relax)	F1(relax)	right	right_predict	right_gold	predict	gold	Semantic
0.853	0.823	0.838	0.958	0.917	0.937	623	699	694	730	757	drug
0.794	0.810	0.802	0.904	0.925	0.915	1279	1456	1461	1610	1579	problem
0.814	0.824	0.819	0.895	0.911	0.903	1162	1277	1286	1427	1411	test
0.636	0.639	0.637	0.756	0.764	0.760	281	334	336	442	440	treatment
0.795	0.799	0.797	0.895	0.902	0.898	3345	3766	3777	4209	4187	overall
evaluate done.
