In [None]:
import os
from glob import glob

import pandas as pd
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_dataset

# Turn off the tokenizers library's internal parallelism for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Expose only the GPU with index 1 to CUDA.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Hugging Face model id of the pretrained checkpoint; used below to load both
# the tokenizer and the sequence-classification model.
bert_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"

In [None]:
import json, requests
from bs4 import BeautifulSoup
from shutil import copyfile

# Relation label names. Integer labels index into this list (e.g. classes[gold]
# when predictions are written out), and len(classes) sets the classifier's
# number of output labels.
classes = ['has_codingsystem','code_of','has_code','negative']
def read_sample(file):
    """Load a tab-separated annotation file into a text/label mapping.

    Args:
        file: Path (or anything ``pandas.read_csv`` accepts) of a
            tab-separated file with ``Sentence`` and ``Relation`` columns.

    Returns:
        A dict with two entries: ``'text'`` holding the ``Sentence`` column
        and ``'label'`` holding the ``Relation`` column, both as pandas Series.
    """
    frame = pd.read_csv(file, sep='\t')
    return {'text': frame['Sentence'], 'label': frame['Relation']}

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(bert_name)
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Intended for ``Dataset.map(..., batched=True)``. Sequences are truncated to
    at most 512 tokens but deliberately NOT padded here: padding inside a
    batched ``map`` would pad every example to the longest one in its map batch
    and store those pad tokens in the dataset. Padding is left to the data
    collator, which pads each training/evaluation batch on the fly.

    Args:
        examples: A batch mapping with a ``"text"`` entry (list of strings).

    Returns:
        The tokenizer output for the batch (input ids, attention mask, ...).
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer; it pads the examples of each batch to a
# common length using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
# Use the GPU when one is visible to this process, otherwise fall back to the
# CPU so the notebook still runs (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from datasets import load_metric
from sklearn.metrics import f1_score
import numpy as np

def compute_metrics(eval_preds):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_preds: A pair ``(logits, labels)``; the predicted class of each
            example is the arg-max of its logits over the last axis.

    Returns:
        A dict ``{'f1': <weighted F1>}``.
    """
    scores, gold = eval_preds
    predicted_ids = np.argmax(scores, axis=-1)
    weighted_f1 = f1_score(gold, predicted_ids, average='weighted')
    return {'f1': weighted_f1}

def train_main(output_dir, model, dataset=None):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and save it.

    Evaluation and checkpointing both happen once per epoch. Training stops
    early after 3 evaluations without improvement, and the checkpoint with the
    best ``f1`` (as reported by ``compute_metrics``) is loaded back into
    ``model`` before it is saved.

    Args:
        output_dir: Directory for checkpoints and the final saved model.
        model: The sequence-classification model to train (modified in place).
        dataset: Optional mapping with ``'train'`` and ``'valid'`` tokenized
            splits. When omitted, the notebook-level ``tokenized_dataset`` is
            used, which keeps existing two-argument calls working.

    Returns:
        None.
    """
    if dataset is None:
        # Backward-compatible default: fall back to the notebook global.
        dataset = tokenized_dataset

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=1e-6,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        evaluation_strategy="epoch",
        num_train_epochs=20,
        weight_decay=0.01,
        # Save and evaluate on the same schedule so the best checkpoint can be
        # restored at the end of training.
        load_best_model_at_end=True,
        save_strategy="epoch",
        metric_for_best_model='f1',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['valid'],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()
    # With no argument, the model is written to training_args.output_dir.
    trainer.save_model()
    
def evaluation(model):
    """Print a per-class classification report for ``model`` on the test split.

    Each example of the notebook-level ``tokenized_dataset['test']`` is
    re-tokenized from its raw ``text`` and classified one at a time; the
    predicted class is the arg-max of the logits.

    Inference runs in eval mode (dropout disabled) and without gradient
    tracking; the model's previous train/eval mode is restored afterwards.

    Args:
        model: A sequence-classification model already placed on ``device``.

    Returns:
        None. The report is printed.
    """
    from sklearn.metrics import classification_report

    alllabels = []
    allpreds = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for instance in tokenized_dataset['test']:
                inputs = tokenizer(
                    instance['text'],
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt",
                ).to(device)
                # No labels are passed: only the logits are needed here.
                logits = model(**inputs).logits

                alllabels.append(instance['label'])
                allpreds.append(int(logits[0].argmax().item()))
    finally:
        model.train(was_training)

    # Passing labels explicitly keeps target_names aligned with class ids even
    # when some class never occurs in the gold labels or the predictions.
    report = classification_report(
        alllabels,
        allpreds,
        labels=list(range(len(classes))),
        target_names=classes,
        digits=4,
    )
    print(report)

### Human annotation only

In [None]:
# Train and evaluate on the human-annotated data only.
train_files = './data/RE/human_annotated/train.csv'
valid_files = './data/RE/human_annotated/dev.csv'
test_files = './data/RE/human_annotated/test.csv'

train_dict = read_sample(train_files)
test_dict = read_sample(test_files)
valid_dict = read_sample(valid_files)

train = Dataset.from_dict(train_dict)
test = Dataset.from_dict(test_dict)
valid = Dataset.from_dict(valid_dict)

# Build the split container directly instead of downloading the unrelated IMDB
# dataset just to reuse its DatasetDict object.
all_dataset = DatasetDict({'train': train, 'valid': valid, 'test': test})

tokenized_dataset = all_dataset.map(preprocess_function, batched=True)
output_dir = "./models/RE_BiomedBERT/"
model = AutoModelForSequenceClassification.from_pretrained(bert_name, num_labels=len(classes)).to(device)
train_main(output_dir,model)
evaluation(model)

### Synthetic data (followed by continual training on human annotations)

In [None]:
# Synthetic train/validation data; the test split is the human-annotated one.
train_files = './data/RE/synthetic/train.csv'
valid_files = './data/RE/synthetic/valid.csv'
test_files = './data/RE/human_annotated/test.csv'

train_dict = read_sample(train_files)
valid_dict = read_sample(valid_files)
test_dict = read_sample(test_files)

train = Dataset.from_dict(train_dict)
valid = Dataset.from_dict(valid_dict)
test = Dataset.from_dict(test_dict)

# Build the split container directly instead of downloading the unrelated IMDB
# dataset just to reuse its DatasetDict object.
all_dataset = DatasetDict({'train': train, 'valid': valid, 'test': test})

tokenized_dataset = all_dataset.map(preprocess_function, batched=True)

In [None]:
from collections import Counter
classes = ['has_codingsystem','code_of','has_code','negative']

# Label frequency of the current train and validation splits, one Counter per line.
print(Counter(train['label']), Counter(valid['label']), sep='\n')

In [None]:
# Output directory for the model trained on synthetic data; a later cell
# reloads the model from this same `output_dir` value for continual training.
output_dir = "./models/RE_BiomedBERT_synthetic/"

In [None]:
# Start from the pretrained checkpoint with a classification head sized to the
# label set, move it to the target device, then fine-tune on the current splits.
model = AutoModelForSequenceClassification.from_pretrained(
    bert_name,
    num_labels=len(classes),
)
model = model.to(device)
train_main(output_dir, model)

In [None]:
# Continual training: start from the model trained on synthetic data and
# fine-tune it further on the human-annotated data.
train_files = './data/RE/human_annotated/train.csv'
valid_files = './data/RE/human_annotated/dev.csv'
test_files = './data/RE/human_annotated/test.csv'

train_dict = read_sample(train_files)
test_dict = read_sample(test_files)
valid_dict = read_sample(valid_files)

train = Dataset.from_dict(train_dict)
test = Dataset.from_dict(test_dict)
valid = Dataset.from_dict(valid_dict)

# Build the split container directly instead of downloading the unrelated IMDB
# dataset just to reuse its DatasetDict object.
all_dataset = DatasetDict({'train': train, 'valid': valid, 'test': test})
tokenized_dataset = all_dataset.map(preprocess_function, batched=True)

# Name the source checkpoint explicitly. Loading from `output_dir` and then
# reassigning it made this cell non-idempotent: a second run would load the
# continual model instead of the synthetic one.
synthetic_model_dir = "./models/RE_BiomedBERT_synthetic/"
model = AutoModelForSequenceClassification.from_pretrained(synthetic_model_dir, num_labels=len(classes)).to(device)
output_dir = "./models/RE_BiomedBERT_synthetic_continual/"
train_main(output_dir,model)
evaluation(model)

In [None]:
# Re-run inference over the test split, keeping the gold labels and predicted
# class ids so they can be written out per example afterwards.
allpreds = []
alllabels = []

model.eval()  # disable dropout for deterministic predictions
with torch.no_grad():  # inference only: no autograd graph needed
    for instance in tokenized_dataset['test']:
        inputs = tokenizer(
            instance['text'],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(device)
        # No labels are passed: only the logits are needed here.
        logits = model(**inputs).logits

        alllabels.append(instance['label'])
        allpreds.append(int(logits[0].argmax().item()))

In [None]:
# Write one line per test example: text <TAB> gold label name <TAB> predicted
# label name. The encoding is explicit so non-ASCII characters in the sentences
# are written the same way regardless of the platform's default encoding.
with open('./RE_output.txt', 'w', encoding='utf-8') as f:
    for text, gold, prediction in zip(test_dict['text'], alllabels, allpreds):
        f.write('\t'.join((text, classes[gold], classes[prediction])) + '\n')