In [1]:
# Colab only: skip this cell when running locally.
# Prerequisite: upload re.zip to the top level of your Google Drive ("MyDrive").

from google.colab import drive
drive.mount('/content/drive')

# Copy re.zip from the mounted Drive into the current working directory.
# NOTE(review): the relative "./drive" path assumes the working directory is /content
# (the parent of the mount point used above) — adjust the path if that is not the case.
!cp "./drive/MyDrive/re.zip" "./"

Mounted at /content/drive


If you want to run this code locally instead:
1. Download re.zip first.
2. Change the working directory to the folder where re.zip is located:
   import the os module and call os.chdir("your directory"),
   replacing "your directory" with the path to that folder.

 import os
 os.chdir("your directory")

In [2]:
# unzip on colab
!unzip ./re.zip -d .
# install module
!pip install ./re/open_nre-0.1.1-py3-none-any.whl

Archive:  ./re.zip
   creating: ./re/
   creating: ./re/benchmark/
   creating: ./re/benchmark/semeval/
  inflating: ./re/benchmark/semeval/semeval_rel2id.json  
  inflating: ./re/benchmark/semeval/semeval_test.txt  
  inflating: ./re/benchmark/semeval/semeval_train.txt  
  inflating: ./re/benchmark/semeval/semeval_val.txt  
   creating: ./re/ckpt/
  inflating: ./re/ckpt/semeval_bert-base-uncased_entity.pth.tar  
  inflating: ./re/ckpt/semeval_cnn.pth.tar  
  inflating: ./re/open_nre-0.1.1-py3-none-any.whl  
   creating: ./re/pretrain/
   creating: ./re/pretrain/bert-base-uncased/
  inflating: ./re/pretrain/bert-base-uncased/config.json  
  inflating: ./re/pretrain/bert-base-uncased/pytorch_model.bin  
  inflating: ./re/pretrain/bert-base-uncased/vocab.txt  
   creating: ./re/pretrain/glove/
  inflating: ./re/pretrain/glove/glove.6B.50d_mat.npy  
  inflating: ./re/pretrain/glove/glove.6B.50d_word2id.json  
Processing ./re/open_nre-0.1.1-py3-none-any.whl
Installing collected packages: o

In [3]:
import os
import logging
import torch
import opennre
from opennre import encoder, model, framework
import json
import numpy as np

Please refer to the readme for the correct configuration of the dataset and checkpoint file paths.

Configure the relevant parameters for the dataset and the models. base_config holds the generic parameters, including the run directory and the dataset location, while the model-specific dictionaries hold what is required for the inference of each model and for running the framework. Each final model configuration inherits from base_config. Customise these as needed.

In [4]:
# Settings shared by every model. The four *_file entries start out empty and
# are filled in further down, once the dataset directory has been resolved.
base_config = dict(
    dataset="semeval",
    train_file="",
    val_file="",
    test_file="",
    rel2id_file="",
    root_path="./re",
    only_test=True,
)

# Model-specific settings; "ckpt" is likewise filled in further down.
cnn_config = dict(model_type="cnn", ckpt="")

bert_config = dict(model_type="bert", ckpt="")

def merge_configs(base, specific):
    """Return a new dict with ``specific`` layered on top of ``base``.

    Keys present in both mappings take their value from ``specific``.
    Neither input is modified.
    """
    merged = dict(base)
    merged.update(specific)
    return merged

Load all datasets and pre-training files. If you need to customise the path, you can modify it here.

In [5]:
# Resolve the dataset files under <root_path>/benchmark/<dataset>/ and record
# them in base_config so that every model configuration inherits them.
dataset_path = os.path.join(base_config["root_path"], "benchmark", base_config["dataset"])
base_config["train_file"] = os.path.join(dataset_path, f"{base_config['dataset']}_train.txt")
base_config["val_file"] = os.path.join(dataset_path, f"{base_config['dataset']}_val.txt")
base_config["test_file"] = os.path.join(dataset_path, f"{base_config['dataset']}_test.txt")
base_config["rel2id_file"] = os.path.join(dataset_path, f"{base_config['dataset']}_rel2id.json")

# Pre-trained word embeddings: a token -> id mapping and the matching matrix.
# The context manager closes the JSON file as soon as it has been parsed.
with open(os.path.join(base_config["root_path"], 'pretrain/glove/glove.6B.50d_word2id.json')) as word2id_file:
    word2id = json.load(word2id_file)
word2vec = np.load(os.path.join(base_config["root_path"], 'pretrain/glove/glove.6B.50d_mat.npy'))

Ensure the 'ckpt' directory exists and meets naming requirements.

In [6]:
# Checkpoints are expected in <root_path>/ckpt, named "<dataset>_<model tag>.pth.tar".
ckpt_dir = os.path.join(base_config["root_path"], "ckpt")
os.makedirs(ckpt_dir, exist_ok=True)  # no error when the directory already exists
cnn_config.update(
    ckpt=os.path.join(ckpt_dir, f"{base_config['dataset']}_cnn.pth.tar")
)
bert_config.update(
    ckpt=os.path.join(ckpt_dir, f"{base_config['dataset']}_bert-base-uncased_entity.pth.tar")
)

In [7]:
# Show the resolved shared configuration, one "key:value" pair per line.
for key, value in base_config.items():
    print(f"{key}:{value}")

dataset:semeval
train_file:./re/benchmark/semeval/semeval_train.txt
val_file:./re/benchmark/semeval/semeval_val.txt
test_file:./re/benchmark/semeval/semeval_test.txt
rel2id_file:./re/benchmark/semeval/semeval_rel2id.json
root_path:./re
only_test:True


load_model first loads the relation-to-id mapping file and builds the corresponding encoder from the opennre package according to the model type. Both are then passed as parameters to the classifier SoftmaxNN for sentence-level relation extraction. The function returns the resulting model instance.

In [8]:
def load_model(config):
    """Build the sentence-level relation classifier described by ``config``.

    Args:
        config: dict with the keys
            - "rel2id_file": path to the JSON relation -> id mapping;
            - "model_type": "cnn" or "bert";
            - "pretrain_path" (optional, "bert" only): value passed to the
              BERT encoder as ``pretrain_path``. Defaults to
              "bert-base-uncased"; set it to a local directory to avoid
              fetching the weights.

    Returns:
        An ``opennre.model.SoftmaxNN`` wrapping the selected sentence encoder.

    Raises:
        NotImplementedError: if ``config["model_type"]`` is not supported.

    Note:
        The "cnn" branch builds a ``PCNNEncoder`` and reads the module-level
        ``word2id`` and ``word2vec`` objects, which must already be loaded.
    """
    # Close the mapping file as soon as it has been parsed.
    with open(config["rel2id_file"]) as rel2id_file:
        rel2id = json.load(rel2id_file)

    model_type = config["model_type"]
    if model_type == "cnn":
        sentence_encoder = opennre.encoder.PCNNEncoder(
            token2id=word2id,
            max_length=128,
            word_size=50,
            position_size=5,
            hidden_size=230,
            blank_padding=True,
            kernel_size=3,
            padding_size=1,
            word2vec=word2vec,
            dropout=0.5
        )
    elif model_type == "bert":
        sentence_encoder = opennre.encoder.BERTEntityEncoder(
            max_length=128,
            pretrain_path=config.get("pretrain_path", "bert-base-uncased"),
            mask_entity=False
        )
    else:
        raise NotImplementedError("Model type not supported")

    model_instance = opennre.model.SoftmaxNN(sentence_encoder, len(rel2id), rel2id)

    return model_instance

load_framework loads the class SentenceRE for evaluating models from the package opennre. It returns a framework instance containing the functions eval_model and load_state_dict for evaluation.

In [9]:
def load_framework(config, model):
    """Wrap ``model`` in an ``opennre.framework.SentenceRE`` instance.

    The train/val/test file paths and the checkpoint path are read from
    ``config``. A "cnn" model passes no further options, whereas a "bert"
    model also passes an explicit learning rate, batch size, epoch count and
    optimiser.

    Raises:
        NotImplementedError: if ``config["model_type"]`` is neither "cnn"
            nor "bert".
    """
    kind = config["model_type"]
    if kind == "cnn":
        # No overrides: only the paths, model and checkpoint are passed.
        extra_options = {}
    elif kind == "bert":
        extra_options = dict(lr=2e-5, batch_size=64, max_epoch=3, opt='adamw')
    else:
        raise NotImplementedError("framework not supported")

    return opennre.framework.SentenceRE(
        train_path=config["train_file"],
        val_path=config["val_file"],
        test_path=config["test_file"],
        model=model,
        ckpt=config["ckpt"],
        **extra_options,
    )

Merge the base configuration with each model-specific configuration, then build a model instance for each. The model which is trained by us is available for download from the link in the Readme.

In [10]:
# Layer each model-specific configuration on top of the shared settings.
cnn_config, bert_config = (
    merge_configs(base_config, cnn_config),
    merge_configs(base_config, bert_config),
)

# Build one classifier per configuration.
cnn_model, bert_model = load_model(cnn_config), load_model(bert_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

This snippet checks whether a GPU can be used for inference and, if so, moves both models onto it. It is not necessary.

In [11]:
# Detect CUDA once and reuse the answer below.
ngpu = 1
flag = torch.cuda.is_available()
device = torch.device("cuda:0" if flag and ngpu > 0 else "cpu")
print("device: ", device)

# Move both models to the GPU only when one is actually available.
if flag:
    cnn_model, bert_model = cnn_model.cuda(), bert_model.cuda()
    print("GPU: ", torch.cuda.get_device_name(0))

device:  cuda:0
GPU:  Tesla T4


Load the evaluation framework into one instance per model. The test set is loaded at the same time, so the framework instance outputs essential details about the test set.

In [12]:
# One evaluation framework per model.
cnn_framework = load_framework(cnn_config, cnn_model)
bert_framework = load_framework(bert_config, bert_model)
# Backward-compatible alias: the BERT evaluation cell further down still uses
# the original, misspelled name.
bert_framwork = bert_framework



Function for evaluating a model. The checkpoint file for the trained model is loaded here. This means that if the model is not final, it can be further fine-tuned for better performance. The function then gets the results of the evaluation and formats the output. For the different models, the same four metrics (accuracy, micro precision, micro recall and micro F1) are used uniformly for the results. This enables a fairer comparison of the weighted assessment results of the models for different quantities of relations.

In [13]:
def eval_model(config, framework):
    """Load the checkpoint named in ``config`` and print test-set metrics.

    Nothing happens unless ``config["only_test"]`` is truthy.

    Args:
        config: dict providing "only_test", "model_type" ("cnn" or "bert")
            and "ckpt" (path to a checkpoint containing a 'state_dict' entry).
        framework: object exposing ``load_state_dict``, ``eval_model`` and
            ``test_loader`` (here, the instance built by ``load_framework``).

    Returns:
        None. Accuracy and micro precision/recall/F1 are printed.

    Raises:
        NotImplementedError: if the model type is not supported. This is
            checked before the checkpoint is read, so an unsupported type
            does not cost a full evaluation pass first.
    """
    if not config["only_test"]:
        return

    model_type = config["model_type"]
    if model_type not in ("cnn", "bert"):
        raise NotImplementedError("evaluation not supported")

    # map_location="cpu" lets a checkpoint that was saved on a GPU be read on a
    # CPU-only machine. NOTE(review): this relies on framework.load_state_dict
    # copying the values into the model's existing parameters (torch.nn.Module
    # behaviour) — confirm for the framework class in use.
    checkpoint = torch.load(config["ckpt"], map_location="cpu")
    framework.load_state_dict(checkpoint['state_dict'])
    result = framework.eval_model(framework.test_loader)

    print('Test set results on {}:'.format(model_type))
    print('Accuracy: {}'.format(result['acc']))
    print('Micro precision: {}'.format(result['micro_p']))
    print('Micro recall: {}'.format(result['micro_r']))
    print('Micro F1: {}'.format(result['micro_f1']))

In [14]:
# Load the CNN checkpoint and print its metrics on the test set.
eval_model(cnn_config, cnn_framework)

100%|██████████| 85/85 [00:02<00:00, 29.84it/s, acc=0.683]


Test set results on cnn:
Accuracy: 0.6831063673168937
Micro precision: 0.7566954153427145
Micro recall: 0.7366327883340699
Micro F1: 0.7465293327362292


In [15]:
# Load the BERT checkpoint and print its metrics on the test set.
eval_model(bert_config, bert_framwork)

100%|██████████| 43/43 [00:18<00:00,  2.39it/s, acc=0.831]

Test set results on bert:
Accuracy: 0.8306956201693044
Micro precision: 0.8696785403996524
Micro recall: 0.8846663720724701
Micro F1: 0.8771084337349397





Users can use both of the previous models to extract the relation of their own input sentences, but they need to follow the input python dictionary format: The dictionary consists of three key components:
- "token" is a list of strings representing the tokenized sentence. Each element in the list corresponds to a word or punctuation mark in the original sentence.
- "h" is a dictionary representing the head entity in the relation. It includes two keys:
    - "name": A string indicating the name of the head entity.
    - "pos": A list of two integers marking the start (inclusive) and end (exclusive) positions of the head entity in the tokenized sentence, using zero-based indexing. For example, [1, 2] selects only the token at index 1.
- "t" is a dictionary representing the tail entity in the relation, structured similarly to the "h" dictionary:
    - "name": A string indicating the name of the tail entity.
    - "pos": A list of two integers marking the start (inclusive) and end (exclusive) positions of the tail entity in the tokenized sentence, using zero-based indexing.

The result returned for each input is the predicted relation (one of the relations learned during training) together with its confidence score.

In [16]:
# Run the CNN model on two hand-written examples and print what infer() returns.
# Each "pos" is a half-open [start, end) token span:
#   example 1: token[1:2] == ["thesis"], token[4:6] == ["clinical", "characteristics"]
#   example 2: token[19:20] == ["roof"], token[22:23] == ["building"]
print(cnn_model.infer({"token": ["this", "thesis", "defines", "the", "clinical", "characteristics", "of", "amyloid", "disease", "."], "h": {"name": "thesis", "pos": [1, 2]}, "t": {"name": "clinical characteristics", "pos": [4, 6]}}
))
print(cnn_model.infer({"token": ["the", "order", ",", "issued", "nov.", "16", "by", "eric", "landoll", ",", "the", "city", "'s", "code", "administrator", ",", "states", "that", "the", "roof", "of", "the", "building", "is", "allowing", "rain", "to", "fall", "into", "the", "interior", "of", "the", "building", "and", "because", "of", "this", "the", "interior", "floors", "are", "structurally", "unsound", "and", "there", "is", "mold", "and", "mildew", "throughout", "the", "building", "causing", "health", "concerns", "for", "anyone", "entering", "the", "building", "."], "h": {"name": "roof", "pos": [19, 20]}, "t": {"name": "building", "pos": [22, 23]}}
))

('Message-Topic(e1,e2)', 0.9395318031311035)
('Component-Whole(e1,e2)', 0.922989547252655)


In [18]:
# Run the BERT model on the same two hand-written examples used for the CNN model.
# Each "pos" is a half-open [start, end) token span:
#   example 1: token[1:2] == ["thesis"], token[4:6] == ["clinical", "characteristics"]
#   example 2: token[19:20] == ["roof"], token[22:23] == ["building"]
print(bert_model.infer({"token": ["this", "thesis", "defines", "the", "clinical", "characteristics", "of", "amyloid", "disease", "."], "h": {"name": "thesis", "pos": [1, 2]}, "t": {"name": "clinical characteristics", "pos": [4, 6]}}
))
print(bert_model.infer({"token": ["the", "order", ",", "issued", "nov.", "16", "by", "eric", "landoll", ",", "the", "city", "'s", "code", "administrator", ",", "states", "that", "the", "roof", "of", "the", "building", "is", "allowing", "rain", "to", "fall", "into", "the", "interior", "of", "the", "building", "and", "because", "of", "this", "the", "interior", "floors", "are", "structurally", "unsound", "and", "there", "is", "mold", "and", "mildew", "throughout", "the", "building", "causing", "health", "concerns", "for", "anyone", "entering", "the", "building", "."], "h": {"name": "roof", "pos": [19, 20]}, "t": {"name": "building", "pos": [22, 23]}}
))

('Message-Topic(e1,e2)', 0.9929176568984985)
('Component-Whole(e1,e2)', 0.9707645773887634)


 Compare and Analyse:  

   Combining the experimental process and the final data obtained, the advantages and disadvantages of the two models can be summarized. The strengths of the BERT model, which the PCNN needs to improve, are the following:
1. BERT is able to capture long-distance dependencies between words and complex contextual information more comprehensively through its bi-directional Transformer structure, leading to a deeper understanding of the semantics of sentences. Meanwhile, BERT's pre-training process covers a large amount of textual data, allowing the model to learn a wide range of linguistic knowledge before being fine-tuned for a specific task. These help the BERT model to better handle complex linguistic phenomena and improve the accuracy of relation extraction. PCNN focuses mainly on local feature extraction and may not be able to fully understand the overall meaning of the sentence.
2. The BERT model has better generalization ability due to its deep bi-directional structure and extensive pre-training. It is able to effectively handle texts from different domains and styles, whereas PCNN may require more adaptation to specific domains or datasets. The strategy of utilizing sub-word units (e.g. WordPiece) enables BERT to handle rare words and polysemous words more effectively.
3. BERT, as an end-to-end model, is able to learn directly from raw text to final relational labels without the need for complex feature engineering or preprocessing steps, which simplifies the model training process and potentially improves performance.

As good as BERT is at RE tasks, it still has some flaws:
1. BERT requires larger computational resources, including GPU memory and processing power, due to its deep Transformer architecture. In contrast, the PCNN model has a relatively simple structure and lower resource requirements, making it more suitable for resource-constrained environments.
2. During the training process, BERT was trained for 3 epochs, but the training time was the same as or even longer than that of PCNN trained for 100 epochs. The number of parameters in the BERT model far exceeds that of a traditional CNN model, which means that more time is required in training and applying the model, especially on large-scale datasets.
3. BERT has a higher model complexity, not only because of its high number of parameters, but also because of the complexity of its Transformer architecture. This may lead to increased difficulty in model debugging, optimisation and understanding, whereas PCNN may be easier to deal with in these areas due to its relatively simple architecture.
