In [None]:
'''
For more information, see: https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertModel
'''
from transformers import AutoTokenizer, AutoModel
import torch

# Hugging Face Hub id of the pretrained checkpoint; the tokenizer and the model
# must be loaded from the same checkpoint.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-v1.1"

tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)

# Per the Hugging Face BertModel docs, calling this model is expected to return a
# transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions.
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)

''' 
    About the model parameters
    See https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertModel.forward
    author: I don't think we need to understand every parameter of this function, because the model we use was
    trained by others. The only thing we need to understand is the attributes of the function's output.

    About the return value
    author: According to the official documentation, the function returns either a
    "transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions" or a "tuple(torch.FloatTensor)",
    comprising various elements depending on the configuration (BertConfig) and the inputs.

        output property list
        1. last_hidden_state(torch.FloatTensor of shape (batch_size, sequence_length, hidden_size))
            Sequence of hidden-states at the output of the last layer of the model.
        2. pooler_output(torch.FloatTensor of shape (batch_size, hidden_size))
            hidden-state of the first token of the sequence (classification token) after further processing 
            through the layers used for the auxiliary pretraining task.
            author: i.e. the (further processed) vector of the [CLS] token
        3. hidden_states (tuple(torch.FloatTensor))
            🔺 Optional: returned when output_hidden_states=True is passed or when
            config.output_hidden_states=True.
            Tuple of torch.FloatTensor (one for the output of the embeddings, if the model has an embedding 
            layer, + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size)
        4. attentions (tuple(torch.FloatTensor))
            🔺 Optional: returned when output_attentions=True is passed or when
            config.output_attentions=True.
            Attentions weights after the attention softmax, used to compute the weighted average in the 
            self-attention heads.
        5. cross_attentions (tuple(torch.FloatTensor))
            🔺 Optional: returned when output_attentions=True and config.add_cross_attention=True is
            passed, or when config.output_attentions=True.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to 
            compute the weighted average in the cross-attention heads.
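        6. past_key_values (tuple(tuple(torch.FloatTensor)))
            🔺 Optional: returned when use_cache=True is passed or when config.use_cache=True.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and, optionally
            if config.is_encoder_decoder=True, in the cross-attention blocks) that can be used (see
            past_key_values input) to speed up sequential decoding.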
'''

In [1]:
import json

In [50]:
# read json(test)
# Load the entity dictionary back from the test output file.
with open("./ner_output/test_w.json", "r") as json_file:
    data = json.loads(json_file.read())


In [51]:
# Pretty-print the loaded entities with a 4-space indent for easy inspection.
formatted = json.dumps(data, indent=4)
print(formatted)

{
    "A1": {
        "type": "disease",
        "alias": "a1"
    },
    "B2": {
        "type": "disease"
    },
    "C1": {
        "type": "disease",
        "alias": "c1"
    },
    "A2": {
        "type": "disease"
    }
}


In [23]:
# Iterating over the loaded dict yields its keys (the entity names), one per line.
for entity_name in data:
    print(entity_name)

A1
B2
C1
A2


In [47]:
def merg_or_append(entities, name, entity):
    """Add ``entity`` under ``name`` unless a case-insensitive match already exists.

    Keys of ``entities`` are compared with ``name`` ignoring case:

    * If a key matches but is spelled differently, ``name`` is recorded as the
      ``'alias'`` of that existing entry.
    * If a key matches exactly, nothing changes, so a repeated name neither
      becomes its own alias nor overwrites an alias recorded earlier.
    * If no key matches, ``entity`` is stored under ``name``.

    Args:
        entities: dict mapping entity name -> entity dict; mutated in place.
        name: name of the candidate entity.
        entity: dict stored under ``name`` when no match is found.

    Returns:
        None.
    """
    lowered = name.lower()

    for key in entities:
        if key.lower() != lowered:
            continue
        # Only a differently-cased spelling counts as an alias; an exact
        # repeat of the key carries no new information.
        if key != name:
            entities[key]['alias'] = name
        return

    entities[name] = entity

In [48]:
# write json(complete)

# Build the entity dictionary from the recognised names and save it as JSON.
# merg_or_append records a name that differs only by case from an existing key
# as that entry's alias instead of adding a new entry.
names = ["A1","B2","C1","A2","a1","c1"]
# Named entity_type rather than `type` so the builtin type() is not shadowed
# for every cell that runs afterwards in the same kernel.
entity_type = "disease"
entities = {}

for name in names:
    # A fresh dict per name, so entries never share one mutable object.
    merg_or_append(entities, name, {"type": entity_type})

with open("./ner_output/test_w.json","w") as file:
    json.dump(entities,file)


In [9]:
# write json(test)

# Minimal write test: dump a single entity dict to the output file.
entity = {
    "name":"A1",
    # "type" matches the key used for entities elsewhere in this notebook.
    "type":"disease"
}
with open("./ner_output/test_w.json","w") as file:
    json.dump(entity,file)