In [1]:
from bs4 import BeautifulSoup
import os
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, BasicTokenizer, BertTokenizerFast
import torch.optim as optim
from tqdm import tqdm
from IPython.core.debugger import set_trace

In [2]:
# Data and model locations. Each root directory can be overridden through an
# environment variable so the notebook also runs on other machines; when the
# variable is unset, the original path is used.
ner_data_dir = os.environ.get("NER_DATA_DIR", "/home/wangyucheng/opt/data/competition/NLP/ner_research")
genia_path = os.path.join(ner_data_dir, "GENIA_term_3.02", "GENIAcorpus3.02.xml")
pretrained_model_home = os.environ.get("PRETRAINED_MODEL_HOME", "/home/wangyucheng/opt/transformers_models_h5")

In [3]:
# Build the fast BERT tokenizer from the local "distilbert-base-cased"
# directory, with lower-casing switched off.
model_path = os.path.join(pretrained_model_home, "distilbert-base-cased")
tokenizer = BertTokenizerFast.from_pretrained(
    model_path,
    add_special_tokens=False,
    do_lower_case=False,
)

# Genia

In [4]:
# Parse the corpus file with BeautifulSoup's "lxml" parser. The `with` block
# closes the file handle as soon as the document has been parsed.
with open(genia_path, "r", encoding = "utf-8") as genia_file:
    soup = BeautifulSoup(genia_file, "lxml")

In [5]:
# Select every <article> element that is a direct child of a <set> element.
article_selector = "set > article"
article_list = soup.select(article_selector)

In [6]:
# Display how many articles were selected from the corpus.
len(article_list)

2000

In [7]:
# Maximum token count per article, where an article is its title and its
# abstract joined by a newline.
max_tok_num = 0
for art in tqdm(article_list):
    # Each article is expected to carry exactly one <title> ...
    title_tags = art.select("title")
    assert len(title_tags) == 1
    # ... and exactly one <abstract>.
    abstract_tags = art.select("abstract")
    assert len(abstract_tags) == 1

    article_text = "\n".join((title_tags[0].get_text(), abstract_tags[0].get_text()))
    article_tok_num = len(tokenizer.tokenize(article_text))
    if article_tok_num > max_tok_num:
        max_tok_num = article_tok_num
max_tok_num

100%|██████████| 2000/2000 [00:12<00:00, 161.24it/s]


986

In [8]:
# Maximum token count of a single <sentence>, over all articles.
sentence_tok_nums = (
    len(tokenizer.tokenize(sen.get_text()))
    for art in tqdm(article_list)
    for sen in art.select("sentence")
)
# default=0 keeps the result at 0 when no sentence is found at all.
max_tok_num = max(sentence_tok_nums, default=0)
max_tok_num

100%|██████████| 2000/2000 [00:09<00:00, 213.96it/s]


373

In [9]:
# Maximum token count of an article title.
max_tok_num = 0
for art in tqdm(article_list):
    # Each article is expected to carry exactly one <title>.
    title_tags = art.select("title")
    assert len(title_tags) == 1

    title_tokens = tokenizer.tokenize(title_tags[0].get_text())
    max_tok_num = max(len(title_tokens), max_tok_num)
max_tok_num

100%|██████████| 2000/2000 [00:04<00:00, 425.67it/s]


115

In [10]:
# Maximum token count of a single term (<cons> element), over all articles.
term_tok_nums = (
    len(tokenizer.tokenize(cons.get_text()))
    for art in tqdm(article_list)
    for cons in art.select("cons")
)
# default=0 keeps the result at 0 when no term is found at all.
max_tok_num_term = max(term_tok_nums, default=0)
max_tok_num_term

100%|██████████| 2000/2000 [00:10<00:00, 196.94it/s]


58

In [11]:
# Check that entity boundaries carry no extra spaces: collect the text of
# every <cons> that starts or ends with a space. Offenders are gathered in a
# list (rather than hitting a breakpoint) so the cell runs unattended; an
# empty list means the check passed.
space_padded_terms = []
for art in tqdm(article_list):
    for cons in art.select("cons"):
        text = cons.get_text()
        # startswith/endswith are safe on an empty string, unlike text[0].
        if text.startswith(" ") or text.endswith(" "):
            space_padded_terms.append(text)
space_padded_terms

100%|██████████| 2000/2000 [00:05<00:00, 357.92it/s]


In [12]:
# Check whether several tokens map to one and the same character span, as
# happens with Korean text for example.
# Result: there are none, so mapping char spans to token spans is simple:
# annotate char spans first, then token spans.
# Offending tokens are collected as (article index, token index, span) so the
# cell runs unattended; an empty list means no duplicates were found.
duplicate_span_tokens = []
for art_ind, art in enumerate(tqdm(article_list)):
    art_text = art.get_text()
    offset_map = tokenizer.encode_plus(art_text,
                                       return_offsets_mapping = True,
                                       add_special_tokens = False)["offset_mapping"]
    # Pair each token's span with the span of the token right before it.
    for ind, (prev_sp, sp) in enumerate(zip(offset_map, offset_map[1:]), start = 1):
        if sp[0] == prev_sp[0] and sp[1] == prev_sp[1]:
            duplicate_span_tokens.append((art_ind, ind, tuple(sp)))
duplicate_span_tokens

100%|██████████| 2000/2000 [00:04<00:00, 494.75it/s]


In [14]:
# Have a look at the `sem` attribute: collect every <cons> whose `sem` value
# names more than one distinct "G#..." class. Findings are stored as
# (sem value, term text) pairs so the cell runs unattended; an empty list
# means no such term exists.
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
sem_class_pattern = re.compile(r"G#[^\s()]+")
multi_class_terms = []
for art in tqdm(article_list):
    for cons in art.select("cons"):
        if "sem" not in cons.attrs:
            continue
        sem_text = cons["sem"]
        sem_set = set(sem_class_pattern.findall(sem_text))
        if len(sem_set) > 1:
            multi_class_terms.append((sem_text, cons.get_text()))
multi_class_terms

100%|██████████| 2000/2000 [00:05<00:00, 351.46it/s]


### Findings

1. 嵌套实体可能不以空格为边界，例如 deltaNFkappaBdeltaSpl234 整体是一个目标实体，其内部的 deltaNFkappaB 和 deltaSpl234 也都是目标实体。