In [2]:
! pip install transformers jaconv neologdn tensorflow mecab-python TorchCRF==1.1.0

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting jaconv
  Downloading jaconv-0.3.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting neologdn
  Downloading neologdn-0.5.1.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting tensorflow
  Downloading tensorflow-2.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting mecab-python
  Downloading mecab-python-1.0.0.tar.gz (1.3 kB)


In [1]:
import torch
from transformers import BertJapaneseTokenizer
from UTH_BERT.preprocess_text import preprocess as my_preprocess
from UTH_BERT.tokenization_mod import MecabTokenizer, FullTokenizerForMecab

In [2]:
class BERT_CRF_NER(torch.nn.Module):
    """Sequence tagger: BERT encoder -> linear layer -> CRF Viterbi decoding.

    NOTE(review): ``__init__`` calls ``BertModel.from_pretrained()``,
    ``torch.nn.Linear()`` and ``CRF()`` without arguments, and neither
    ``BertModel`` nor ``CRF`` is imported in the visible cells. The class
    presumably exists only so that a pickled model can be restored with
    ``torch.load`` -- confirm before instantiating it directly.
    """

    def __init__(self, config):
        """Create the three sub-modules. ``config`` is accepted but never read."""
        super().__init__()
        self.bert = BertModel.from_pretrained()
        self.classifier = torch.nn.Linear()
        self.crf = CRF()

    def forward(self, input_ids, attention_mask, token_type_ids=None, label=None):
        """Return the label sequences produced by ``self.crf.viterbi_decode``.

        Args:
            input_ids: token ids, forwarded to ``self.bert``.
            attention_mask: forwarded to ``self.bert``; also used (minus its
                first column) as the mask handed to the CRF decoder.
            token_type_ids: optional, forwarded to ``self.bert``.
            label: unused by this method.

        Returns:
            Whatever ``self.crf.viterbi_decode`` returns for the batch.
        """
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # (batch_size, sequence_size, num_labels)
        emissions = self.classifier(encoder_out[0])
        # Sequence position 0 is sliced off both the scores and the mask before
        # decoding, so the returned labels start at the second input position.
        return self.crf.viterbi_decode(emissions[:, 1:, :], attention_mask[:, 1:])

In [3]:
class Inference:
    """Run a BERT+CRF tagger on raw texts and return the tokens it tags.

    Pipeline used by :meth:`inference`: ``my_preprocess`` -> MeCab/word-piece
    tokenization (``self.mecab_tokenizer``) -> id encoding with the
    ``BertJapaneseTokenizer`` (``self.tokenizer``) -> ``self.model``.

    Attributes:
        neologd_path: path handed to both tokenizers.
        tokenizer_path: location passed to ``BertJapaneseTokenizer.from_pretrained``.
        device: ``cuda:0`` when CUDA is available, otherwise ``cpu``.
        model: the tagging model, moved to ``device`` and set to eval mode.
        mecab_tokenizer: ``FullTokenizerForMecab`` instance.
        tokenizer: ``BertJapaneseTokenizer`` instance.
    """

    def __init__(self, tokenizer_path, neologd_path, BERT_CRFModel):
        """Store the paths, prepare the model for inference, build both tokenizers.

        Args:
            tokenizer_path: passed to ``BertJapaneseTokenizer.from_pretrained``.
            neologd_path: NEologd dictionary path used by both tokenizers.
            BERT_CRFModel: a ``torch.nn.Module`` whose call returns one label
                sequence per input sentence.
        """
        self.neologd_path = neologd_path
        self.tokenizer_path = tokenizer_path
        self.device = self.init_device()
        # Inputs are moved to `self.device` in `inference`, so the model must
        # live there too. eval() disables dropout so predictions are
        # deterministic; a restored model keeps whatever mode it was saved in.
        self.model = BERT_CRFModel.to(self.device)
        self.model.eval()
        self.mecab_tokenizer = self.init_mecab_tokenizer()
        self.tokenizer = self.init_tokenizer()

    def init_device(self):
        """Return ``cuda:0`` if CUDA is available, else the CPU device."""
        return torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def init_mecab_tokenizer(self):
        """Build the ``FullTokenizerForMecab`` used to split text into word pieces.

        The medical dictionary and vocabulary file are read from fixed paths
        relative to the current working directory.
        """
        name_token = "＠＠Ｎ"
        mecab_J_medic = './MANBYO_201907_Dic-utf8.dic'
        vocab_file = "./bert_vocab_mc_v1_25000.txt"
        sub_tokenizer = MecabTokenizer(mecab_ipadic_neologd=self.neologd_path,
                                       mecab_J_medic=mecab_J_medic,
                                       name_token=name_token)
        return FullTokenizerForMecab(sub_tokenizer=sub_tokenizer,
                                     vocab_file=vocab_file,
                                     do_lower_case=False)

    # Backward-compatible alias: the factory used to be named `mecab_tokenizer`,
    # the same name as the instance attribute that overwrites it in __init__.
    mecab_tokenizer = init_mecab_tokenizer

    def init_tokenizer(self):
        """Load the ``BertJapaneseTokenizer`` from ``self.tokenizer_path``."""
        # NOTE(review): `mecab_option` is normally a MeCab option string
        # (e.g. '-d <dicdir>'); here the bare dictionary path is passed --
        # confirm this has the intended effect.
        return BertJapaneseTokenizer.from_pretrained(
            self.tokenizer_path,
            mecab_kwargs={"mecab_option": self.neologd_path}
        )

    def inference(self, texts, max_length=128):
        """Return, for each text, the tokens whose predicted label is 1 or 2.

        Args:
            texts: iterable of raw strings.
            max_length: padded/truncated sequence length passed to the id
                encoder (default 128, the previously hard-coded value).

        Returns:
            A list with one list of tokens per input text; ``[]`` for no input.

        NOTE(review): tokens come from ``self.mecab_tokenizer`` but ids come
        from ``self.tokenizer`` re-encoding those tokens. The probe cells in
        this notebook record extra ids appearing when '##'-prefixed pieces are
        re-encoded, so label positions may drift from token positions for such
        inputs -- confirm the intended encoding path.
        """
        texts = list(texts)
        if not texts:
            return []

        pre_processed_texts = [my_preprocess(t) for t in texts]
        tokens = [self.mecab_tokenizer.tokenize(p) for p in pre_processed_texts]
        inputs = self.tokenizer.batch_encode_plus(
            tokens,
            return_tensors='pt',
            truncation='only_first',
            is_split_into_words=True,
            padding='max_length',
            max_length=max_length,
        )

        with torch.no_grad():
            pred_labels = self.model(**inputs.to(self.device))

        # The model also labels positions after the last word piece (the
        # recorded run shows 5 labels for 4 pieces), so pairing with zip stops
        # at the shorter sequence instead of indexing past the end of `tokens`.
        # Labels 1 and 2 are the ones kept (presumably the entity tags --
        # confirm against the training label map).
        return [
            [token for token, label in zip(sentence_tokens, sentence_labels)
             if label == 1 or label == 2]
            for sentence_tokens, sentence_labels in zip(tokens, pred_labels)
        ]

In [28]:
# Locations of the external resources this notebook depends on.
NEOLOGD = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
# NOTE(review): the path starts with '.UTH_BERT/' (a hidden directory) while the
# imports use a package named `UTH_BERT` -- confirm './UTH_BERT/...' was not intended.
UTH_BERT='.UTH_BERT/UTH_BERT_BASE_512_MC_BPE_WWM_V25000_352K'

# torch.load unpickles the file, which can execute arbitrary code: only load
# checkpoints you trust. map_location lets a model that was saved on a GPU be
# restored on a CPU-only host, mirroring the device choice made by Inference.
BERT_CRFModel = torch.load(
    'results/run_2023.1.27_40578/model/model_40578.pt',
    map_location='cuda:0' if torch.cuda.is_available() else 'cpu',
)

# Later cells refer to the *instance* by the name `Inference`, so that binding
# is kept. Resolving the class first keeps this cell re-runnable: on a second
# run `Inference` is already an instance and calling it directly would fail.
_inference_cls = Inference if isinstance(Inference, type) else type(Inference)
Inference = _inference_cls(tokenizer_path=UTH_BERT, neologd_path=NEOLOGD, BERT_CRFModel=BERT_CRFModel)

In [29]:
# Smoke test: run the tagger on a few sample sentences.
# The sample '放射線肺臓炎' is left disabled, as in the original cell.
texts = [
    '胸水は徐々に増加していき、呼吸困難あり。胸腔穿刺ドレナージ(1600ML)実施。',
    '胃腸炎を併発しており',
    '嘔気と嘔吐を主訴に緊急受診',
]
Inference.inference(texts)

[['胸水', '呼吸困難'], ['胃腸炎'], ['嘔気', '嘔吐']]

In [20]:
# Look up the vocabulary ids of '##'-prefixed word pieces with the MeCab-based tokenizer.
Inference.mecab_tokenizer.convert_tokens_to_ids(
    ['放射線', '##肺', '##臓', '##炎']
)

[1498, 1628, 16196, 5778]

In [46]:
# Feed a hand-built, unpadded batch straight to the model. The ids are the ones
# recorded for ['放射線', '肺', '臓', '炎'] in the tokenizer probe cell.
inputs = {
    'input_ids': torch.tensor([[2, 1498, 500, 21464, 2394, 3]]),
    'token_type_ids': torch.tensor([[0, 0, 0, 0, 0, 0]]),
    'attention_mask': torch.tensor([[1, 1, 1, 1, 1, 1]]),
}
# Use the device the Inference object selected instead of a hard-coded 'cuda',
# so the cell also runs on a CPU-only host.
inputs = {name: tensor.to(Inference.device) for name, tensor in inputs.items()}
with torch.no_grad():
    pred_labels = Inference.model(**inputs)
pred_labels

[[1, 2, 2, 0, 0]]

In [33]:
# Probe: encode the pieces WITHOUT any '#' prefix (compare with ['放射線', '##肺', '##臓', '##炎'])
# and display the first 20 ids of the padded sequence.
inputs = Inference.tokenizer.batch_encode_plus(
    [['放射線', '肺', '臓', '炎']],
    is_split_into_words=True,
    truncation='only_first',
    padding='max_length',
    max_length=128,
    return_tensors='pt',
)
inputs['input_ids'][:, :20]

tensor([[    2,  1498,   500, 21464,  2394,     3,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [21]:
# Probe: encode the pieces with a single '#' prefix (compare with ['放射線', '##肺', '##臓', '##炎'])
# and display the first 20 ids. In the recorded run an extra id 1 appears before
# each prefixed piece.
inputs = Inference.tokenizer.batch_encode_plus(
    [['放射線', '#肺', '#臓', '#炎']],
    is_split_into_words=True,
    truncation='only_first',
    padding='max_length',
    max_length=128,
    return_tensors='pt',
)
inputs['input_ids'][:, :20]

tensor([[    2,  1498,     1,   500,     1, 21464,     1,  2394,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [22]:
# Probe: encode the '##'-prefixed word pieces as produced by the MeCab-based
# tokenizer and display the first 20 ids. In the recorded run two extra ids of
# value 1 appear before each prefixed piece.
inputs = Inference.tokenizer.batch_encode_plus(
    [['放射線', '##肺', '##臓', '##炎']],
    is_split_into_words=True,
    truncation='only_first',
    padding='max_length',
    max_length=128,
    return_tensors='pt',
)
inputs['input_ids'][:, :20]

tensor([[    2,  1498,     1,     1,   500,     1,     1, 21464,     1,     1,
          2394,     3,     0,     0,     0,     0,     0,     0,     0,     0]])

In [23]:
# Tokenize one sample sentence with the MeCab-based tokenizer.
Inference.mecab_tokenizer.tokenize(
    '胃腸炎を併発しており'
)

['胃腸炎', 'を', '併発', 'し', 'て', 'おり']

In [24]:
# Vocabulary ids of the sample sentence's tokens, via the MeCab-based tokenizer.
Inference.mecab_tokenizer.convert_tokens_to_ids(
    ['胃腸炎', 'を', '併発', 'し', 'て', 'おり']
)

[12423, 25, 8928, 16, 13, 78]

In [25]:
# Encode the same sentence tokens with the BertJapaneseTokenizer and display the
# first 20 ids, for comparison with the ids from the MeCab-based tokenizer.
inputs = Inference.tokenizer.batch_encode_plus(
    [['胃腸炎', 'を', '併発', 'し', 'て', 'おり']],
    is_split_into_words=True,
    truncation='only_first',
    padding='max_length',
    max_length=128,
    return_tensors='pt',
)
inputs['input_ids'][:, :20]

tensor([[    2, 12423,    25,  8928,    16,    13,    78,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])