In [95]:
import torch
import torch.nn as nn
from crf import CRF
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import DistilBertModel

import constant as config
class BERT_LSTM_Joint(nn.Module):
    """Joint intent classifier and entity tagger: BERT encoder -> BiLSTM -> CRF.

    The encoder output feeds two heads: a linear intent classifier on the
    pooled sentence vector, and a BiLSTM + linear + CRF stack that tags every
    token position with an entity label.
    """

    def __init__(self, config, bert=None, distill=False):
        """Build the encoder and both task heads.

        Args:
            config: object exposing ``dropout``, ``hidden_size``, ``num_entity``
                and ``num_intent`` (this parameter shadows the module-level
                ``config`` import inside this method).
            bert: optional, already constructed encoder. When ``None`` a
                pretrained encoder is loaded here.
            distill: when true, the encoder is treated as DistilBERT (called
                without ``token_type_ids``) and, if ``bert`` is ``None``,
                'monologg/distilkobert' is loaded; otherwise SKT KoBERT is used.
        """
        super(BERT_LSTM_Joint, self).__init__()

        self.bert = bert
        self.distill = distill
        # Only the KoBERT loader returns a vocabulary; define the attribute on
        # every path so reading `model.vocab` never raises AttributeError.
        self.vocab = None

        # If no BERT model is supplied, fall back to a pretrained default
        # (SKT KoBERT, or DistilKoBERT when `distill` is set).
        if bert is None:
            if self.distill:
                self.bert = DistilBertModel.from_pretrained('monologg/distilkobert')
            else:
                self.bert, self.vocab = get_pytorch_kobert_model()

            # Make sure the freshly loaded encoder is fine-tuned with the heads.
            for param in self.bert.parameters():
                param.requires_grad = True

        self.dropout = nn.Dropout(config.dropout)
        self.crf_linear = nn.Linear(config.hidden_size, config.num_entity)
        self.intent_classifier = nn.Linear(config.hidden_size, config.num_intent)
        # Each direction gets hidden_size // 2 units, so the concatenated
        # BiLSTM output matches the input width expected by `crf_linear`
        # (exactly so when hidden_size is even).
        self.bilstm = nn.LSTM(config.hidden_size, config.hidden_size // 2,
                              batch_first=True, bidirectional=True)
        self.crf = CRF(num_tags=config.num_entity, batch_first=True)

    def get_attention_mask(self, input_ids, valid_length):
        """Build a mask so that attention only covers the real sentence length.

        Args:
            input_ids: tensor whose row ``i`` holds the token ids of sequence ``i``.
            valid_length: iterable giving, per row, how many leading positions
                are real tokens.

        Returns:
            A float tensor shaped like ``input_ids`` with 1.0 on the first
            ``valid_length[i]`` positions of row ``i`` and 0.0 elsewhere.
        """
        attention_mask = torch.zeros_like(input_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, input_ids, valid_length, token_type_ids, entity=None, intent=None):
        """Run the encoder and both heads.

        Args:
            input_ids: token id tensor; cast to ``long`` before encoding.
            valid_length: per-sequence number of real tokens (see
                ``get_attention_mask``).
            token_type_ids: segment ids; only forwarded to the encoder when
                ``distill`` is false.
            entity: gold entity tags. When given, the CRF score for these tags
                is returned (training mode).
            intent: accepted for interface compatibility; not used here.

        Returns:
            ``(log_likelihood, tag_seq, logits)`` when ``entity`` is given,
            otherwise ``(tag_seq, confidence, logits)``. ``logits`` are the
            intent scores, ``tag_seq`` is the CRF decoding and
            ``log_likelihood`` is whatever ``self.crf(linear, entity)`` returns.
        """
        attention_mask = self.get_attention_mask(input_ids, valid_length)

        # all_encoder_layers is the BERT output.
        if self.distill:
            outputs = self.bert(input_ids=input_ids.long(),
                                attention_mask=attention_mask)  # (batch, maxlen, hidden)
            all_encoder_layers = outputs[0]
            # No separate pooled vector is taken from this encoder; the hidden
            # state at position 0 stands in for it.
            pooled_output = all_encoder_layers[:, 0, :]
        else:
            all_encoder_layers, pooled_output = self.bert(input_ids=input_ids.long(),
                                                          token_type_ids=token_type_ids,
                                                          attention_mask=attention_mask)

        # Intent head on the sentence-level vector.
        logits = self.intent_classifier(self.dropout(pooled_output))

        # Entity head: dropout -> BiLSTM -> per-token emission scores -> CRF.
        drop = self.dropout(all_encoder_layers)
        output, _ = self.bilstm(drop)  # final LSTM state is not needed
        linear = self.crf_linear(output)
        # NOTE(review): the CRF is called without the attention mask, so padded
        # positions presumably take part in decoding and in the likelihood --
        # confirm whether `crf.CRF` accepts a mask argument.
        tag_seq = self.crf.decode(linear)

        # For training
        if entity is not None:
            log_likelihood = self.crf(linear, entity)
            return log_likelihood, tag_seq, logits

        # For inference
        confidence = self.crf.compute_confidence(linear, tag_seq)
        return tag_seq, confidence, logits
               

In [74]:
# Scratch 10x10 all-zero tensor used to try out the row-wise mask fill.
mask = torch.zeros(10,10)

In [79]:
# Set the top-left length x length corner of `mask` to one with a single
# slice assignment (rows 0..length-1, columns 0..length-1).
length = 8
mask[:length, :length] = 1

In [80]:
mask

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [87]:
>>> from tokenization_kobert import KoBertTokenizer
>>> tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert') # the same works for monologg/distilkobert
>>> tokenizer.tokenize("[CLS] 한국어 모델을 공유합니다. [SEP]")
['[CLS]', '▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.', '[SEP]']
>>> tokenizer.convert_tokens_to_ids(['[CLS]', '▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.', '[SEP]'])
[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]

>>> from tokenization_kobert import KoBertTokenizer
>>> tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert') # the same works for monologg/distilkobert
>>> tokenizer.tokenize("[CLS] 한국어 모델을 공유합니다. [SEP]")
['[CLS]', '▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.', '[SEP]']
>>> tokenizer.convert_tokens_to_ids(['[CLS]', '▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.', '[SEP]'])


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=371391.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=77779.0), HTML(value='')))




[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]

In [91]:
# Build a batch of three identical id sequences by converting the example
# sentence's tokens once per row.
batch = [
    tokenizer.convert_tokens_to_ids(['[CLS]', '▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.', '[SEP]'])
    for _ in range(3)
]

In [94]:
# Turn the list of equal-length id lists into one tensor, one row per sequence.
input_ids= torch.tensor(batch)

In [100]:
def get_attention_mask(input_ids, valid_length):
    """Return a float mask marking the real (non-padding) positions of each row.

    Args:
        input_ids: tensor whose row ``i`` holds the token ids of sequence ``i``.
        valid_length: iterable giving, per row, how many leading positions
            count as real tokens.

    Returns:
        A float tensor shaped like ``input_ids`` holding 1.0 on the first
        ``valid_length[i]`` positions of row ``i`` and 0.0 everywhere else.
    """
    mask = torch.zeros_like(input_ids)
    for row_index, num_valid in enumerate(valid_length):
        row = mask[row_index]  # view into `mask`, so the write below lands in it
        row[:num_valid] = 1
    return mask.to(torch.float32)



In [104]:
# Mark the first 4, 6 and 7 positions of the three rows as valid; the rest stay 0.
# Note: this rebinds the name `mask` used by the earlier scratch cells.
mask = get_attention_mask(input_ids,[4,6,7])

In [103]:
# Load the pretrained encoder published as 'monologg/distilkobert'.
distill_Bert= DistilBertModel.from_pretrained('monologg/distilkobert')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=113629967.0), HTML(value='')))




In [115]:
# Run the encoder with the mask, take its first output and keep the vector at
# position 0 of every sequence (the same slice the model class uses as its pooled vector).
distill_Bert(input_ids,mask)[0][:,0,:]

tensor([[ 0.4385,  0.1492,  0.4601,  ..., -0.6982, -0.5436, -0.4415],
        [ 0.3146,  0.2004,  0.6604,  ..., -0.5571, -0.1527, -0.3555],
        [ 0.3893,  0.2564,  0.4730,  ..., -0.4671, -0.4400, -0.4779]],
       grad_fn=<SliceBackward>)