# 1 load data

In [1]:
import imp
data_process=imp.load_source('data_process','../data_process.py')
from data_process import get_xml_data

In [2]:
train_sents,train_contexts,train_labels=get_xml_data('../SMP2019/SMP2019_ECISA_Train.xml')
validation_sents,validation_contexts,validation_labels=get_xml_data('../SMP2019/SMP2019_ECISA_Dev.xml')

In [3]:
from transformers import BertModel,BertTokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
bert_name='hfl/chinese-bert-wwm'

In [5]:
bert = BertModel.from_pretrained(bert_name, return_dict=False)
tokenizer = BertTokenizer.from_pretrained(bert_name)

In [6]:
from keras.preprocessing.sequence import pad_sequences
import torch

Using TensorFlow backend.


In [7]:
def get_bertTensor(text_list,MAX_LEN = 128):
    words_idx = []
    for sent in text_list:
        encoded_sent = tokenizer.encode(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = MAX_LEN,          # Truncate all sentences.
                            #return_tensors = 'pt',     # Return pytorch tensors.
                       )
        
        words_idx.append(encoded_sent)
    
    words_idx=pad_sequences(words_idx, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
    words_masks=[]
    for sent in words_idx:
        att_mask = [int(token_id > 0) for token_id in sent]
        words_masks.append(att_mask)
        
    return torch.tensor(words_idx),torch.tensor(words_masks)

In [8]:
train_sents,train_sents_masks=get_bertTensor(train_sents)
train_contexts,train_contexts_masks=get_bertTensor(train_contexts,MAX_LEN=256)
train_labels=torch.tensor(train_labels)

validation_sents,validation_sents_masks=get_bertTensor(validation_sents)
validation_contexts,validation_contexts_masks=get_bertTensor(validation_contexts,MAX_LEN=256)
validation_labels=torch.tensor(validation_labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [10]:
batch_size = 6
train_data = TensorDataset(train_sents,train_sents_masks,train_contexts,train_contexts_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_sents,validation_sents_masks,validation_contexts,
                                validation_contexts_masks,validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# 2 AEN_Bert

In [11]:
from layers.dynamic_rnn import DynamicLSTM
from layers.squeeze_embedding import SqueezeEmbedding
from layers.attention import Attention, NoQueryAttention
from layers.point_wise_feed_forward import PositionwiseFeedForward
import torch
import torch.nn as nn
import torch.nn.functional as F

In [12]:
class AEN_BERT(nn.Module):
    def __init__(self,bert):
        super(AEN_BERT, self).__init__()

        dropout=0.1
        bert_dim=768    
        hidden_dim=300
        polarities_dim=3
        
        self.drop_path_prob=0.0
        self.bert=bert
        self.dropout = nn.Dropout(dropout)
        self.attn_k = Attention(bert_dim, out_dim=hidden_dim, n_head=8, score_function='mlp', dropout=dropout)
        self.attn_q = Attention(bert_dim, out_dim=hidden_dim, n_head=8, score_function='mlp', dropout=dropout)
        self.ffn_c = PositionwiseFeedForward(hidden_dim, dropout=dropout)
        self.ffn_t = PositionwiseFeedForward(hidden_dim, dropout=dropout)

        self.attn_s1 = Attention(hidden_dim, n_head=8, score_function='mlp', dropout=dropout)
        self.dense = nn.Linear(hidden_dim*3, polarities_dim)
        self.softmax=nn.Softmax(dim=1)
        
    def forward(self, inputs):
#         context, target = inputs[0], inputs[1]
        context_len=128
        target_len=128
        
        target,_=bert(a_input_ids, token_type_ids=None, attention_mask=a_input_mask,)
        context,_=bert(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,)
#         context_len = torch.sum(context != 0, dim=-1)
#         target_len = torch.sum(target != 0, dim=-1)
#         context = self.squeeze_embedding(context, context_len)
#         context, _ = self.bert(context)
        context = self.dropout(context)
        target = self.dropout(target)

        hc, _ = self.attn_k(context, context)
        hc = self.ffn_c(hc)
        ht, _ = self.attn_q(context, target)
        ht = self.ffn_t(ht)

        s1, _ = self.attn_s1(hc, ht)

        hc_mean = torch.div(torch.sum(hc, dim=1), context_len)
        ht_mean = torch.div(torch.sum(ht, dim=1), target_len)
        s1_mean = torch.div(torch.sum(s1, dim=1), context_len)

        x = torch.cat((hc_mean, s1_mean, ht_mean), dim=-1)
        out=self.softmax(self.dense(x))
#         out = torch.argmax(self.softmax(self.dense(x)),dim=1)
        return out

# 3 Train

In [13]:
import random
import sys
import os
import time
import logging
import datetime
from sklearn.metrics import classification_report

In [14]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))

In [15]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"
# model = torch.nn.DataParallel(model, device_ids=[0,2,3]).cuda()

In [16]:
if torch.cuda.is_available():     
    device = torch.device("cuda:0")#select gpu

In [17]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [18]:
# bert.cuda()
model=AEN_BERT(bert)
model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [19]:
loss_values = []
epochs=4
for epoch_i in range(epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    
    t0 = time.time()
    total_loss=0
    model.train()
    n_correct, n_total, loss_total = 0, 0, 0
    for step,batch in enumerate(train_dataloader):

        a_input_ids  = batch[0].to(device)
        a_input_mask = batch[1].to(device)
        b_input_ids  = batch[2].to(device)
        b_input_mask = batch[3].to(device)
        labels       = batch[4].to(device)
        
        optimizer.zero_grad()
            
        inputs=a_input_ids,a_input_mask,b_input_ids,b_input_mask 
        predict=model(inputs)
        loss=criterion(predict,labels)
        
        total_loss+=loss
        loss.backward()
        optimizer.step()
        
        n_correct += (torch.argmax(predict, -1) == labels).sum().item()
        n_total += len(predict)
        loss_total += loss.item() * len(predict)
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))        
            train_acc = n_correct / n_total
            train_loss = loss_total / n_total
            logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc))   
                
        
    


Training...




  Batch    40  of  2,463.    Elapsed: 0:00:13.
loss: 1.0116, acc: 0.5366
  Batch    80  of  2,463.    Elapsed: 0:00:25.
loss: 0.9801, acc: 0.5535
  Batch   120  of  2,463.    Elapsed: 0:00:37.
loss: 0.9584, acc: 0.5799
  Batch   160  of  2,463.    Elapsed: 0:00:49.
loss: 0.9382, acc: 0.6004
  Batch   200  of  2,463.    Elapsed: 0:01:01.
loss: 0.9351, acc: 0.6036
  Batch   240  of  2,463.    Elapsed: 0:01:13.
loss: 0.9322, acc: 0.6051
  Batch   280  of  2,463.    Elapsed: 0:01:25.
loss: 0.9258, acc: 0.6139
  Batch   320  of  2,463.    Elapsed: 0:01:37.
loss: 0.9202, acc: 0.6210
  Batch   360  of  2,463.    Elapsed: 0:01:49.
loss: 0.9147, acc: 0.6265
  Batch   400  of  2,463.    Elapsed: 0:02:01.
loss: 0.9146, acc: 0.6276
  Batch   440  of  2,463.    Elapsed: 0:02:14.
loss: 0.9099, acc: 0.6327
  Batch   480  of  2,463.    Elapsed: 0:02:26.
loss: 0.9084, acc: 0.6355
  Batch   520  of  2,463.    Elapsed: 0:02:38.
loss: 0.9108, acc: 0.6318
  Batch   560  of  2,463.    Elapsed: 0:02:50.
loss