<a href="https://colab.research.google.com/github/Foruck/sentiment-analysis-demo/blob/master/bert_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
from google.colab import files

# Open Colab's upload widget; the result is iterated below as a mapping from
# file name to an object whose length is reported in bytes.
uploaded = files.upload()

# Echo one confirmation line per uploaded file.
for fn, payload in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'
  print(message.format(name=fn, length=len(payload)))

Saving cn_train_data.h5 to cn_train_data.h5
Saving cn_valid_data.h5 to cn_valid_data.h5
Saving en_train_data.h5 to en_train_data.h5
Saving en_valid_data.h5 to en_valid_data.h5
User uploaded file "cn_train_data.h5" with length 47004264 bytes
User uploaded file "cn_valid_data.h5" with length 24217616 bytes
User uploaded file "en_train_data.h5" with length 18164752 bytes
User uploaded file "en_valid_data.h5" with length 9358904 bytes


In [0]:
import sys
import os
import datetime
import logging
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.utils.data
import h5py
import numpy as np
from torch.autograd import Variable
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertAdam

In [0]:
# Width of the hidden/cell state of each LSTM cell.
LSTM_HIDDEN_SIZE = 256
# Mixed-case alias: keeps code that spells the constant `LSTM_Hidden_Size`
# working instead of failing with a NameError.
LSTM_Hidden_Size = LSTM_HIDDEN_SIZE
# Size of one token embedding fed to the convolution and LSTM layers.
embedding_length = 768
# NOTE(review): presumably the padded token length of every sentence in the
# .h5 files; it is not referenced by the visible code - confirm before relying on it.
Sentence_Max_Length = 128

In [0]:
class bertDataset(torch.utils.data.Dataset):
  """Dataset that loads three arrays from an HDF5 file fully into memory.

  The file must contain the datasets ``data`` and ``mask`` (each read with a
  2-D slice) and ``annot`` (read with a 1-D slice). Sample ``i`` is the triple
  ``(data[i], mask[i], annot[i])``.
  """

  def __init__(self, data_path):
    """Read ``data``, ``mask`` and ``annot`` from the HDF5 file at ``data_path``."""
    with h5py.File(data_path, 'r') as h5_file:
      # Slicing copies the contents into memory, so the arrays remain usable
      # after the file is closed.
      data_node, mask_node, annot_node = h5_file['data'], h5_file['mask'], h5_file['annot']
      self.data = data_node[:, :]
      self.mask = mask_node[:, :]
      self.annot = annot_node[:]

  def __len__(self):
    """Number of samples, taken from the first axis of ``annot``."""
    sample_count = self.annot.shape[0]
    return sample_count

  def __getitem__(self, idx):
    """Return the ``(data row, mask row, annotation)`` triple at position ``idx``."""
    data_row = self.data[idx, :]
    mask_row = self.mask[idx, :]
    label = self.annot[idx]
    return data_row, mask_row, label

      
class myBert(torch.nn.Module):
  """BERT encoder with a bidirectional LSTM head and a CNN head feeding one linear classifier.

  The last BERT layer is processed two ways:
    * two ``LSTMCell``s scan the sequence left-to-right and right-to-left; the
      hidden states of each direction are averaged over time;
    * for every entry of ``window`` a conv/pool stack reduces the sequence to a
      flat feature vector.
  Both feature sets are concatenated and mapped to ``classes`` logits.

  Args:
    embedding_length: size of one BERT token embedding.
    bert_path: name or path passed to ``BertModel.from_pretrained``.
    window: kernel heights (in tokens) of the first convolution, one CNN branch each.
    classes: number of output classes.
    use_cuda: stored on the instance for backward compatibility; ``forward``
      follows the device of the BERT output instead of consulting it.
  """

  def __init__(self, embedding_length=768, bert_path='bert-base-uncased', window=[7], classes=2, use_cuda=True):
    super(myBert, self).__init__()
    self.use_cuda = use_cuda
    self.num_filter = len(window)
    self.hidden_size = LSTM_HIDDEN_SIZE

    # Bert model
    self.bert = BertModel.from_pretrained(bert_path)

    # Convolution layers. The shapes in the comments assume 128-token inputs
    # and an odd window size (so conv1's padding preserves the length).
    conv1, conv2, conv3, conv4 = [], [], [], []
    for width in window:
      conv1.append(nn.Conv2d(1, 256, (width, embedding_length), stride=(1, 1), padding=((width - 1) // 2, 0))) # out n*256*128*1
      conv2.append(nn.Conv1d(256, 128, 3, stride=1, padding=1)) # out n*128*64
      conv3.append(nn.Conv1d(128, 64, 3, stride=1, padding=1)) # out n*64*32
      conv4.append(nn.Conv1d(64, 16, 1, stride=1, padding=0)) # out n*16*16
    self.conv1, self.conv2 = nn.ModuleList(conv1), nn.ModuleList(conv2)
    self.conv3, self.conv4 = nn.ModuleList(conv3), nn.ModuleList(conv4)
    for i in range(self.num_filter):
      init.kaiming_normal_(self.conv1[i].weight.data)
      init.kaiming_normal_(self.conv2[i].weight.data)
      init.kaiming_normal_(self.conv3[i].weight.data)
      init.kaiming_normal_(self.conv4[i].weight.data)
    self.dropout = nn.Dropout(p=0.5)

    # LSTM layers: lstm1 reads the sequence forwards, lstm2 backwards.
    self.lstm1 = nn.LSTMCell(embedding_length, self.hidden_size) # out n*hidden_size
    self.lstm2 = nn.LSTMCell(embedding_length, self.hidden_size)

    # FC layer. 256 per CNN branch = 16 channels * 16 positions left after the
    # three stride-2 poolings of a 128-token sequence; other sequence lengths
    # would produce a different width and fail in forward().
    self.fc = nn.Linear(self.hidden_size * 2 + 256 * self.num_filter, classes)
    init.kaiming_normal_(self.fc.weight.data)
    self.fc.bias.data.fill_(0)

  def forward(self, inputs, mask):
    """Compute class logits for a batch.

    Args:
      inputs: token-id tensor handed to BERT as its input ids.
      mask: attention mask handed to BERT. It is not applied to the LSTM or
        CNN heads, so padded positions contribute to their features.

    Returns:
      Tensor with one row of ``classes`` scores per sample.
    """
    # Last encoder layer only: n * seq_len * embedding_length.
    encoded = self.bert(inputs, token_type_ids=None, attention_mask=mask, output_all_encoded_layers=False)[0]
    n, seq_len = encoded.shape[0], encoded.shape[1]

    # Go through Bi-LSTM. States are allocated with new_zeros so they always
    # share the device and dtype of the BERT output.
    hidden = self.lstm1.hidden_size
    hx1, cx1 = encoded.new_zeros(n, hidden), encoded.new_zeros(n, hidden)
    hx2, cx2 = encoded.new_zeros(n, hidden), encoded.new_zeros(n, hidden)
    fwd_states, bwd_states = [], []
    for t in range(seq_len):
      hx1, cx1 = self.lstm1(encoded[:, t, :], (hx1, cx1))
      fwd_states.append(hx1)
      # The reverse scan uses its own cell (lstm2) so the two directions do
      # not share weights.
      hx2, cx2 = self.lstm2(encoded[:, seq_len - 1 - t, :], (hx2, cx2))
      bwd_states.append(hx2)
    # Average each direction's hidden states over time, then concatenate.
    lstm_x = torch.cat((torch.stack(fwd_states, 1).mean(1), torch.stack(bwd_states, 1).mean(1)), 1)

    # Go Through CNN: add a channel axis so Conv2d sees n * 1 * seq_len * emb.
    conv_in = encoded.unsqueeze(1)
    branches = []
    for i in range(self.num_filter):
      feat = self.conv1[i](conv_in).squeeze(3)
      feat = F.relu(F.max_pool1d(feat, kernel_size=2, stride=2))
      feat = self.conv2[i](feat)
      feat = F.relu(F.max_pool1d(feat, kernel_size=2, stride=2))
      feat = self.conv3[i](feat)
      feat = F.relu(F.max_pool1d(feat, kernel_size=2, stride=2))
      feat = self.conv4[i](feat)
      feat = feat.view(n, -1)
      branches.append(self.dropout(feat))

    cnn_x = torch.cat(branches, 1)
    x1 = torch.cat((lstm_x, cnn_x), 1)
    # NOTE(review): dropout is applied to the logits themselves (kept as in the
    # original); it is more common to apply it to the features before the FC layer.
    x1 = self.dropout(self.fc(x1))
    return x1

In [0]:
def train(model, train_loader, optimizer, logger, print_every=100):
  """Run one training pass over ``train_loader``.

  Args:
    model: module called as ``model(x, mask)`` and returning per-class scores.
    train_loader: iterable of ``(x, mask, target)`` tensor batches.
    optimizer: optimizer stepped once per batch.
    logger: logger receiving progress lines and the epoch summary.
    print_every: log the batch loss/accuracy every this many iterations.

  Returns:
    The ``(model, optimizer)`` pair that was passed in, after training.
  """
  model = model.train()
  loss_fn = nn.CrossEntropyLoss()
  # Batches follow the model's own device instead of assuming CUDA, so the
  # function also works for a CPU-resident model.
  device = next(model.parameters()).device
  all_loss, cnt = 0.0, 0
  last_iter = -1
  for i, (x, mask, target) in enumerate(train_loader):
    x = x.to(device).long()
    mask = mask.to(device).long()
    target = target.to(device).long()
    # Labels outside {0, 1} are squashed into the binary range.
    target = torch.clamp(target, min=0, max=1)
    scores = model(x, mask)
    loss = loss_fn(scores, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    tot = int(x.shape[0])
    loss_value = float(loss)
    if i % print_every == 0:
      # Batch accuracy is only needed for the progress line.
      hit = int(torch.sum(torch.argmax(scores, dim=1) == target))
      accuracy = float(hit) / tot
      logger.info('Iter %d, loss=%.4f, acc=%.4f' % (i, loss_value, accuracy))
    # Weight by batch size so the epoch loss is a per-sample average.
    all_loss += loss_value * tot
    cnt += tot
    last_iter = i

  if cnt == 0:
    # An empty loader would otherwise divide by zero and read an unbound index.
    logger.warning('train_loader yielded no samples; nothing was trained')
    return model, optimizer
  all_loss /= cnt
  logger.info('Iter %d, train_loss=%.4f' % (last_iter, all_loss))
  return model, optimizer

In [0]:
# Create the output folders for log files and saved checkpoints.
# exist_ok=True keeps the cell idempotent: re-running it (or Run All on a
# session where the folders already exist) no longer reports an error.
os.makedirs('log', exist_ok=True)
os.makedirs('bst_model', exist_ok=True)

In [0]:
# Run configuration: everything a reader might tune lives in this cell.
torch.set_default_tensor_type('torch.FloatTensor')
# Prefer the GPU, but fall back to the CPU so the notebook still starts on a
# runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
end_epoch = 30          # training stops before this epoch index
tag = 'en'              # dataset/language selector: 'en' or 'cn'
lr = 5e-5               # base learning rate handed to the optimizer
checkpoint = ''         # path of a saved state to resume from; '' starts fresh
# A unique job name (tag, learning rate, start time) names the outputs of this run.
job_name =  '_'.join([tag, str(lr), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')])
save_path = 'bst_model/' + job_name + '.pth'
log_path = 'log/' + job_name + '.log'
bz = 32                 # training batch size

In [0]:
# Per-language resources: (training file, validation file, pretrained BERT name).
LANGUAGE_RESOURCES = {
  'cn': ('cn_train_data.h5', 'cn_valid_data.h5', 'bert-base-chinese'),
  'en': ('en_train_data.h5', 'en_valid_data.h5', 'bert-base-uncased'),
}
if tag not in LANGUAGE_RESOURCES:
  # Fail loudly instead of leaving the loaders and the model undefined.
  raise ValueError('Unknown tag %r, expected one of %s' % (tag, sorted(LANGUAGE_RESOURCES)))
train_path, valid_path, bert_name = LANGUAGE_RESOURCES[tag]

train_set = bertDataset(train_path)
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=bz, shuffle=True, num_workers=8)
# Validation reads the held-out file; the aggregate metrics do not depend on
# sample order, so shuffling is unnecessary.
valid_set = bertDataset(valid_path)
valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=int(bz / 2), shuffle=False, num_workers=8)
myModel = myBert(embedding_length, bert_name, use_cuda=(device.type == 'cuda'))
myModel = myModel.to(device=device)

# NOTE(review): the original warm-up value is not visible in this notebook;
# 0.1 is the default used by the pytorch_pretrained_bert examples - confirm.
warmup_proportion = 0.1

# Two parameter groups: the task-specific heads train with the base learning
# rate and weight decay, the pretrained BERT weights with a much smaller rate.
bert_param_ids = set(map(id, myModel.bert.parameters()))
base_params = [p for p in myModel.parameters() if id(p) not in bert_param_ids]
optimizer_grouped_parameters = [{'params': base_params, 'weight_decay': 0.01}, {'params': myModel.bert.parameters(), 'weight_decay': 0.0, 'lr': 1e-6}]
# t_total is the number of optimizer steps of the whole run (batches per epoch
# times epochs), which is what the warm-up schedule is measured against.
total_steps = len(train_loader) * end_epoch
optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=warmup_proportion, t_total=total_steps)

# Restore a previous run only after the optimizer exists, so its state can be
# loaded as well.
if checkpoint != '':
  state = torch.load(checkpoint, map_location=device)
  myModel.load_state_dict(state['model_state'])
  optimizer.load_state_dict(state['optim_state'])
  # The checkpoint is written after its epoch has finished; continue with the next one.
  epoch = state['epoch'] + 1
  bst_acc = state['acc']
else:
  epoch = 0
  bst_acc = 0.0

In [0]:
# File logger for this run; every record is written to log_path with a timestamp.
logger = logging.getLogger(__name__)
logger.setLevel(level = logging.INFO)
# getLogger returns the same object on every call, so re-running this cell
# would stack another FileHandler and duplicate each line. Drop file handlers
# left over from earlier runs first.
for stale_handler in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
  logger.removeHandler(stale_handler)
  stale_handler.close()
handler = logging.FileHandler(log_path)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.info("Start print log")

In [0]:
# Main loop: one training pass, then one validation pass per epoch. The model
# and optimizer state are saved whenever validation accuracy improves on the
# best value seen so far (bst_acc).
loss_fn = nn.CrossEntropyLoss()
for i in range(epoch, end_epoch):
  # Logs the configured base rate; the optimizer's warm-up schedule may apply
  # a different effective rate.
  logger.info('=> Epoch %d, lr = %0.6f <=' % (i, lr))
  myModel, optimizer = train(myModel, train_loader, optimizer, logger, print_every=50)

  myModel = myModel.eval()
  hit, tot = 0, 0
  all_loss = 0.0
  # No gradients are needed for validation; disabling autograd saves memory.
  with torch.no_grad():
    for xv, maskv, targetv in valid_loader:
      xv = xv.to(device).long()
      maskv = maskv.to(device).long()
      targetv = targetv.to(device).long()
      # Same label squashing as in training: anything outside {0, 1} is clamped.
      targetv = torch.clamp(targetv, min=0, max=1)
      scores = myModel(xv, maskv)
      loss = loss_fn(scores, targetv)
      pred = torch.argmax(scores, dim=1)
      batch_size = int(xv.shape[0])
      tot += batch_size
      hit += int(torch.sum(targetv == pred))
      # Weight by batch size so the reported loss is a per-sample average.
      all_loss += float(loss) * batch_size

  if tot == 0:
    # Nothing to evaluate: skip the metrics rather than divide by zero.
    logger.warning('After epoch %d, validation loader yielded no samples' % i)
    continue
  accuracy = float(hit) / tot
  all_loss /= tot
  if (accuracy > bst_acc):
    bst_acc = accuracy
    state = {'model_state': myModel.state_dict(), 'epoch': i, 'optim_state': optimizer.state_dict(), 'acc': bst_acc}
    torch.save(state, save_path)
  logger.info('After epoch %d, acc=%.4f, loss=%.4f' % (i, accuracy, all_loss))
logger.info("Finish")