<a href="https://colab.research.google.com/github/Foruck/sentiment-analysis-demo/blob/master/bert_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving cn_train_data.h5 to cn_train_data.h5
Saving cn_valid_data.h5 to cn_valid_data.h5
Saving en_train_data.h5 to en_train_data.h5
Saving en_valid_data.h5 to en_valid_data.h5
User uploaded file "cn_train_data.h5" with length 47004264 bytes
User uploaded file "cn_valid_data.h5" with length 24217616 bytes
User uploaded file "en_train_data.h5" with length 18164752 bytes
User uploaded file "en_valid_data.h5" with length 9358904 bytes


In [0]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision\

tcmalloc: large alloc 1073750016 bytes == 0x59250000 @  0x7f32cee242a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641


In [0]:
!pip install pytorch_pretrained_bert

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/95/68/84de54aea460eb5b2e90bf47a429aacc1ce97ff052ec40874ea38ae2331d/pytorch_pretrained_bert-0.4.0-py3-none-any.whl (45kB)
[K    22% |███████▎                        | 10kB 18.1MB/s eta 0:00:01[K    45% |██████████████▌                 | 20kB 1.5MB/s eta 0:00:01[K    68% |█████████████████████▊          | 30kB 2.3MB/s eta 0:00:01[K    90% |█████████████████████████████   | 40kB 1.5MB/s eta 0:00:01[K    100% |████████████████████████████████| 51kB 1.8MB/s 
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.4.0


In [0]:
import sys
import os
import datetime
import logging
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.utils.data
import h5py
import numpy as np
from torch.autograd import Variable
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertAdam

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [0]:
LSTM_HIDDEN_SIZE = 256
embedding_length = 768
Sentence_Max_Length = 128

In [0]:
class bertDataset(torch.utils.data.Dataset):
  def __init__(self, data_path):
    with h5py.File(data_path, 'r') as f:
      self.data = f['data'][:, :]
      self.mask = f['mask'][:, :]
      self.annot = f['annot'][:]
    
  def __len__(self):
    return self.annot.shape[0]
    
  def __getitem__(self, idx):
    return self.data[idx, :], self.mask[idx, :], self.annot[idx]

      
class myBert(torch.nn.Module):
  def __init__(self, embedding_length=768, bert_path='bert-base-uncased', window=[7], classes=2, use_cuda=True):
    super(myBert, self).__init__()
    self.use_cuda = use_cuda
    self.num_filter = len(window)

    # Bert model
    self.bert = BertModel.from_pretrained(bert_path)

    # Convolution layers
    conv1, conv2, conv3, conv4 = [], [], [], []
    for i in range(self.num_filter):
      conv1.append(nn.Conv2d(1, 256, (window[i], embedding_length), stride=(1, 1), padding=(int((window[i] - 1) / 2), 0))) # out n*64*128*1
      conv2.append(nn.Conv1d(256, 128, 3, stride=1, padding=1)) # out n*128*64
      conv3.append(nn.Conv1d(128, 64, 3, stride=1, padding=1)) # out n*64*32
      conv4.append(nn.Conv1d(64, 16, 1, stride=1, padding=0)) # out n*16*16
    self.conv1, self.conv2 = nn.ModuleList(conv1), nn.ModuleList(conv2)
    self.conv3, self.conv4 = nn.ModuleList(conv3), nn.ModuleList(conv4)
    for i in range(self.num_filter):
      init.kaiming_normal_(self.conv1[i].weight.data)
      init.kaiming_normal_(self.conv2[i].weight.data)
      init.kaiming_normal_(self.conv3[i].weight.data)
      init.kaiming_normal_(self.conv4[i].weight.data)
    self.dropout = nn.Dropout(p=0.5)

    # LSTM layers
    self.lstm1 = nn.LSTMCell(embedding_length, LSTM_Hidden_Size) # out n*1*LSTM_Hidden_Size
    self.lstm2 = nn.LSTMCell(embedding_length, LSTM_Hidden_Size)

    # FC layer
    self.fc = nn.Linear(LSTM_Hidden_Size * 2 + 256 * self.num_filter, classes)
    init.kaiming_normal_(self.fc.weight.data)
    self.fc.bias.data.fill_(0)
    
  def forward(self, inputs, mask):
    # Get Features
    inputs = self.bert(inputs, token_type_ids=None, attention_mask=mask, output_all_encoded_layers=False)[0]
    inputs = inputs.unsqueeze(1)
        
    # Go through Bi-LSTM
    n = inputs.shape[0]
    x0 = inputs.squeeze(1)
    if next(self.parameters()).is_cuda and self.use_cuda:
      if not inputs.is_cuda:
        inputs = inputs.cuda()
        x0 = x0.cuda()
        cx1 = torch.zeros(n, LSTM_Hidden_Size).cuda()
        hx1 = torch.zeros(n, LSTM_Hidden_Size).cuda()
        cx2 = torch.zeros(n, LSTM_Hidden_Size).cuda()
        hx2 = torch.zeros(n, LSTM_Hidden_Size).cuda()
        hxs1 = torch.zeros((n, x0.shape[1], LSTM_Hidden_Size)).cuda()
        hxs2 = torch.zeros((n, x0.shape[1], LSTM_Hidden_Size)).cuda()
      else:
        if inputs.is_cuda:
          inputs = inputs.cpu()
          x0 = x0.cpu()
        cx1 = torch.zeros(n, LSTM_Hidden_Size)
        hx1 = torch.zeros(n, LSTM_Hidden_Size)
        cx2 = torch.zeros(n, LSTM_Hidden_Size)
        hx2 = torch.zeros(n, LSTM_Hidden_Size)
        hxs1 = torch.zeros((n, x0.shape[1], LSTM_Hidden_Size))
        hxs2 = torch.zeros((n, x0.shape[1], LSTM_Hidden_Size))

    for i in range(x0.shape[1]):
      hx1, cx1 = self.lstm1(x0[:, i, :], (hx1, cx1))
      hxs1[:, i, :] = hx1
      hx2, cx2 = self.lstm1(x0[:, x0.shape[1] - 1 - i, :], (hx2, cx2))
      hxs2[:, i, :] = hx2
    lstm_x = torch.cat((torch.mean(hxs1, 1, True).squeeze(1), torch.mean(hxs2, 1, True).squeeze(1)), 1)

    # Go Through CNN
    x = []
    for i in range(self.num_filter):
      x.append(self.conv1[i](inputs))
      x[i] = x[i].squeeze(3)
      x[i] = F.relu(F.max_pool1d(x[i], kernel_size=2, stride=2))
      x[i] = self.conv2[i](x[i])
      x[i] = F.relu(F.max_pool1d(x[i], kernel_size=2, stride=2))
      x[i] = self.conv3[i](x[i])
      x[i] = F.relu(F.max_pool1d(x[i], kernel_size=2, stride=2))
      x[i] = self.conv4[i](x[i])
      x[i] = x[i].view(n, -1)
      x[i] = self.dropout(x[i])
        
    cnn_x = torch.cat(x, 1)
    x1 = torch.cat((lstm_x, cnn_x), 1)
    x1 = self.dropout(self.fc(x1))
    return x1

In [0]:
def train(model, train_loader, optimizer, logger, epoch=0, print_every=100):
  model = model.train()
  loss_fn = nn.CrossEntropyLoss()
  all_loss, all_accuracy = 0.0, 0.0
  hit, tot, cnt = 0, 0, 0
  for i, (x, mask, target) in enumerate(train_loader):
    x = x.cuda().long()
    mask = mask.cuda().long()
    target = target.cuda().long()
    target = torch.clamp(target, min=0, max=1)
    
    scores = model(x, mask)
    loss = loss_fn(scores, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    pred = torch.argmax(scores, dim=1)
    hit = torch.sum(target == pred)
    
    accuracy = float(hit) / int(x.shape[0])
    if i % print_every == 0:
      logger.info('Epoch %d, Iter %d, loss=%.4f, acc=%.4f' % (epoch, i, loss, accuracy))
    all_loss += float(loss) * int(tot)
    all_accuracy += float(hit)
    cnt += int(x.shape[0])
    
  all_loss /= cnt
  all_accuracy /= cnt
  logger.info('Epoch %d, train_loss=%.4f, train_accuracy%.4f' % (epoch, all_loss, all_accuracy))
  return model, optimizer

def evaluate(model, valid_loader, logger, epoch=0, print_every=100):
  model = model.eval()
  loss_fn = nn.CrossEntropyLoss()
  all_loss = 0.0
  hit, tot = 0, 0
  for i, (x, mask, target) in enumerate(valid_loader):
    x = x.cuda().long()
    mask = mask.cuda().long()
    target = target.cuda().long()
    target = torch.clamp(target, min=0, max=1)
    
    scores = model(x, mask)
    loss = loss_fn(scores, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    pred = torch.argmax(scores, dim=1)
    
    hit += float(torch.sum(target == pred))
    all_loss += float(loss) * int(x.shape[0])
    tot += int(x.shape[0])
  
  all_loss /= tot
  accuracy = float(hit) / tot
  logger.info('Epoch %d, valid_loss=%.4f, valid_accuracy=%.4f' % (epoch, all_loss, accuracy))
  return model, accuracy

In [0]:
!mkdir log
!mkdir bst_model
!ls

bst_model  log	sample_data


In [0]:
torch.set_default_tensor_type('torch.FloatTensor')
device = torch.device("cuda")
end_epoch = 30
tag = 'en'
lr = 5e-5
checkpoint = ''
job_name =  '_'.join([tag, str(lr), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')])
save_path = 'bst_model/' + job_name + '.pth'
log_path = 'log/' + job_name + '.log'
bz = 32

In [0]:
if tag == 'cn':
  train_set = bertDataset('cn_train_data.h5')
  train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=bz, shuffle=True, num_workers=8)
  valid_set = bertDataset('cn_valid_data.h5')
  valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=int(bz / 2), shuffle=True, num_workers=8)
  myModel = myBert(embedding_length, 'bert-base-chinese', use_cuda=args.gpu)
  myModel = myModel.to(device=device)
elif tag == 'en':
  train_set = bertDataset('en_train_data.h5')
  train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=bz, shuffle=True, num_workers=8)
  valid_set = bertDataset('en_train_data.h5')
  valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=int(bz / 2), shuffle=True, num_workers=8)
  myModel = myBert(embedding_length, 'bert-base-uncased', use_cuda=args.gpu)
  myModel = myModel.to(device=device)
  
if checkpoint != '':
  state = torch.load(args.checkpoint)
  myModel.load_state_dict(state['model_state'])
  optimizer.load_state_dict(state['optim_state'])
  epoch = state['epoch']
  bst_acc = state['acc']
else:
  epoch = 0
  
ignored_params = list(map(id, myModel.bert.parameters()))
base_params = filter(lambda p: id(p) not in ignored_params, myModel.parameters())
optimizer_grouped_parameters = [{'params': base_params, 'weight_decay': 0.01}, {'params': myModel.bert.parameters(), 'weight_decay': 0.0, 'lr': 1e-6}]
optimizer = BertAdam(optimizer_grouped_parameters, lr=args.lr, warmup=args.warmup_proportion, t_total=train_set.__len__())

NameError: ignored

In [0]:
logger = logging.getLogger(__name__)
logger.setLevel(level = logging.INFO)
handler = logging.FileHandler(log_path)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.info("Start print log")

In [0]:
for i in range(epoch, end_epoch):
  logger.info('=> Epoch %d, lr = %0.6f <=' % (i, args.lr))
  myModel, optimizer = train(myModel, train_loader, optimizer, logger, epoch=i, print_every=50)
  myModel, accuracy = evaluate(myModel, valid_loader, logger, epoch=i)
  
  if (accuracy > bst_acc):
    bst_acc = accuracy
    state = {'model_state': myModel.state_dict(), 'epoch': i, 'optim_state': optimizer.state_dict(), 'acc': bst_acc}
    torch.save(state, save_path)
  
logger.info("Finish")