Adapted from [this example script](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py)

In [1]:
import os
os.environ["http_proxy"] = "127.0.0.1:11233"
os.environ["https_proxy"] = "127.0.0.1:11233"

In [2]:
import logging

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger("regressor")

FP16 = False
BATCH_SIZE = 20
SEED = 42
TRAIN_EPOCHS = 3
WARMUP_PROPORTION = 0.1
PYTORCH_PRETRAINED_BERT_CACHE = "/mnt/Intel/bert_tmp"
LOSS_SCALE = 0. # Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.
LR = 5e-5
MAX_SEQ_LENGTH = 100

In [40]:
import torch
import torch.nn as nn
import pandas as pd
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.modeling import PreTrainedBertModel, BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from fastprogress import master_bar, progress_bar
from sklearn.model_selection import StratifiedShuffleSplit

In [4]:
DATA_PATH = "douban_ratings.csv"

In [5]:
class BertForSequenceRegression(PreTrainedBertModel):
    def __init__(self, config):
        super(BertForSequenceRegression, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.apply(self.init_bert_weights)
        self.loss_fct = torch.nn.MSELoss()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, targets=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        outputs = self.regressor(pooled_output).clamp(-1, 1)
        if targets is not None:
            loss = self.loss_fct(outputs.view(-1), targets.view(-1))
            return loss
        else:
            return outputs

In [6]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text, target=None):
        self.guid = guid
        self.text = text
        self.target = target


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, target):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.target = target

In [7]:
class DoubanRatingProcessor:
    """Processor for the Douban movie ratings data set."""
    def __init__(self):
        df_ratings = self.filter_entries(pd.read_csv(DATA_PATH)).sample(frac=0.25)
        df_ratings["rating"] = ((df_ratings["rating"] - 3) / 2).astype("float32")
        assert df_ratings.rating.max() <= 1
        assert df_ratings.rating.min() >= -1
        assert df_ratings.isnull().sum().sum() == 0
        texts = df_ratings["comment"].values
        # Split the dataset
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=888)
        train_idx, test_idx = next(sss.split(df_ratings, df_ratings.rating))
        texts_train, texts_test = texts[train_idx], texts[test_idx]
        y_train = df_ratings.iloc[train_idx][["rating"]].copy().values
        y_test = df_ratings.iloc[test_idx][["rating"]].copy().values
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=888)
        val_idx, test_idx = next(sss.split(y_test, y_test))
        texts_valid, texts_test = texts_test[val_idx], texts_test[test_idx]
        y_valid, y_test = y_test[val_idx], y_test[test_idx]
        self.x_train, self.x_valid, self.x_test = (
            texts_train, texts_valid, texts_test)
        self.y_train, self.y_valid, self.y_test = (
            y_train, y_valid, y_test)

    @classmethod
    def filter_entries(cls, df_ratings, min_len=3, max_len=1000):
        lengths = df_ratings.comment.str.len()
        flags = (lengths >= min_len) & (lengths <= max_len)
        assert flags.isnull().sum() == 0
        return df_ratings.loc[flags].copy()
        
    def get_train_examples(self):
        return self._create_examples(self.x_train, self.y_train)

    def get_dev_examples(self):
        return self._create_examples(self.x_valid, self.y_valid)

    def get_test_examples(self):
        return self._create_examples(self.x_test, self.y_test)
    
    def _create_examples(self, x, y):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, (texts, target)) in enumerate(zip(x, y)):
            examples.append(
                InputExample(guid=i, text=texts, target=target))
        return examples

In [8]:
def convert_examples_to_features(examples, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens = tokenizer.tokenize(example.text)
        
        if len(tokens) > max_seq_length - 2:
            tokens = tokens[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("target: %s" % (example.target))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              target=example.target))
    return features

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(
    device, n_gpu, FP16))

02/08/2019 18:03:28 - INFO - regressor -   device: cuda n_gpu: 1, 16-bits training: False


In [10]:
import random
import numpy as np

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

In [11]:
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-chinese", do_lower_case=True, 
    cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)

02/08/2019 18:03:34 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /mnt/Intel/bert_tmp/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [12]:
train_examples = DoubanRatingProcessor().get_train_examples()
num_train_optimization_steps = int(
    len(train_examples) / BATCH_SIZE) * TRAIN_EPOCHS

In [29]:
# Prepare model
model = BertForSequenceRegression.from_pretrained(
    "bert-base-chinese",
    cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)
if FP16:
    model.half()
model.to(device)

02/08/2019 18:18:40 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz from cache at /mnt/Intel/bert_tmp/42d4a64dda3243ffeca7ec268d5544122e67d9d06b971608796b483925716512.02ac7d664cff08d793eb00d6aac1d04368a1322435e5fe0a27c70b0b3a85327f
02/08/2019 18:18:40 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /mnt/Intel/bert_tmp/42d4a64dda3243ffeca7ec268d5544122e67d9d06b971608796b483925716512.02ac7d664cff08d793eb00d6aac1d04368a1322435e5fe0a27c70b0b3a85327f to temp dir /tmp/tmps99ramb8
02/08/2019 18:18:44 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "poo

BertForSequenceRegression(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediat

In [30]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [31]:
if FP16:
    try:
        from apex.optimizers import FP16_Optimizer
        from apex.optimizers import FusedAdam
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                          "to use distributed and fp16 training.")

    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=LR, bias_correction=False,
                          max_grad_norm=1.0)
    if args.loss_scale == 0:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=LOSS_SCALE)

else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=LR, warmup=WARMUP_PROPORTION,
                         t_total=num_train_optimization_steps)

In [16]:
train_features = convert_examples_to_features(
    train_examples, MAX_SEQ_LENGTH, tokenizer)

02/08/2019 18:03:52 - INFO - regressor -   *** Example ***
02/08/2019 18:03:52 - INFO - regressor -   guid: 0
02/08/2019 18:03:52 - INFO - regressor -   tokens: [CLS] 可 能 是 由 于 才 看 过 我 的 抗 战 的 缘 故 感 受 到 战 争 的 深 刻 [SEP]
02/08/2019 18:03:52 - INFO - regressor -   input_ids: 101 1377 5543 3221 4507 754 2798 4692 6814 2769 4638 2834 2773 4638 5357 3125 2697 1358 1168 2773 751 4638 3918 1174 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
02/08/2019 18:03:52 - INFO - regressor -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
02/08/2019 18:03:52 - INFO - regressor -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [17]:
del train_examples
import gc
gc.collect()

20

In [32]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_features))
logger.info("  Batch size = %d", BATCH_SIZE)
logger.info("  Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_targets = torch.tensor([f.target for f in train_features], dtype=torch.float)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_targets)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

model.train()
mb = master_bar(range(TRAIN_EPOCHS))
for _ in mb:
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(progress_bar(train_dataloader, parent=mb)):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, target = batch
        loss = model(input_ids, segment_ids, input_mask, target)
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.

        if FP16:
            optimizer.backward(loss)
        else:
            loss.backward()

        if tr_loss == 0:
            tr_loss = loss.item()
        else:
            tr_loss = tr_loss * 0.9 + loss.item() * 0.1
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if FP16:
            # modify learning rate with special warm up BERT uses
            # if args.fp16 is False, BertAdam is used that handles this automatically
            lr_this_step = (
                 LR * warmup_linear(global_step/num_train_optimization_steps, WARMUP_PROPORTION))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        mb.child.comment = f'loss: {tr_loss:.4f}'

02/08/2019 18:19:01 - INFO - regressor -   ***** Running training *****
02/08/2019 18:19:01 - INFO - regressor -     Num examples = 434359
02/08/2019 18:19:01 - INFO - regressor -     Batch size = 20
02/08/2019 18:19:01 - INFO - regressor -     Num steps = 65151


KeyboardInterrupt: 

In [33]:
# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = "./pytorch_model.bin"
torch.save(model_to_save.state_dict(), output_model_file)

# Load a trained model that you have fine-tuned
# model_state_dict = torch.load(output_model_file)
# model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
# model.to(device)

In [35]:
del train_features
gc.collect()

2173658

In [36]:
eval_examples = DoubanRatingProcessor().get_dev_examples()
eval_features = convert_examples_to_features(
    eval_examples, MAX_SEQ_LENGTH, tokenizer)

02/08/2019 19:03:23 - INFO - regressor -   *** Example ***
02/08/2019 19:03:23 - INFO - regressor -   guid: 0
02/08/2019 19:03:23 - INFO - regressor -   tokens: [CLS] 前 两 支 不 错 ， 印 度 篇 略 尴 尬 。 贾 樟 柯 那 支 关 照 的 是 他 2000 年 拍 摄 的 展 现 80 年 代 风 貌 的 《 站 台 》 ， 同 样 是 平 遥 古 城 ， 文 工 团 换 成 了 演 出 队 ， 还 是 梁 景 东 和 赵 涛 ， 像 是 很 多 年 以 后 张 军 和 尹 瑞 娟 搭 伴 过 起 了 日 子 。 《 站 台 》 里 计 划 生 育 [SEP]
02/08/2019 19:03:23 - INFO - regressor -   input_ids: 101 1184 697 3118 679 7231 8024 1313 2428 5063 4526 2219 2217 511 6593 3562 3392 6929 3118 1068 4212 4638 3221 800 8202 2399 2864 3029 4638 2245 4385 8188 2399 807 7599 6505 4638 517 4991 1378 518 8024 1398 3416 3221 2398 6898 1367 1814 8024 3152 2339 1730 2940 2768 749 4028 1139 7339 8024 6820 3221 3448 3250 691 1469 6627 3875 8024 1008 3221 2523 1914 2399 809 1400 2476 1092 1469 2222 4448 2026 3022 845 6814 6629 749 3189 2094 511 517 4991 1378 518 7027 6369 1153 4495 5509 102
02/08/2019 19:03:23 - INFO - regressor -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [53]:
logger.info("***** Running evaluation *****")
logger.info("  Num examples = %d", len(eval_examples))
logger.info("  Batch size = %d", BATCH_SIZE * 5)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_targets = torch.tensor([f.target for f in eval_features], dtype=torch.float)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_targets)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE * 5)

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

mb = progress_bar(eval_dataloader)
for input_ids, input_mask, segment_ids, targets in mb:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    targets = targets.to(device)

    with torch.no_grad():
        tmp_eval_loss = model(input_ids, segment_ids, input_mask, targets)
        outputs = model(input_ids, segment_ids, input_mask)

    outputs = outputs.detach().cpu().numpy()
    targets = targets.to('cpu').numpy()
    # tmp_eval_accuracy = accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    # eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1
    mb.comment = f'{eval_loss / nb_eval_steps:.4f}'

eval_loss = eval_loss / nb_eval_steps
# eval_accuracy = eval_accuracy / nb_eval_examples
loss = tr_loss/nb_tr_steps if args.do_train else None
result = {'eval_loss': eval_loss,
          # 'eval_accuracy': eval_accuracy,
          'global_step': global_step,
          'loss': loss}

02/08/2019 19:16:51 - INFO - regressor -   ***** Running evaluation *****
02/08/2019 19:16:51 - INFO - regressor -     Num examples = 144786
02/08/2019 19:16:51 - INFO - regressor -     Batch size = 100


KeyboardInterrupt: 