Adapted from [this example script](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py)

In [1]:
# Local network environment settings
import os
os.environ["http_proxy"] = "127.0.0.1:11233"
os.environ["https_proxy"] = "127.0.0.1:11233"

Logger settings and Constants: 

In [2]:
import logging

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger("regressor")

FP16 = False
BATCH_SIZE = 32
SEED = 42
WARMUP_PROPORTION = 0.1
PYTORCH_PRETRAINED_BERT_CACHE = "/mnt/Intel/bert_tmp"
LOSS_SCALE = 0. # Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.
MAX_SEQ_LENGTH = 100

DATA_PATH = "douban_ratings.csv"

## Imports

In [3]:
import gc

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.modeling import PreTrainedBertModel, BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear, SCHEDULES
from fastprogress import master_bar, progress_bar
from sklearn.model_selection import StratifiedShuffleSplit

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(
    device, n_gpu, FP16))

02/09/2019 12:44:10 - INFO - regressor -   device: cuda n_gpu: 1, 16-bits training: False


Set random seeds:

In [5]:
import random
import numpy as np

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

## Definitions

Regression model:

In [6]:
class BertForSequenceRegression(PreTrainedBertModel):
    def __init__(self, config):
        super(BertForSequenceRegression, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.apply(self.init_bert_weights)
        self.loss_fct = torch.nn.MSELoss()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, targets=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        outputs = self.regressor(pooled_output).clamp(-1, 1)
        if targets is not None:
            loss = self.loss_fct(outputs.view(-1), targets.view(-1))
            return loss
        else:
            return outputs

Data Classes:

In [7]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text, target=None):
        self.guid = guid
        self.text = text
        self.target = target


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, target):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.target = target

Data Processing Class and Function:

In [8]:
class DoubanRatingProcessor:
    """Processor for the Douban movie ratings data set."""
    def __init__(self, sample_ratio: float = 0.05):
        df_ratings = self.filter_entries(pd.read_csv(DATA_PATH)).sample(frac=sample_ratio)
        df_ratings["rating"] = ((df_ratings["rating"] - 3) / 2).astype("float32")
        assert df_ratings.rating.max() <= 1
        assert df_ratings.rating.min() >= -1
        assert df_ratings.isnull().sum().sum() == 0
        texts = df_ratings["comment"].values
        # Split the dataset
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=888)
        train_idx, test_idx = next(sss.split(df_ratings, df_ratings.rating))
        texts_train, texts_test = texts[train_idx], texts[test_idx]
        y_train = df_ratings.iloc[train_idx][["rating"]].copy().values
        y_test = df_ratings.iloc[test_idx][["rating"]].copy().values
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=888)
        val_idx, test_idx = next(sss.split(y_test, y_test))
        texts_valid, texts_test = texts_test[val_idx], texts_test[test_idx]
        y_valid, y_test = y_test[val_idx], y_test[test_idx]
        self.x_train, self.x_valid, self.x_test = (
            texts_train, texts_valid, texts_test)
        self.y_train, self.y_valid, self.y_test = (
            y_train, y_valid, y_test)

    @classmethod
    def filter_entries(cls, df_ratings, min_len=3, max_len=1000):
        lengths = df_ratings.comment.str.len()
        flags = (lengths >= min_len) & (lengths <= max_len)
        assert flags.isnull().sum() == 0
        return df_ratings.loc[flags].copy()
        
    def get_train_examples(self):
        return self._create_examples(self.x_train, self.y_train)

    def get_dev_examples(self):
        return self._create_examples(self.x_valid, self.y_valid)

    def get_test_examples(self):
        return self._create_examples(self.x_test, self.y_test)
    
    def _create_examples(self, x, y):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, (texts, target)) in enumerate(zip(x, y)):
            examples.append(
                InputExample(guid=i, text=texts, target=target))
        return examples

In [9]:
def convert_examples_to_features(examples, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens = tokenizer.tokenize(example.text)
        
        if len(tokens) > max_seq_length - 2:
            tokens = tokens[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("target: %s" % (example.target))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              target=example.target))
    return features

In [10]:
class FreezableBertAdam(BertAdam):
    def get_lr(self):
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    continue
                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']
                lr.append(lr_scheduled)
        return lr    

Utility functions:

In [11]:
def children(m):
    return m if isinstance(m, (list, tuple)) else list(m.children())


def set_trainable_attr(m, b):
    m.trainable = b
    for p in m.parameters():
        p.requires_grad = b


def apply_leaf(m, f):
    c = children(m)
    if isinstance(m, nn.Module):
        f(m)
    if len(c) > 0:
        for l in c:
            apply_leaf(l, f)


def set_trainable(l, b):
    apply_leaf(l, lambda m: set_trainable_attr(m, b))

In [12]:
def count_model_parameters(model):
    logger.info(
        "# of paramters: {:,d}".format(
            sum(p.numel() for p in model.parameters())))
    logger.info(
        "# of trainable paramters: {:,d}".format(
            sum(p.numel() for p in model.parameters() if p.requires_grad)))

## Training

### Preprocessing

In [13]:
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-chinese", do_lower_case=True, 
    cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)

02/09/2019 12:44:12 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /mnt/Intel/bert_tmp/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [14]:
train_examples = DoubanRatingProcessor().get_train_examples()

In [15]:
train_features = convert_examples_to_features(
    train_examples, MAX_SEQ_LENGTH, tokenizer)
del train_examples
gc.collect()

02/09/2019 12:44:16 - INFO - regressor -   *** Example ***
02/09/2019 12:44:16 - INFO - regressor -   guid: 0
02/09/2019 12:44:16 - INFO - regressor -   tokens: [CLS] 影 片 只 体 现 了 他 们 共 同 跨 越 赛 场 上 的 障 碍 ， 生 活 中 的 镜 头 几 乎 全 给 了 pierre 。 去 戏 剧 化 和 小 细 节 跟 体 育 类 型 有 点 不 兼 容 。 两 小 时 的 片 长 剧 情 和 节 奏 安 排 得 也 不 妥 帖 。 [SEP]
02/09/2019 12:44:16 - INFO - regressor -   input_ids: 101 2512 4275 1372 860 4385 749 800 812 1066 1398 6659 6632 6612 1767 677 4638 7397 4809 8024 4495 3833 704 4638 7262 1928 1126 725 1059 5314 749 8744 511 1343 2767 1196 1265 1469 2207 5301 5688 6656 860 5509 5102 1798 3300 4157 679 1076 2159 511 697 2207 3198 4638 4275 7270 1196 2658 1469 5688 1941 2128 2961 2533 738 679 1980 2365 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
02/09/2019 12:44:16 - INFO - regressor -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0

### Model definition

In [16]:
# Prepare model
model = BertForSequenceRegression.from_pretrained(
    "bert-base-chinese",
    cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)
if FP16:
    model.half()
model.to(device)

02/09/2019 12:44:31 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz from cache at /mnt/Intel/bert_tmp/42d4a64dda3243ffeca7ec268d5544122e67d9d06b971608796b483925716512.02ac7d664cff08d793eb00d6aac1d04368a1322435e5fe0a27c70b0b3a85327f
02/09/2019 12:44:31 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /mnt/Intel/bert_tmp/42d4a64dda3243ffeca7ec268d5544122e67d9d06b971608796b483925716512.02ac7d664cff08d793eb00d6aac1d04368a1322435e5fe0a27c70b0b3a85327f to temp dir /tmp/tmpw3rlljf2
02/09/2019 12:44:33 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "poo

BertForSequenceRegression(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediat

In [17]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [18]:
def get_optimizer(num_train_optimization_steps: int, learning_rate: float):
    grouped_parameters = [
       x for x in optimizer_grouped_parameters if any([p.requires_grad for p in x["params"]])
    ]
    for group in grouped_parameters:
        group['lr'] = learning_rate
    if FP16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")

        optimizer = FusedAdam(grouped_parameters,
                              lr=learning_rate, bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=LOSS_SCALE)

    else:
        optimizer = FreezableBertAdam(grouped_parameters,
                             lr=learning_rate, warmup=WARMUP_PROPORTION,
                             t_total=num_train_optimization_steps)
    return optimizer

### The Training Loop

In [19]:
def train(model: nn.Module, num_epochs: int, learning_rate: float):
    num_train_optimization_steps = len(train_dataloader) * num_epochs 
    optimizer = get_optimizer(num_train_optimization_steps, learning_rate)
    assert all([x["lr"] == learning_rate for x in optimizer.param_groups])
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_features))
    logger.info("  Batch size = %d", BATCH_SIZE)
    logger.info("  Num steps = %d", num_train_optimization_steps)    
    model.train()
    mb = master_bar(range(num_epochs))
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0    
    for _ in mb:
        for step, batch in enumerate(progress_bar(train_dataloader, parent=mb)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, target = batch
            loss = model(input_ids, segment_ids, input_mask, target)
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.

            if FP16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if tr_loss == 0:
                tr_loss = loss.item()
            else:
                tr_loss = tr_loss * 0.9 + loss.item() * 0.1
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if FP16:
                # modify learning rate with special warm up BERT uses
                # if args.fp16 is False, BertAdam is used that handles this automatically
                lr_this_step = (
                     LR * warmup_linear(global_step/num_train_optimization_steps, WARMUP_PROPORTION))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            mb.child.comment = f'loss: {tr_loss:.4f} lr: {optimizer.get_lr()[0]:.2E}'
    logger.info("  train loss = %.4f", tr_loss) 
    return tr_loss

In [20]:
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_targets = torch.tensor([f.target for f in train_features], dtype=torch.float)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_targets)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

In [21]:
# Train only the "pooler" and the final linear layer
set_trainable(model, True)
set_trainable(model.bert.embeddings, False)
set_trainable(model.bert.encoder, False)
count_model_parameters(model)
train(model, num_epochs = 2, learning_rate = 5e-4)

02/09/2019 12:44:37 - INFO - regressor -   # of paramters: 102,268,417
02/09/2019 12:44:37 - INFO - regressor -   # of trainable paramters: 591,361
02/09/2019 12:44:37 - INFO - regressor -   ***** Running training *****
02/09/2019 12:44:37 - INFO - regressor -     Num examples = 86871
02/09/2019 12:44:37 - INFO - regressor -     Batch size = 32
02/09/2019 12:44:37 - INFO - regressor -     Num steps = 5430


02/09/2019 13:00:54 - INFO - regressor -     train loss = 0.1961


0.1960972243634961

In [22]:
# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = "./regressor_stage1.pth"
# torch.save(model_to_save.state_dict(), output_model_file)

In [26]:
model.load_state_dict(torch.load(output_model_file))

In [27]:
gc.collect()

446

In [28]:
# Train the last two layer, too
set_trainable(model.bert.encoder.layer[11], True)
set_trainable(model.bert.encoder.layer[10], True)
count_model_parameters(model)
train(model, num_epochs = 2, learning_rate = 5e-5)

02/09/2019 13:28:16 - INFO - regressor -   # of paramters: 102,268,417
02/09/2019 13:28:16 - INFO - regressor -   # of trainable paramters: 14,767,105
02/09/2019 13:28:16 - INFO - regressor -   ***** Running training *****
02/09/2019 13:28:16 - INFO - regressor -     Num examples = 86871
02/09/2019 13:28:16 - INFO - regressor -     Batch size = 32
02/09/2019 13:28:16 - INFO - regressor -     Num steps = 5430


02/09/2019 13:50:25 - INFO - regressor -     train loss = 0.2017


0.20167179324832113

In [29]:
# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = "./regressor_stage2.pth"
# torch.save(model_to_save.state_dict(), output_model_file)

In [32]:
# Train all layers
set_trainable(model, True)
count_model_parameters(model)
train(model, num_epochs = 1, learning_rate = 1e-5)

02/09/2019 13:55:17 - INFO - regressor -   # of paramters: 102,268,417
02/09/2019 13:55:17 - INFO - regressor -   # of trainable paramters: 102,268,417
02/09/2019 13:55:17 - INFO - regressor -   ***** Running training *****
02/09/2019 13:55:17 - INFO - regressor -     Num examples = 86871
02/09/2019 13:55:17 - INFO - regressor -     Batch size = 32
02/09/2019 13:55:17 - INFO - regressor -     Num steps = 2715


02/09/2019 14:22:24 - INFO - regressor -     train loss = 0.1713


0.17134070469037616

In [None]:
# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = "./regressor_stage3.pth"
# torch.save(model_to_save.state_dict(), output_model_file)

In [None]:
# Save a trained model
# model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
# output_model_file = "./pytorch_model.bin"
# torch.save(model_to_save.state_dict(), output_model_file)

# Load a trained model that you have fine-tuned
# model_state_dict = torch.load(output_model_file)
# model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
# model.to(device)

In [None]:
del train_features
gc.collect()

## Evauluation

In [30]:
eval_examples = DoubanRatingProcessor().get_dev_examples()
eval_features = convert_examples_to_features(
    eval_examples, MAX_SEQ_LENGTH, tokenizer)

02/09/2019 13:51:16 - INFO - regressor -   *** Example ***
02/09/2019 13:51:16 - INFO - regressor -   guid: 0
02/09/2019 13:51:16 - INFO - regressor -   tokens: [CLS] 舍 不 得 辜 负 这 段 相 遇 ， 但 继 续 下 去 是 一 个 太 大 的 决 定 。 我 努 力 过 了 ， 不 喜 欢 你 ， 真 希 望 能 结 束 于 谈 天 说 地 。 怪 这 个 城 市 太 快 吧 ， 在 世 界 的 其 他 地 方 我 们 还 生 活 在 昨 天 。 [SEP]
02/09/2019 13:51:16 - INFO - regressor -   input_ids: 101 5650 679 2533 6790 6566 6821 3667 4685 6878 8024 852 5326 5330 678 1343 3221 671 702 1922 1920 4638 1104 2137 511 2769 1222 1213 6814 749 8024 679 1599 3614 872 8024 4696 2361 3307 5543 5310 3338 754 6448 1921 6432 1765 511 2597 6821 702 1814 2356 1922 2571 1416 8024 1762 686 4518 4638 1071 800 1765 3175 2769 812 6820 4495 3833 1762 3219 1921 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
02/09/2019 13:51:16 - INFO - regressor -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0

In [33]:
logger.info("***** Running evaluation *****")
logger.info("  Num examples = %d", len(eval_examples))
logger.info("  Batch size = %d", BATCH_SIZE * 5)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_targets = torch.tensor([f.target for f in eval_features], dtype=torch.float)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_targets)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE * 5)

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

mb = progress_bar(eval_dataloader)
for input_ids, input_mask, segment_ids, targets in mb:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    targets = targets.to(device)

    with torch.no_grad():
        tmp_eval_loss = model(input_ids, segment_ids, input_mask, targets)
        # outputs = model(input_ids, segment_ids, input_mask)

    # outputs = outputs.detach().cpu().numpy()
    # targets = targets.to('cpu').numpy()
    # tmp_eval_accuracy = accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    # eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1
    mb.comment = f'{eval_loss / nb_eval_steps:.4f}'

eval_loss / nb_eval_steps

02/09/2019 14:25:07 - INFO - regressor -   ***** Running evaluation *****
02/09/2019 14:25:07 - INFO - regressor -     Num examples = 28957
02/09/2019 14:25:07 - INFO - regressor -     Batch size = 160


0.16707682765979134