In [1]:
import logging
import os
import sys
import random
import pickle
from tqdm import tqdm_notebook as tqdm 
from tqdm import tnrange as trange

import numpy as np

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from tensorboardX import SummaryWriter

from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
logger = logging.getLogger(__name__)
import sys
import csv
csv.field_size_limit(sys.maxsize)

131072

In [10]:
data_dir = "/ssd2/arthur/TREC2019/data/"
bert_model="bert-base-uncased"
output_dir = "/ssd2/arthur/TREC2019/data/bert-output/"
max_seq_len=512
local_rank = -1
device = torch.device('cuda')
n_gpu = 2
train_batch_size = 32
task_name = "msmarco_2"
num_train_epochs = 1
learning_rate = 5e-5
warmup_proportion = 0.1
eval_batch_size = 8

In [4]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [2]:
import csv
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()




class InputExample:
    def __init__(self, guid, text_a, text_b, label):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

class InputFeatures(object):
    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

class Processor():
    def _read_tsv(self, input_file):
        with open(input_file, 'r', encoding='utf-8') as f:
            reader = csv.reader(f, delimiter = '\t')
            lines = []
            for line in reader:
                lines.append(line)
                if len(lines) > 10000:
                    return lines
            return lines
    
    def get_train_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train-samples.tsv")), "train")
    
    def get_dev_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev-samples.tsv")), "dev")
    
    def _create_examples(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            try:
                guid = "%s-%s" % (set_type, line[0])
                query = line[1]
                doc = line[2]
                label = line[-1]
                examples.append(InputExample(guid=guid, text_a=query, text_b=doc, label=label))
            except:
                print(i, line)
                continue
        return examples
processor = Processor()
output_mode = "classification"
label_list = ['1','0']
num_labels = 2

def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, output_mode):
    label_map = {label: i for i, label in enumerate(label_list)}
    features= []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = tokenizer.tokenize(example.text_b)
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        tokens = ["[CLS]"] + tokens_a +["[SEP]"] + tokens_b + ["[SEP]"]
        
        segment_ids = [0] * (len(tokens_a) +2)
        segment_ids += [1] * (len(tokens_b) +1)
        
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding
        
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        
        if output_mode == "classification":
            label_id = label_map[example.label]
        elif output_mode == "regression":
            label_id = float(example.label)
        else:
            raise KeyError(output_mode)
        
        if ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features
def simple_accuracy(preds, labels):
    return (preds == labels).mean()
def compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    return {"acc": simple_accuracy(preds, labels)}

In [5]:
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)

In [6]:
model.to(device)
model = torch.nn.DataParallel(model)

In [11]:
global_step = 0 
nb_tr_steps = 0
tr_loss = 0
tb_writter = SummaryWriter()
train_examples = processor.get_train_examples(data_dir)

#Preprocess inputs as input_ids (tokens ids), input_mask (for masking trainning), segment_id (id for first and second segment) and label_id
cached_train_features_file = os.path.join(data_dir, 'train_{}_{}_{}'.format(list(filter(None, bert_model.split('/'))).pop(), str(max_seq_len), str(task_name)))
try:
    with open(cached_train_features_file, "rb") as reader:
        train_features = pickle.load(reader)
except:
    train_features = convert_examples_to_features(train_examples, label_list, max_seq_len, tokenizer, output_mode)
    with open(cached_train_features_file, "wb") as writer:
        pickle.dump(train_features, writer)


In [12]:
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

In [13]:
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) #loader for datasets, will yield one sample at a time
train_sampler = RandomSampler(train_data) #randomly pick one sample
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size = train_batch_size) #join both as a single loader
num_train_optimization_steps = len(train_dataloader) *3 #number of optimization rounds to be completed

In [14]:
param_optimizer = list(model.named_parameters()) #params to be optimized
no_decay =['bias', 'LayerNorm.bias', 'LayerNorm.weight'] #params that the learning rate should not be decayed over training steps
optimizer_grouped_params = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':0.01}, #rate do decay params
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}] #params that shoud not decay
optimizer = BertAdam(optimizer_grouped_params, lr=learning_rate, warmup=warmup_proportion, t_total =num_train_optimization_steps)

In [15]:
model.train()
for _ in trange(int(num_train_epochs), desc="Epoch"): #for each epoch
    tr_loss = 0 #training loss for this epoch
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch) #send this batch to gpu. Each size of batch is [batch_size, max_tokens]. Labels is just [batch_size]
        input_ids, input_mask, segment_ids, label_ids = batch
        
        logits = model(input_ids, token_type_ids=segment_ids, attention_mask = input_mask) #model output. sized [batch_size, n_classes]
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        loss = loss.mean() #average over all gpus
        loss.backward()
        tr_loss += loss.item()
        nb_tr_examples+=input_ids.size(0)
        nb_tr_steps += 1
        optimizer.step()
        optimizer.zero_grad()
        global_step +=1
        tb_writter.add_scalar('lr', optimizer.get_lr()[0], global_step)
        tb_writter.add_scalar('loss', loss.item(), global_step)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=313, style=ProgressStyle(description_width='i…




In [16]:
model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

'/ssd2/arthur/TREC2019/data/bert-output/vocab.txt'

In [20]:
model = BertForSequenceClassification.from_pretrained(output_dir, num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)

In [21]:
model.to(device)
eval_examples = processor.get_dev_examples(data_dir)
cached_eval_features_file = os.path.join(data_dir, 'dev_{}_{}_{}'.format(list(filter(None, bert_model.split('/'))).pop(), str(max_seq_len), str(task_name)))

In [23]:
try:
    with open(cached_eval_features_file, 'rb') as reader:
        eval_features = pickle.load(reader)
except:
    eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_len, tokenizer, 'classification')
    with open(cached_eval_features_file, 'wb') as writer:
        pickle.dump(eval_features, writer)

In [36]:
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

eval_sampler = SequentialSampler(eval_data)

eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)


In [None]:
model.eval()
eval_loss = 0
nb_eval_steps =0 
preds = []
out_label_ids = None
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)
    
    with torch.no_grad(): #do not compute gradients
        logits = model(input_ids, token_type_ids = segment_ids, attention_mask = input_mask)
    loss_fct = CrossEntropyLoss()
    tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
    
    eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps+=1
    if len(preds) == 0:
        preds.append(logits.detach().cpu().numpy())
        out_label_ids = label_ids.detach().cpu().numpy()
    else:
        preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

HBox(children=(IntProgress(value=0, description='Evaluating', max=1286, style=ProgressStyle(description_width=…

In [42]:
eval_loss = eval_loss / nb_eval_steps

preds = preds[0]
preds = np.argmax(preds, axis=1)
result = compute_metrics(task_name, preds, out_label_ids)
loss= tr_loss/global_step 
result['eval_loss'] = eval_loss
result['global_step'] = global_step
result['loss'] = loss

output_eval_file = os.path.join(output_dir, "eval_results.txt")
with open(output_eval_file, 'w') as writer:
    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))


AxisError: axis 1 is out of bounds for array of dimension 1

In [43]:
preds

0