In [1]:
import os
from itertools import chain
import torch
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
from transformers import *
from xml.dom import minidom
import numpy as np
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from apex import amp
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef, r2_score, roc_auc_score

In [2]:
class Tensorboard:
    """Minimal scalar logger backed by a TensorFlow summary file writer."""

    def __init__(self, logdir):
        """Create the summary writer that stores its event files under ``logdir``."""
        self.writer = tf.summary.create_file_writer(logdir)

    def close(self):
        """Close the underlying summary writer."""
        writer = self.writer
        writer.close()

    def log_scalar(self, tag, value, global_step):
        """Record ``value`` under ``tag`` at step ``global_step``."""
        # The writer must be the default one while the summary op runs.
        default_writer = self.writer.as_default()
        with default_writer:
            tf.summary.scalar(tag, value, step=global_step)

In [3]:
# Hyperparameters read by the model, data-loading and training cells below.
num_labels = 3  # number of classes passed to the model config; also the label range of the 3-way metrics
train_batch_size = 3  # batch size of the training DataLoader
learn_rate = 1e-5  # learning rate handed to AdamW
warmup_steps = 1024  # warmup_steps handed to WarmupLinearSchedule
accumulation_steps = 6  # optimizer.step() runs once every this many training batches
epochs = 16  # number of passes over the training DataLoader
pretrain = -1  # epoch at which frozen encoder weights are re-enabled; freezing only happens when > 0
warp = 0  # epochs by which the scheduler is fast-forwarded and the logged global step is shifted

In [4]:
# Run name; used as the sub-directory of logs/ for TensorBoard events, checkpoints and prediction CSVs.
EXPERIMENT = 'roberta_large_1e-5_mnli_en'
# Pretrained checkpoint identifier shared by the tokenizer, config and model below.
MODEL_NAME = 'roberta-large-mnli'
# Every tokenized example is padded to this length; longer examples are skipped by tokenize().
SEQUENCE_LENGTH = 512
TOKENIZER = RobertaTokenizer.from_pretrained(MODEL_NAME)
CONFIG = RobertaConfig.from_pretrained(MODEL_NAME, num_labels = num_labels)
MODEL = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, config = CONFIG)
# Disabled: optionally restore weights from an earlier checkpoint instead of the hub weights.
#MODEL.load_state_dict(torch.load('../input/pretrainedmtlm/checkpoints/model.torch', map_location='cpu'))

In [5]:
def load_semeval(t = 'train', lang = 'en'):
    """Yield ``(reference_text, answer_text, label)`` triples from one XML split.

    The file ``datasets/semeval2013-3way-<lang>/<t>.xml`` is parsed with
    ``minidom``. Inside every ``exercise`` element each ``reference`` is paired
    with each ``answer``; the label is the integer mapped from the answer's
    ``accuracy`` attribute (contradictory -> 0, incorrect -> 1, correct -> 2).

    Raises:
        KeyError: if an answer carries an ``accuracy`` value outside the three
            known labels.
    """
    label_ids = dict(contradictory=0, incorrect=1, correct=2)
    document = minidom.parse('datasets/semeval2013-3way-{}/{}.xml'.format(lang, t))

    for exercise in document.getElementsByTagName('exercise'):
        references = exercise.getElementsByTagName('reference')
        answers = exercise.getElementsByTagName('answer')
        # Reference-major order: all answers for the first reference, then the next one.
        yield from (
            (
                ref_node.firstChild.data,
                answer_node.firstChild.data,
                label_ids[answer_node.attributes['accuracy'].value],
            )
            for ref_node in references
            for answer_node in answers
        )

def project_semeval(score):
    """Collapse a 3-way label into a 2-way label.

    Label 2 maps to 1; labels 0 and 1 both map to 0. Any other value raises
    ``KeyError``.
    """
    two_way = {0: 0, 1: 0, 2: 1}
    return two_way[score]

In [6]:
def tokenize(loader, tokenizer = None, max_length = None):
    """Encode ``(reference, answer, label)`` triples into fixed-length model inputs.

    Args:
        loader: iterable of ``(reference, answer, label)`` triples.
        tokenizer: object exposing ``encode(text, text_pair, add_special_tokens)``
            and, optionally, ``pad_token_id``. Defaults to the module-level
            ``TOKENIZER``.
        max_length: length every example is padded to. Defaults to the
            module-level ``SEQUENCE_LENGTH``.

    Yields:
        ``(input_ids, attention_mask, label)`` where both lists have exactly
        ``max_length`` entries. Examples whose encoding is longer than
        ``max_length`` are skipped.
    """
    if tokenizer is None:
        tokenizer = TOKENIZER
    if max_length is None:
        max_length = SEQUENCE_LENGTH

    # Pad with the tokenizer's own pad id rather than a hard-coded 0: id 0 is a
    # regular special token in some vocabularies (RoBERTa's pad id is 1).
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    for r, a, l in loader:
        idx = tokenizer.encode(r, a, True)
        if len(idx) > max_length:
            continue

        padding = max_length - len(idx)
        mask = [1] * len(idx) + [0] * padding

        # Build a new list so the list returned by encode() is left untouched.
        yield idx + [pad_id] * padding, mask, l

In [7]:
def dataset(loader):
    """Materialise ``(input_ids, mask, label)`` rows into a ``TensorDataset``.

    ``loader`` is consumed completely. The returned dataset holds three
    tensors in row order: input ids (``torch.long``), masks (``torch.uint8``)
    and labels (``torch.uint8``).
    """
    rows = list(loader)

    def column(position, dtype):
        # One tensor per tuple position, stacked over all rows.
        return torch.tensor([row[position] for row in rows], dtype=dtype)

    return TensorDataset(
        column(0, torch.long),
        column(1, torch.uint8),
        column(2, torch.uint8),
    )

In [8]:
def save_model(model_cpu, step):
    """Save ``model_cpu.state_dict()`` to ``logs/<EXPERIMENT>/model_<step>.torch``.

    Args:
        model_cpu: object exposing ``state_dict()``.
        step: integer used in the checkpoint file name.
    """
    output_dir = 'logs/%s' % EXPERIMENT
    # exist_ok=True replaces the check-then-create pattern, which can fail if
    # the directory appears between the existence check and makedirs().
    os.makedirs(output_dir, exist_ok=True)

    torch.save(model_cpu.state_dict(), os.path.join(output_dir, 'model_%d.torch' % step))

In [9]:
# Print the number of (reference, answer, label) triples in each English split.
for t in ('train', 'unseen_answers', 'unseen_questions', 'unseen_domains'):
    c = sum(1 for _ in load_semeval(t, 'en'))
    print(t, c)

train 22167
unseen_answers 2402
unseen_questions 4632
unseen_domains 4562


In [10]:
# Training data: tokenized English train split, sampled in random order.
train_dataset = dataset(tokenize(load_semeval('train', 'en')))
train_sampler = RandomSampler(train_dataset)
# The name is rebound from the TensorDataset to its DataLoader; later cells
# use len(train_dataset) as the number of batches per epoch.
train_dataset = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size, drop_last=True)

# Validation loaders keyed '<split>_<lang>'. drop_last is False so that every
# example contributes to the metrics; dropping the final partial batch would
# silently exclude up to 31 examples per split.
val_datasets = {
    '%s_%s' % (split, lang): DataLoader(
        dataset(tokenize(load_semeval(split, lang))),
        batch_size=32,
        drop_last=False,
    )
    for lang in ('en', 'de')
    for split in ('unseen_answers', 'unseen_questions', 'unseen_domains')
}

In [11]:
# Logging, device placement, optimizer, mixed precision and LR schedule.
tensorboard = Tensorboard('logs/%s' % EXPERIMENT)
model = MODEL.to('cuda')

model.zero_grad()
# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.1},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learn_rate, eps=1e-8)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# One scheduler step is taken per optimizer step, i.e. once every
# `accumulation_steps` batches.
steps_per_epoch = len(train_dataset) // accumulation_steps
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=warmup_steps,
                                 t_total=steps_per_epoch * epochs)

# Fast-forward the schedule by `warp` epochs.
for _skipped in range(warp * steps_per_epoch):
    scheduler.step()

if pretrain > 0:
    # Freeze the encoder. RobertaForSequenceClassification exposes it as
    # `.roberta`; it has no `.transformer` attribute, so the previous
    # attribute access raised AttributeError whenever pretrain > 0.
    for param in MODEL.roberta.parameters():
        param.requires_grad = False

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [12]:
def _classification_scores(prefix, suffix, y_true, y_pred, n_classes):
    """Return ``(tag, value)`` pairs for the six metrics logged per validation set.

    Tags have the form ``<prefix>_<metric>_<suffix>``. Precision and recall are
    weighted over ``range(n_classes)``; F1 is reported as macro and micro.
    """
    labels = list(range(n_classes))
    return [
        (prefix + '_precision_' + suffix, precision_score(y_true, y_pred, labels=labels, average='weighted')),
        (prefix + '_recall_' + suffix, recall_score(y_true, y_pred, labels=labels, average='weighted')),
        (prefix + '_f1_macro_' + suffix, f1_score(y_true, y_pred, average='macro')),
        (prefix + '_f1_micro_' + suffix, f1_score(y_true, y_pred, average='micro')),
        (prefix + '_matthews_' + suffix, matthews_corrcoef(y_true, y_pred)),
        (prefix + '_accuracy_' + suffix, accuracy_score(y_true, y_pred, normalize=True)),
    ]


# Evaluate four times per epoch. Integer division is required: with the
# previous float division the modulo test only hit 0 when the number of
# batches was divisible by 4.
eval_interval = max(1, len(train_dataset) // 4)

for epoch in range(epochs):
    if pretrain == epoch:
        # Unfreeze the encoder (attribute is `.roberta`, not `.transformer`).
        for param in MODEL.roberta.parameters():
            param.requires_grad = True

    model.train()
    for step, batch in enumerate(train_dataset):
        # Number of training examples seen so far, used as the x-axis for all logs.
        global_step = step * train_batch_size + len(train_dataset) * (epoch + warp) * train_batch_size

        outputs = model(
            batch[0].long().to('cuda'),
            attention_mask = batch[1].long().to('cuda'),
            labels = batch[2].long().to('cuda')
        )

        # NOTE(review): the loss is not divided by accumulation_steps, so the
        # accumulated gradient is a sum over micro-batches, not a mean.
        loss = outputs[0].mean()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        tensorboard.log_scalar('loss', loss.item(), global_step)

        if (step + 1) % eval_interval != 0:
            continue

        print('Step {}/{}'.format(step, len(train_dataset)))

        save_model(MODEL, global_step)
        model.eval()
        scores = []
        for key, val_dataset in val_datasets.items():
            y_true = []
            y_pred = []
            y_orig = []

            # Separate names keep the training `batch`/`outputs` from being shadowed.
            for val_batch in val_dataset:
                with torch.no_grad():
                    val_outputs = model(
                        val_batch[0].long().to('cuda'),
                        attention_mask = val_batch[1].long().to('cuda')
                    )

                logits = val_outputs[0].to('cpu').float()
                # Labels are the third tensor of the dataset (index 2); index 3
                # does not exist and raised IndexError.
                y_true.extend(val_batch[2].view(-1).numpy())
                y_pred.extend(logits.argmax(1).numpy())
                y_orig.extend(logits.numpy())
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            y_orig = np.array(y_orig)
            pd.DataFrame(y_orig, columns = [ 'contradictory', 'incorrect', 'correct' ]).to_csv('logs/{}/{}_epoch_{}_step_{}.csv'.format(EXPERIMENT, key, epoch, step))

            scores.extend(_classification_scores(key, '3_way', y_true, y_pred, num_labels))

            # Collapse to 2 classes and score again.
            y_pred_2way = [ project_semeval(int(v)) for v in y_pred ]
            y_true_2way = [ project_semeval(int(v)) for v in y_true ]
            scores.extend(_classification_scores(key, '2_way_projected', y_true_2way, y_pred_2way, 2))

        for k, s in scores:
            tensorboard.log_scalar(k, s, global_step)
        model.train()

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0




Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0


RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 1; 10.76 GiB total capacity; 9.83 GiB already allocated; 6.62 MiB free; 21.72 MiB cached)