In [None]:
import os
import pandas as pd
import random
from scipy import stats
from scipy.stats import spearmanr
import numpy as np
import re
from sklearn import model_selection
from sklearn import preprocessing
from tqdm import tqdm_notebook as tqdm
from collections import OrderedDict
import joblib

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
#from apex import amp

from transformers import AdamW, get_linear_schedule_with_warmup

import logging
from logger import logger
logging.getLogger("ppb.tokenization_utils").setLevel(logging.ERROR)
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

In [None]:
# Run on the first CUDA device when one is available, otherwise fall back to CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Training hyper-parameters; each one can be overridden through an environment
# variable of the same name and is coerced to int.
EPOCHS = int(os.environ.get("EPOCHS", 10))
TRAIN_BATCH_SIZE = int(os.environ.get("TRAIN_BATCH_SIZE", 4))
# Used as the batch size of the validation loader in the training script below.
TEST_BATCH_SIZE = int(os.environ.get("TEST_BATCH_SIZE", 4))
# Pretrained checkpoint name passed to both the tokenizer and the encoder loader.
BASE_MODEL = 'xlnet-large-cased'#'xlnet-base-cased'
# Input CSV locations (environment-overridable); defaults use a Kaggle-style ../input layout.
TRAINING_DATASET = os.environ.get("TRAINING_DATASET", "../input/google-quest-challenge/train_group_folds.csv")
TEST_DATASET = os.environ.get("TEST_DATASET", "../input/google-quest-challenge/test.csv")
SAMPLE_SUBMISSION = os.environ.get("SAMPLE_SUBMISSION", "../input/google-quest-challenge/sample_submission.csv")
# Number of batches whose gradients are accumulated before each optimizer step.
ACCUMULATION_STEPS = int(os.environ.get("ACCUMULATION_STEPS", 2))

# Default seed consumed by seed_torch().
SEED = 19840916

In [None]:
def seed_torch(seed=SEED):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus all CUDA devices), then configures cuDNN for
    reproducible kernel selection.

    Args:
        seed: Integer seed. Defaults to the notebook-level ``SEED`` constant.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Deterministic kernels are not sufficient on their own: in benchmark mode
    # cuDNN may auto-tune to a different algorithm on each run, so pin it off.
    torch.backends.cudnn.benchmark = False

In [None]:
class AverageMeter:
    """Running weighted average of a scalar.

    Attributes (all reset to 0 by ``reset``):
        val:   most recent value passed to ``update``.
        sum:   weighted sum of all values seen so far.
        count: total weight seen so far.
        avg:   ``sum / count`` after the latest update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        weighted = val * n
        self.sum = self.sum + weighted
        self.count = self.count + n
        self.avg = self.sum / self.count

In [None]:
class EarlyStopping:
    """Checkpoint the model on improvement and flag when training should stop.

    Call the instance once per epoch with the validation score. When the score
    is at least as good as the best one seen so far, a checkpoint is written
    and the patience counter is reset; otherwise the counter is incremented
    and ``early_stop`` becomes True once it reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving epochs tolerated.
        mode: ``"max"`` if a larger score is better, ``"min"`` if smaller is better.
    """

    def __init__(self, patience=7, mode="max"):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        # Lowercase np.inf: the np.Inf alias no longer exists in NumPy 2.0.
        self.val_score = np.inf if self.mode == "min" else -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Register the score of one epoch.

        Args:
            epoch_score: Validation score for the epoch just finished.
            model: Model to checkpoint (optionally wrapped, e.g. in DataParallel).
            model_path: Destination of the checkpoint file.
        """
        # Internally everything is "higher is better"; flip the sign for "min".
        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Write ``model``'s state dict to ``model_path`` if the score is finite.

        ``val_score`` is updated to ``epoch_score`` in every case.
        """
        # np.isfinite rejects +/-inf and NaN; a list-membership test cannot catch
        # a computed NaN because NaN never compares equal to itself.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            if isinstance(model_path, (str, os.PathLike)):
                target_dir = os.path.dirname(model_path)
                if target_dir:
                    # torch.save does not create missing parent directories.
                    os.makedirs(target_dir, exist_ok=True)
            # Parallel wrappers expose the real network as `.module`; a plain
            # single-device model has no such attribute and is saved directly, so
            # checkpoint keys are identical in both cases.
            network = getattr(model, "module", model)
            torch.save(network.state_dict(), model_path)
        self.val_score = epoch_score


In [None]:
def clean_tag(text):
    """Replace inline math markup and URL-like spans with placeholder tokens.

    * ``[math]...math]`` spans (non-greedy, single line) become ``[formula]``.
    * When the text contains ``http`` or ``www``, every span matching the URL
      pattern below becomes ``[url]``.

    Args:
        text: Input string.

    Returns:
        The text with the substitutions applied.
    """
    # Raw strings keep the regex escapes (\[, \w, \/ ...) out of Python's own
    # string-escape processing; the compiled patterns are unchanged.
    if '[math]' in text:
        text = re.sub(r'\[math\].*?math\]', '[formula]', text)
    if 'http' in text or 'www' in text:
        text = re.sub(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+', '[url]', text)

    return text

In [None]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def convert_example_to_features(text1,
                                text2,
                                max_seq_length,
                                tokenizer,
                                cls_token_at_end=True,
                                cls_token='[CLS]',
                                cls_token_segment_id=1,
                                sep_token='[SEP]',
                                sep_token_extra=False,
                                pad_on_left=False,
                                pad_token=0,
                                pad_token_segment_id=0,
                                sequence_a_segment_id=0,
                                sequence_b_segment_id=1,
                                mask_padding_with_zero=True):
    """Turn one text (or text pair) into fixed-length model inputs.

    Layout of the token sequence before padding:
        - ``cls_token_at_end=True`` (the default here):  A + SEP [+ B + SEP] + CLS
        - ``cls_token_at_end=False``:                    CLS + A + SEP [+ B + SEP]
    With ``sep_token_extra=True`` a second SEP follows A.

    Args:
        text1: First text (sequence A).
        text2: Optional second text (sequence B); ``None`` for a single sequence.
        max_seq_length: Length of every returned list.
        tokenizer: Object providing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
        cls_token, sep_token: Special token strings inserted into the sequence.
        cls_token_segment_id, sequence_a_segment_id, sequence_b_segment_id,
        pad_token_segment_id: Segment ids for the respective positions.
        pad_on_left: Pad in front of the real tokens instead of behind them.
        pad_token: Id used to fill padded positions of ``input_ids``.
        mask_padding_with_zero: If True, real tokens get mask 1 and padding 0;
            if False the two values are swapped.

    Returns:
        ``(input_ids, input_mask, segment_ids)``, three lists of length
        ``max_seq_length``.

    NOTE(review): the callers in this notebook pass an XLNet tokenizer but rely
    on the '[CLS]' / '[SEP]' string defaults and ``pad_token=0`` — confirm these
    are the pieces/ids that tokenizer's vocabulary actually uses.
    """
    first = tokenizer.tokenize(text1)
    second = None

    if text2 is None:
        # Single sequence: reserve room for SEP + CLS (+ the extra SEP).
        reserved = 3 if sep_token_extra else 2
        budget = max_seq_length - reserved
        if len(first) > budget:
            first = first[:budget]
    else:
        second = tokenizer.tokenize(text2)
        # Pair: reserve room for SEP + SEP + CLS (+ the extra SEP); both lists
        # are shortened in place by the helper.
        reserved = 4 if sep_token_extra else 3
        _truncate_seq_pair(first, second, max_seq_length - reserved)

    # Sequence A and its separator(s) all carry the A segment id.
    separators = [sep_token, sep_token] if sep_token_extra else [sep_token]
    tokens = first + separators
    segment_ids = [sequence_a_segment_id for _ in tokens]

    # An empty (or absent) sequence B contributes nothing, not even its SEP.
    if second:
        tokens.extend(second)
        tokens.append(sep_token)
        segment_ids.extend([sequence_b_segment_id] * (len(second) + 1))

    if cls_token_at_end:
        tokens.append(cls_token)
        segment_ids.append(cls_token_segment_id)
    else:
        tokens.insert(0, cls_token)
        segment_ids.insert(0, cls_token_segment_id)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Mask values for real vs. padded positions.
    if mask_padding_with_zero:
        real_flag, pad_flag = 1, 0
    else:
        real_flag, pad_flag = 0, 1
    input_mask = [real_flag] * len(input_ids)

    # Fill up to the requested length on the chosen side.
    padding_length = max_seq_length - len(input_ids)
    ids_fill = [pad_token] * padding_length
    mask_fill = [pad_flag] * padding_length
    segment_fill = [pad_token_segment_id] * padding_length
    if pad_on_left:
        input_ids = ids_fill + input_ids
        input_mask = mask_fill + input_mask
        segment_ids = segment_fill + segment_ids
    else:
        input_ids = input_ids + ids_fill
        input_mask = input_mask + mask_fill
        segment_ids = segment_ids + segment_fill

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids

In [None]:
class DatasetTraining:
    """Map-style dataset yielding tokenized question/answer pairs with targets.

    The question input is built from (title, body); the answer input is built
    from (answer, category) when ``category`` is given, otherwise from the
    answer alone.

    Args:
        qtitle, qbody, answer: Index-aligned sequences of texts.
        targets: Array indexed as ``targets[item, :]`` holding the labels of each row.
        tokenizer: Tokenizer forwarded to ``convert_example_to_features``.
        max_length: Fixed length of every encoded sequence.
        category: Optional index-aligned sequence of category texts.
    """

    def __init__(self, qtitle, qbody, answer, targets, tokenizer, max_length, category=None):
        self.qtitle = qtitle
        self.qbody = qbody
        self.answer = answer
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.category = category

    def __len__(self):
        """Number of examples, taken from the answer sequence."""
        return len(self.answer)

    def __getitem__(self, item):
        """Encode example ``item`` into a dict of tensors."""
        title_text = str(self.qtitle[item])
        body_text = str(self.qbody[item])
        answer_text = str(self.answer[item])

        q_ids, q_mask, q_segments = convert_example_to_features(
            text1=title_text,
            text2=body_text,
            max_seq_length=self.max_length,
            tokenizer=self.tokenizer
        )

        # The category, when available, rides along as the answer's second segment.
        second_text = None if self.category is None else str(self.category[item])
        a_ids, a_mask, a_segments = convert_example_to_features(
            text1=answer_text,
            text2=second_text,
            max_seq_length=self.max_length,
            tokenizer=self.tokenizer
        )

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            'question_ids': as_long(q_ids),
            'question_mask': as_long(q_mask),
            'question_token_type_ids': as_long(q_segments),
            'answer_ids': as_long(a_ids),
            'answer_mask': as_long(a_mask),
            'answer_token_type_ids': as_long(a_segments),
            'targets': torch.tensor(self.targets[item, :], dtype=torch.float)
        }


class DatasetTest:
    """Map-style dataset yielding tokenized question/answer pairs without targets.

    The answer input is built from (answer, category) when ``category`` is
    given, otherwise from the answer alone; the question input is built from
    (title, body).

    Args:
        qtitle, qbody, answer: Index-aligned sequences of texts.
        tokenizer: Tokenizer forwarded to ``convert_example_to_features``.
        max_length: Fixed length of every encoded sequence.
        category: Optional index-aligned sequence of category texts.
    """

    def __init__(self, qtitle, qbody, answer, tokenizer, max_length, category=None):
        self.qtitle = qtitle
        self.qbody = qbody
        self.answer = answer
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.category = category

    def __len__(self):
        """Number of examples, taken from the answer sequence."""
        return len(self.answer)

    def __getitem__(self, item):
        """Encode example ``item`` into a dict of tensors."""
        title_text = str(self.qtitle[item])
        body_text = str(self.qbody[item])
        answer_text = str(self.answer[item])

        # The category, when available, rides along as the answer's second segment.
        second_text = None if self.category is None else str(self.category[item])
        a_ids, a_mask, a_segments = convert_example_to_features(
            text1=answer_text,
            text2=second_text,
            max_seq_length=self.max_length,
            tokenizer=self.tokenizer
        )

        q_ids, q_mask, q_segments = convert_example_to_features(
            text1=title_text,
            text2=body_text,
            max_seq_length=self.max_length,
            tokenizer=self.tokenizer
        )

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            'question_ids': as_long(q_ids),
            'question_mask': as_long(q_mask),
            'question_token_type_ids': as_long(q_segments),
            'answer_ids': as_long(a_ids),
            'answer_mask': as_long(a_mask),
            'answer_token_type_ids': as_long(a_segments)
        }


In [None]:
class XLNetBaseUncased(nn.Module):
    """Shared XLNet encoder with attention pooling and two linear heads.

    The same pretrained encoder processes the question and the answer. Each
    sequence of hidden states is pooled into one vector with a learned
    attention distribution. ``out1`` maps the question vector to 21 logits and
    ``out2`` maps the concatenated question+answer vectors to 9 logits; the
    forward pass returns all 30 logits side by side.
    """

    def __init__(self):
        super(XLNetBaseUncased, self).__init__()

        self.pretrain = transformers.XLNetModel.from_pretrained(BASE_MODEL)
        self.drop = nn.Dropout(0.3)
        # Take the width from the loaded checkpoint so that switching BASE_MODEL
        # (e.g. to the base-sized variant noted next to its definition) does not
        # leave the heads with a mismatched input size.
        self.hidden_size = self.pretrain.config.d_model

        self.out1 = nn.Linear(self.hidden_size, 21)
        self.out2 = nn.Linear(self.hidden_size * 2, 9)

        self.W_s1 = nn.Linear(self.hidden_size, 256, bias=True)
        self.W_s2 = nn.Linear(256, 128, bias=True)

    def attention_net(self, lstm_output):
        """Compute attention weights over the sequence positions.

        A two-layer projection (hidden_size -> 256 -> tanh -> 128) scores every
        position 128 times; the scores are transposed and softmax-normalised
        along the sequence axis.

        Args:
            lstm_output: Hidden states of shape (batch, seq_len, hidden_size).
                The parameter name is historical; in ``forward`` it receives
                the XLNet encoder output.

        Returns:
            Tensor of shape (batch, 128, seq_len) whose rows each sum to 1.

        NOTE(review): the attention mask is not consulted here, so padded
        positions take part in the softmax.
        """
        attn_weight_matrix = self.W_s2(torch.tanh(self.W_s1(lstm_output)))
        attn_weight_matrix = attn_weight_matrix.permute(0, 2, 1)
        attn_weight_matrix = F.softmax(attn_weight_matrix, dim=2)

        return attn_weight_matrix

    def _encode(self, input_ids, attention_mask):
        """Encode one batch of sequences into a single dropout-regularised vector each."""
        hidden = self.pretrain(input_ids, attention_mask=attention_mask)[0]
        weights = self.attention_net(hidden)
        # (batch, 128, seq_len) @ (batch, seq_len, hidden) -> (batch, 128, hidden)
        pooled = torch.matmul(weights, hidden)
        # Only the first of the 128 attention rows is used as the sequence vector.
        pooled = pooled[:, 0, :]
        return self.drop(pooled)

    def forward(self,
                question_ids,
                question_mask,
                question_token_type_ids,
                answer_ids,
                answer_mask,
                answer_token_type_ids,
                labels=None
                ):
        """Return a (batch, 30) tensor of logits.

        Args:
            question_ids, question_mask: Token ids and attention mask of the question.
            answer_ids, answer_mask: Token ids and attention mask of the answer.
            question_token_type_ids, answer_token_type_ids, labels: Accepted for
                call-site compatibility; not used by the computation.

        Returns:
            Columns 0-20 come from the question vector alone, columns 21-29
            from the concatenated question and answer vectors.
        """
        # Question first, then answer — keeps the dropout RNG consumption order.
        q_out = self._encode(question_ids, question_mask)
        a_out = self._encode(answer_ids, answer_mask)

        final1 = self.out1(q_out)
        final2 = self.out2(torch.cat((q_out, a_out), 1))

        final = torch.cat([final1, final2], 1)
        return final

In [None]:
def loss_fn(preds, labels):
    """Sum of per-column binary cross-entropy losses on logits.

    Each column is reduced to its batch mean and the column means are added,
    so every target contributes equally regardless of the batch size.

    Args:
        preds: Logits of shape (batch, n_targets).
        labels: Targets in [0, 1] with the same shape as ``preds``.

    Returns:
        Scalar tensor (the float ``0.`` if ``preds`` has no columns).
    """
    # One criterion instance is enough; it holds no per-call state.
    criterion = nn.BCEWithLogitsLoss()
    class_loss = 0.
    # Iterate over the columns actually present rather than a fixed count: a
    # fixed range would average over empty slices (NaN) for narrower inputs
    # and silently skip trailing columns of wider ones.
    for i in range(preds.shape[1]):
        class_loss = class_loss + criterion(preds[:, i:i+1], labels[:, i:i+1])
    return class_loss


In [None]:
def train(epoch, dataset, data_loader, model, optimizer, accumulation_steps, scheduler=None):
    """Run one training epoch with gradient accumulation.

    Args:
        epoch: Index of the current epoch (not used by the computation).
        dataset: Dataset behind ``data_loader`` (kept for call-site compatibility).
        data_loader: Iterable of batches; each batch is a dict holding the six
            input tensors plus ``"targets"``.
        model: Network to train; called with the batch tensors as keyword arguments.
        optimizer: Optimizer stepped once per accumulation group.
        accumulation_steps: Number of batches whose gradients are summed before
            each optimizer step.
        scheduler: Optional learning-rate scheduler stepped together with the
            optimizer. When omitted, a notebook-level variable named
            ``scheduler`` is used if one exists.

    Returns:
        OrderedDict with the average ``loss`` and the mean column-wise
        ``spearman`` correlation between targets and sigmoid outputs.
    """
    if scheduler is None:
        # Existing call sites do not pass the scheduler and rely on the global.
        scheduler = globals().get("scheduler")

    losses = AverageMeter()
    fin_targets = []
    fin_outputs = []

    model.train()
    # Start from clean gradients so nothing left over from a previous epoch is
    # mixed into the first update of this one.
    optimizer.zero_grad()

    num_batches = len(data_loader)
    tk0 = tqdm(data_loader, total=num_batches)
    long_keys = (
        "question_ids", "question_mask", "question_token_type_ids",
        "answer_ids", "answer_mask", "answer_token_type_ids",
    )
    for bi, d in enumerate(tk0):
        inputs = {key: d[key].to(DEVICE, dtype=torch.long) for key in long_keys}
        targets = d["targets"].to(DEVICE, dtype=torch.float)

        outputs = model(labels=targets, **inputs)
        loss = loss_fn(outputs, targets)

        fin_targets.append(targets.cpu().detach().numpy())
        fin_outputs.append(torch.sigmoid(outputs).cpu().detach().numpy())

        loss.backward()

        # Step after every full accumulation group, and also on the very last
        # batch so a trailing partial group is applied instead of being carried
        # into the next epoch (or dropped after the final one).
        is_last_batch = (bi + 1) == num_batches
        if (bi + 1) % accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            optimizer.zero_grad()

        losses.update(loss.item(), inputs["question_ids"].size(0))
        tk0.set_postfix(loss=losses.avg)

    # Concatenate once instead of once per column.
    all_targets = np.concatenate(fin_targets)
    all_outputs = np.concatenate(fin_outputs)
    n_cols = all_targets.shape[1]
    spear = 0
    for i in range(n_cols):
        corr = spearmanr(all_targets[:, i], all_outputs[:, i]).correlation
        # Columns with an undefined correlation (NaN) contribute 0.
        spear += np.nan_to_num(corr / n_cols)

    log = OrderedDict([
        ('loss', losses.avg),
        ('spearman', spear)
    ])
    tk0.close()
    return log

In [None]:
def evaluate(dataset, data_loader, model, df_train, sample):
    """Score the model on a labelled data loader without updating it.

    Args:
        dataset: Dataset behind ``data_loader``; its length sizes the progress bar.
        data_loader: Iterable of batches; each batch is a dict holding the six
            input tensors plus ``"targets"``.
        model: Network to evaluate (switched to eval mode here).
        df_train, sample: Accepted for call-site compatibility; not used.

    Returns:
        OrderedDict with the average ``loss`` and the mean column-wise
        ``spearman`` correlation between targets and sigmoid outputs.
    """
    loss_meter = AverageMeter()
    model.eval()

    collected_targets = []
    collected_outputs = []
    long_keys = (
        "question_ids", "question_mask", "question_token_type_ids",
        "answer_ids", "answer_mask", "answer_token_type_ids",
    )

    with torch.no_grad():
        progress = tqdm(data_loader, total=int(len(dataset) / data_loader.batch_size))
        for batch in progress:
            inputs = {key: batch[key].to(DEVICE, dtype=torch.long) for key in long_keys}
            targets = batch["targets"].to(DEVICE, dtype=torch.float)

            outputs = model(labels=targets, **inputs)
            loss = loss_fn(outputs, targets)

            collected_targets.append(targets.cpu().detach().numpy())
            collected_outputs.append(torch.sigmoid(outputs).cpu().detach().numpy())

            loss_meter.update(loss.item(), inputs["question_ids"].size(0))
            progress.set_postfix(loss=loss_meter.avg)

    all_outputs = np.vstack(collected_outputs)
    all_targets = np.vstack(collected_targets)

    # One Spearman coefficient per target column; undefined ones (NaN) count as 0.
    per_column = []
    for col in range(all_targets.shape[1]):
        coef, _ = stats.spearmanr(list(all_targets[:, col]), list(all_outputs[:, col]))
        per_column.append(np.nan_to_num(coef))
    mean_spearman = np.mean(per_column)

    result = OrderedDict([
        ('loss', loss_meter.avg),
        ('spearman', mean_spearman)
    ])
    progress.close()
    return result

In [None]:
if __name__ == "__main__":
    # End-to-end cross-validated training: load data, build ordinal training
    # targets, then train and checkpoint one model per fold.
    seed_torch()

    df = pd.read_csv(TRAINING_DATASET).fillna("none")
    df_test = pd.read_csv(TEST_DATASET).fillna("none")
    sample = pd.read_csv(SAMPLE_SUBMISSION)

    # Every submission column except the id is a prediction target.
    target_cols = list(sample.drop("qa_id", axis=1).columns)
    for col in target_cols:
        # Map each distinct target value to its rescaled cumulative frequency
        # and store the result in a parallel "<col>_ord" column.
        u, c = np.unique(df[col].values, return_counts=True)
        v = (np.cumsum(c) - c[0]) / c.sum()
        mn = v.min()
        mx = v.max()
        # A column with a single distinct value has mx == mn; map it to 0
        # instead of dividing by zero and filling the targets with NaN.
        v = (v - mn) / (mx - mn) if mx > mn else np.zeros_like(v)
        d = dict(zip(u, v))
        df[col + '_ord'] = df[col].map(d)

    target_cols_ord = [x + '_ord' for x in target_cols]

    df['question_title'] = df['question_title'].apply(clean_tag)
    df['question_body'] = df['question_body'].apply(clean_tag)
    df['answer'] = df['answer'].apply(clean_tag)

    max_len = 512

    tokenizer = transformers.XLNetTokenizer.from_pretrained(BASE_MODEL)
    tokenizers = {"tokenizer": tokenizer, "max_len": max_len}
    joblib.dump(tokenizers, f"tokenizers_{BASE_MODEL}.pkl")

    # Checkpoints are written below this directory; make sure it exists before
    # the first save, which would otherwise fail.
    os.makedirs("./checkpoints", exist_ok=True)

    NFOLDS = 5

    CURRENT_FOLD = 0
    for fold in range(NFOLDS):

        print('Start: ', fold, 'fold')
        CURRENT_FOLD = fold

        df_train = df.loc[df.kfold != fold, :]
        df_valid = df.loc[df.kfold == fold, :]
        # Train on the ordinal-encoded targets, validate against the raw ones.
        train_targets = df_train[target_cols_ord].values
        valid_targets = df_valid[target_cols].values

        print(df_train.shape, df_valid.shape)

        train_qtitle = df_train.question_title.values.astype(str).tolist()
        train_qbody = df_train.question_body.values.astype(str).tolist()
        train_answer = df_train.answer.values.astype(str).tolist()

        valid_qtitle = df_valid.question_title.values.astype(str).tolist()
        valid_qbody = df_valid.question_body.values.astype(str).tolist()
        valid_answer = df_valid.answer.values.astype(str).tolist()

        train_dataset = DatasetTraining(
            qtitle=train_qtitle,
            qbody=train_qbody,
            answer=train_answer,
            targets=train_targets,
            tokenizer=tokenizer,
            max_length=max_len
        )
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=TRAIN_BATCH_SIZE,
            shuffle=True,
            num_workers=4
        )

        valid_dataset = DatasetTraining(
            qtitle=valid_qtitle,
            qbody=valid_qbody,
            answer=valid_answer,
            targets=valid_targets,
            tokenizer=tokenizer,
            max_length=max_len
        )
        # The validation metrics do not depend on row order, so there is no
        # reason to shuffle (and spend random state on) the validation loader.
        valid_data_loader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size=TEST_BATCH_SIZE,
            shuffle=False,
            num_workers=4
        )

        model = XLNetBaseUncased()
        model.to(DEVICE)
        # Weight decay is applied to everything except biases and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        # Total number of optimizer updates over all epochs, used as the length
        # of the linear learning-rate decay.
        num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE / ACCUMULATION_STEPS * EPOCHS)
        optimizer = AdamW(optimizer_grouped_parameters, lr=1.5e-5)
        # NOTE: train() picks this scheduler up through the notebook-level name
        # `scheduler`; keep the variable name in sync with that function.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=num_train_steps
        )

        if torch.cuda.device_count() > 1:
            logger.info("Let's use %s GPUs!" % torch.cuda.device_count())
            model = nn.DataParallel(model)

        es = EarlyStopping(patience=5, mode="max")

        logger.info("Training batch size: %s" % TRAIN_BATCH_SIZE)
        logger.info("Test batch size: %s" % TEST_BATCH_SIZE)
        logger.info("Epochs: %s" % EPOCHS)
        logger.info("Number of training samples: %s" % len(train_dataset))
        logger.info("Number of validation samples: %s" % len(valid_dataset))

        for epoch in range(EPOCHS):
            logger.info("Training Epoch: %s" % epoch)
            log = train(epoch, train_dataset, train_data_loader, model, optimizer, accumulation_steps=ACCUMULATION_STEPS)
            logger.info("Validation Epoch: %s" % epoch)
            val_log = evaluate(valid_dataset, valid_data_loader, model, df_train, sample)
            # Checkpoint on improvement of the validation Spearman score.
            es(val_log["spearman"], model, model_path=f"./checkpoints/{BASE_MODEL}_{CURRENT_FOLD}.bin")
            if es.early_stop:
                logger.info("Early stopping")
                break

        # Release per-fold objects before the next fold allocates a fresh model.
        del model, train_dataset, train_data_loader, valid_dataset, valid_data_loader, optimizer
        torch.cuda.empty_cache()
