In [1]:
import re
import gc
import pickle
import numpy as np
import pandas as ps
from scipy.stats import spearmanr
from tqdm.auto import tqdm
from pathlib import Path
from itertools import chain
from collections import Counter
from abc import ABC, abstractmethod
from typing import List, Dict, Tuple, Set
from transformers import BertTokenizer

import numpy as np
import pandas as ps
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Run on the first GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
# Data files live in ../data relative to the notebook.
data_dir = Path('..', 'data')

train_df = ps.read_csv(data_dir.joinpath('train.csv'))
test_df = ps.read_csv(data_dir.joinpath('test.csv'))


class AvgModel:
    """Ensemble that averages the sigmoid outputs of several TorchScript models."""

    def __init__(self, *models):
        """Load each checkpoint path in ``models`` onto ``device`` and put it in eval mode."""
        self.models = [
            torch.jit.load(str(checkpoint), map_location=device)
            for checkpoint in models
        ]
        for net in self.models:
            net.eval()

    def __call__(self, *inputs):
        """Feed ``inputs`` to every model and return the mean of their sigmoid outputs."""
        probabilities = [torch.sigmoid(net(*inputs)) for net in self.models]
        # Stack along a new leading "model" axis, then average that axis away.
        return torch.stack(probabilities, dim=0).mean(dim=0)
        

# Two-checkpoint ensemble; the checkpoints sit one directory above the notebook.
# The trailing numbers were noted next to each checkpoint by the author
# (presumably a validation score — TODO confirm).
model = AvgModel(
    Path('..', 'bert_large_f_1.pt'),  # 0.3940
    Path('..', 'bert_large_f_2.pt'),  # 0.3908
)


# Shapes of the train and test frames, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(6079, 41)
(476, 11)


In [3]:
# Names of the 30 label columns: 21 question-level ones followed by 9 answer-level ones.
# They are used to select the ground-truth columns from the training frame.
# Presumably the order matches the model's output columns — TODO confirm.
targets = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written'
]

# Columns holding the raw text fields (title, body, answer).
# This list is not referenced elsewhere in the visible code.
text_columns = [
    'question_title', 
    'question_body', 
    'answer'
]

In [4]:
from math import floor, ceil

# Length every model input is padded to:
# [CLS] + question part + [SEP] + answer part + [SEP] (+ padding).
MAX_LEN = 512
# Token budget for the "title,body" part of the input.
MAX_QUESTION_LEN = 250
# The answer gets what is left after the three special tokens: 512 - 250 - 3 = 259.
MAX_ANSWER_LEN = MAX_LEN - MAX_QUESTION_LEN - 3
# Not referenced in the visible code; presumably the [SEP] id of the BERT vocab — TODO confirm.
SEP_TOKEN_ID = 102


class TransformerFieldsDataset(Dataset):
    """Dataset that turns (title, body, answer) rows into padded BERT inputs.

    Every item is a pair of ``LongTensor``s:

    * token ids laid out as ``[CLS] title,body [SEP] answer [SEP]`` and padded
      with the tokenizer's ``[PAD]`` id up to ``MAX_LEN``;
    * segment ids: 0 up to and including the first ``[SEP]``, 1 for everything
      after it, and 0 for the padding positions.
    """

    def __init__(self, 
                 df: ps.DataFrame,
                 tokenizer_dir: str,
                 field: str = None,
                 train_mode: bool = True,
                 **kwargs):
        """Store the frame and load the tokenizer.

        Args:
            df: frame with ``question_title``, ``question_body`` and ``answer`` columns.
            tokenizer_dir: value passed to ``BertTokenizer.from_pretrained``.
            field: stored on the instance; not read anywhere in this class.
            train_mode: if True, over-long texts are shortened by dropping a random
                contiguous span; if False, by keeping the head and the tail.
            **kwargs: accepted and ignored.
        """
        self.df: ps.DataFrame = df
        self.field = field
        self.train_mode = train_mode
        self.tokenizer: BertTokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
        self.PAD = self.tokenizer.vocab["[PAD]"]  # or 0 token

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.df.shape[0]

    def _select_tokens(self, tokens, max_num):
        """Shorten ``tokens`` to at most ``max_num`` items.

        Lists that already fit are returned unchanged. In train mode a random
        contiguous span is removed; otherwise the first ``max_num // 2`` and the
        last ``max_num - max_num // 2`` tokens are kept.
        """
        if len(tokens) <= max_num:
            return tokens
        if max_num <= 0:
            # Nothing may be kept. Without this guard ``tokens[-0:]`` would return
            # the whole list and ``randint`` would be handed an empty range.
            return []
        if self.train_mode:
            num_remove = len(tokens) - max_num
            # The span start is drawn from [0, max_num - 2]; the exclusive upper bound
            # is clamped to 1 so that max_num == 1 still gives randint a valid range.
            remove_start = np.random.randint(0, max(max_num - 1, 1))
            return tokens[:remove_start] + tokens[remove_start + num_remove:]
        head = max_num // 2
        tail = max_num - head  # >= 1 here because max_num >= 1
        return tokens[:head] + tokens[-tail:]

    def _build_tokens(self, title, question, answer):
        """Tokenize and truncate both parts, then add the special tokens."""
        title_body = self._select_tokens(
            self.tokenizer.tokenize(title + "," + question), 
            max_num=MAX_QUESTION_LEN
        )
        ans = self._select_tokens(
            self.tokenizer.tokenize(answer), 
            max_num=MAX_ANSWER_LEN
        )
        tokens = ["[CLS]"] + title_body + ["[SEP]"] + ans + ["[SEP]"]
        return tokens

    def _build_segments(self, tokens):
        """Return one segment id per token: 0 through the first ``[SEP]``, 1 afterwards."""
        segments = []
        current_segment_id = 0
        for token in tokens:
            segments.append(current_segment_id)
            # The separator itself keeps the current id; the switch applies to later tokens.
            if token == "[SEP]":
                current_segment_id = 1
        return segments

    def __getitem__(self, idx):
        """Return ``(token_ids, segments)`` for the row at position ``idx``."""
        index = self.df.index[idx]
        title = self.df.at[index, "question_title"]
        body = self.df.at[index, "question_body"]
        answer = self.df.at[index, "answer"]

        tokens = self._build_tokens(title, body, answer)
        segments = self._build_segments(tokens)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # token_ids and segments have the same length (one entry per token).
        pad_len = MAX_LEN - len(token_ids)
        if pad_len > 0:
            token_ids += [self.PAD] * pad_len
            # Segment ids are not vocabulary ids: pad them with segment 0, not with
            # the [PAD] token id (the two only coincide when [PAD] has id 0).
            segments += [0] * pad_len
        
        token_ids = torch.LongTensor(token_ids)
        segments = torch.LongTensor(segments)
        return token_ids, segments

In [5]:
# Module-level tokenizer used by the length helpers below; it is loaded from the same
# directory that is later passed as tokenizer_dir to TransformerFieldsDataset.
tokenizer = BertTokenizer.from_pretrained("../data/uncased_L-12_H-768_A-12")


def combined_len(title, body, answer):
    """Count the tokens of the untruncated ``[CLS] title,body [SEP] answer [SEP]`` input."""
    question_part = tokenizer.tokenize(title + "," + body)
    answer_part = tokenizer.tokenize(answer)
    # +3 accounts for [CLS] and the two [SEP] markers.
    return len(question_part) + len(answer_part) + 3


def field_len(feature):
    """Return how many tokens the module-level tokenizer produces for ``feature``."""
    return len(tokenizer.tokenize(feature))


# Add token-length columns to both frames (the frames are modified in place).
for df in (train_df, test_df):
    # Length of the full, untruncated model input for every row.
    df["sequences_len"] = [
        combined_len(title, body, answer)
        for title, body, answer in zip(df["question_title"], df["question_body"], df["answer"])
    ]
    # Token counts of the individual text fields.
    df["title_len"] = df["question_title"].map(field_len)
    df["body_len"] = df["question_body"].map(field_len)
    df["answer_len"] = df["answer"].map(field_len)

In [6]:
# Split the training rows by whether the untruncated input fits into the model.
# The threshold is MAX_LEN (the length the dataset pads to) rather than a second
# hard-coded 512, so the split cannot drift away from the dataset's limit.
long_seqs_train_df = train_df[train_df["sequences_len"] > MAX_LEN]
short_seqs_train_df = train_df[train_df["sequences_len"] <= MAX_LEN]

In [7]:
def compute_spearmanr(trues, preds):
    """Return the mean column-wise Spearman rank correlation.

    Args:
        trues: 2-D array-like of ground-truth values, one column per target.
        preds: 2-D array-like of predictions with the same layout as ``trues``.

    Returns:
        The average of the per-column correlations. Columns whose correlation is
        undefined (``spearmanr`` returns NaN, e.g. when a column is constant) are
        left out of the average instead of turning the whole score into NaN.
        The result is NaN only if no column has a defined correlation.
    """
    trues = np.asarray(trues)
    preds = np.asarray(preds)
    rhos = [
        spearmanr(col_trues, col_pred).correlation
        for col_trues, col_pred in zip(trues.T, preds.T)
    ]
    return np.nanmean(rhos)

## Checking model performance on long sequences (more than 512 tokens before truncation)

In [8]:
# Evaluation dataset over the long-sequence subset. train_mode=False selects the
# deterministic head+tail truncation instead of random span removal.
dataset = TransformerFieldsDataset(
    df=long_seqs_train_df,
    tokenizer_dir="../data/uncased_L-12_H-768_A-12",
    train_mode=False,
)
# No shuffling, so the prediction rows stay in the same order as the frame's rows.
loader = DataLoader(dataset, batch_size=32, shuffle=False)

num_batches = len(loader)
preds = []
with torch.no_grad():  # inference only
    for batch_idx, (seq, seg) in enumerate(loader):
        seq = seq.to(device)
        seg = seg.to(device)
        out = model(seq, seg)
        preds.append(out.detach().cpu().numpy())
        # Overwrite the same console line with the running batch counter.
        print(f'{batch_idx + 1:4d}/{num_batches}', end='\r')

# Join the per-batch arrays along the row axis.
preds = np.concatenate(preds, axis=0)
print(f'Test shapes - {preds.shape}')

Test shapes - (1628, 30)


In [9]:
# Sanity check: predictions and ground truth should have the same (rows, targets) shape.
print('Shapes', preds.shape, long_seqs_train_df[targets].shape)

Shapes (1628, 30) (1628, 30)


In [11]:
# Mean column-wise Spearman correlation on the long-sequence subset.
compute_spearmanr(
    trues=long_seqs_train_df[targets].to_numpy(),
    preds=preds,
)

0.4610229856230065

In [12]:
# Evaluation dataset over the subset whose inputs fit without truncation.
# train_mode=False keeps token selection deterministic.
dataset = TransformerFieldsDataset(
    df=short_seqs_train_df,
    tokenizer_dir="../data/uncased_L-12_H-768_A-12",
    train_mode=False,
)
# No shuffling, so the prediction rows stay in the same order as the frame's rows.
loader = DataLoader(dataset, batch_size=32, shuffle=False)

num_batches = len(loader)
preds = []
with torch.no_grad():  # inference only
    for batch_idx, (seq, seg) in enumerate(loader):
        seq = seq.to(device)
        seg = seg.to(device)
        out = model(seq, seg)
        preds.append(out.detach().cpu().numpy())
        # Overwrite the same console line with the running batch counter.
        print(f'{batch_idx + 1:4d}/{num_batches}', end='\r')

# Join the per-batch arrays along the row axis.
preds = np.concatenate(preds, axis=0)
print(f'Test shapes - {preds.shape}')

Test shapes - (4451, 30)


In [13]:
# Sanity check: predictions and ground truth should have the same (rows, targets) shape.
print('Shapes', preds.shape, short_seqs_train_df[targets].shape)

Shapes (4451, 30) (4451, 30)


In [14]:
# Mean column-wise Spearman correlation on the short-sequence subset.
compute_spearmanr(
    trues=short_seqs_train_df[targets].to_numpy(),
    preds=preds,
)

0.5087773623224288