In [1]:
import re
import gc
import json
import pickle
import numpy as np
import pandas as ps
from pandas import DataFrame
from scipy.stats import spearmanr
from tqdm.auto import tqdm
from pathlib import Path
from itertools import chain
from collections import Counter
from abc import ABC, abstractmethod
from typing import List, Dict, Tuple, Set
from transformers import BertTokenizer

import numpy as np
import pandas as ps
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
data_dir = Path('..') / 'data'

train_df = ps.read_csv(data_dir / 'train.csv')
test_df = ps.read_csv(data_dir / 'test.csv')


class AvgModel:
    def __init__(self, *models):
        self.models = [torch.jit.load(str(m), map_location=device) for m in models]
        for model in self.models:
            model = model.eval()
    
    def __call__(self, *inputs):
        outputs = []
        for m in self.models:
            out = m(*inputs)
            out = torch.sigmoid(out)
            outputs.append(out)
        res = torch.stack(outputs, dim=0)
        return torch.mean(res, 0)
        

model = AvgModel(
    Path('..') / 'lstm_gru_attn_embs_0.pt',  # 0.3940
    Path('..') / 'lstm_gru_attn_embs_1.pt',  # 0.3908
)


print(train_df.shape)
print(test_df.shape)

(6079, 41)
(476, 11)


In [3]:
targets = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written'
]

text_columns = ["question_title", "question_body", "answer", "category", "host"]

In [4]:
def pad_sequences(sequences: list,
                  max_len: int,
                  value: int = 0,
                  padding: str = "pre",
                  dtype=np.int32) -> np.ndarray:
    """
    Pad sequences with specified value.
    
    Example of different padding strategies:

    >>> seqs = [[1, 2, 3], [4, 5], [6]]
    >>> pad_sequences(seqs, max_len=3, padding="post")
    array([[1, 2, 3],
       [4, 5, 0],
       [6, 0, 0]], dtype=int32)
    >>> pad_sequences(seqs, max_len=3, padding="pre")
    array([[1, 2, 3],
       [0, 4, 5],
       [0, 0, 6]], dtype=int32)
    """

    if not max_len > 0:
        raise ValueError("`max_len` should be greater than 0")

    if padding not in {"pre", "post"}:
        raise ValueError("`padding` should be one of `pre` or `post`")

    features = np.full(
        shape=(len(sequences), max_len),
        fill_value=value,
        dtype=dtype
    )

    for idx, row in enumerate(sequences):
        if len(row):
            if padding == "pre":
                features[idx, -len(row):] = np.array(row)[:max_len]
            else:
                features[idx, : len(row)] = np.array(row)[:max_len]

    return features


class DummyTokenizer:
    def __init__(self,
                 index2word,
                 index2host,
                 index2category,
                 text_fields,
                 host_field,
                 category_field,
                 unknown_token = "<unk>"):
        self.idx2word = index2word
        self.word2idx = {w: idx for idx, w in enumerate(index2word)}

        self.idx2host = index2host
        self.host2idx = {h: idx for idx, h in enumerate(index2host)}

        self.idx2category = index2category
        self.category2idx = {c: idx for idx, c in enumerate(index2category)}

        self.text_fields = text_fields
        self.host_field = host_field
        self.category_field = category_field

        self.separate_chars = [
            ',', '.', '"', ':', ')', '(', '-', '!', '?', 
            '|', ';', "'", '$', '&', '/', '[', ']', '>', 
            '%', '=', '#', '*', '+', '\\', '•',  '~', '@', 
            '£', '·', '_', '{', '}', '©', '^', '®', '`',
            '<', '→', '°', '€', '™', '›',  '♥', '←', '×', 
            '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', 
            '\xa0', '\t', '“', '★', '”', '–', '●', 'â', 
            '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±',
            '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—',
            '‹', '─', '\u3000', '\u202f', '▒', '：', '¼', 
            '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', 
            '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', 
            '¾', 'Ã', '⋅', '‘', '∞', '«', '∙', '）', '↓', 
            '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', 
            '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', 
            '¹', '≤', '‡', '√', 
        ]
        self.lower = True
        self.split = " "
        self.UNK = unknown_token

    def tokenize(self, state: dict) -> dict:
        """
        Return tokenized (for each field: str -> list[str]) state
        """
        for txt_field in self.text_fields:
            s = state[txt_field]

            if self.lower:
                s = s.lower()

            s = re.sub('[0-9]{5,}', '#####', s)
            s = re.sub('[0-9]{4}', '####', s)
            s = re.sub('[0-9]{3}', '###', s)
            s = re.sub('[0-9]{2}', '##', s)

            for c in self.separate_chars:
                s = s.replace(c, f" {c} ")
            
            state[txt_field] = s
        for field in (self.host_field, self.category_field):
            state[field] = [state[field]]
        return state
    
    def convert_tokens_to_ids(self, state: dict) -> dict:
        for txt_field in self.text_fields:
            state[txt_field] = [self.word2idx[token if token in self.word2idx else self.UNK] 
                                for token in state[txt_field]]
        
        state[self.host_field] = [self.host2idx[host if host in self.host2idx else self.UNK] 
                                  for host in state[self.host_field]]
        state[self.category_field] = [self.category2idx[category if category in self.category2idx else self.UNK] 
                                      for category in state[self.category_field]]
        return state

    @staticmethod
    def from_file(tokenizer_dir):
        with open(tokenizer_dir, 'r') as f:
            content = json.load(f)
        
        return DummyTokenizer(
            index2word=content["text"],
            index2host=content["host"],
            index2category=content["category"],
            text_fields=["question_title", "question_body", "answer"],
            host_field="host",
            category_field="category",
        )


In [5]:
class TokenizedFieldsDataset(Dataset):
    def __init__(self, 
                 df: DataFrame, 
                 feature_cols: List[str], 
                 target: List[str], 
                 tokenizer_dir: str, 
                 field: str = None):
        self.df: DataFrame = df
        self.features: List[str] = feature_cols
        self.target: List[str] = target
        self.tokenizer: DummyTokenizer = DummyTokenizer.from_file(tokenizer_dir)
        self.field = field

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        index = self.df.index[idx]
        state = {c: self.df.at[index, c] for c in self.features}
        state = self.tokenizer.tokenize(state)               # split strings to lists of tokens
        state = self.tokenizer.convert_tokens_to_ids(state)  # map tokens to ids
        target = [self.df.at[index, c] for c in self.target] 
        return state, target


class FieldsCollator:
    def __init__(self, 
                 fields: list,
                 ignore_fields: list = None,
                 is_test: bool = False, 
                 percentile: int = 100, 
                 max_len: int = 500):
        self.fields = fields
        ignore_fields = {} if not ignore_fields else set(ignore_fields)
        self.ignore_fields = ignore_fields
        self.is_test = is_test
        self.percentile = percentile
        self.max_len = max_len

    def __call__(self, batch):
        if self.is_test:
            sequences = batch
        else:
            sequences, labels = zip(*batch)

        res = {}
        for f in self.fields:
            seq = [item[f] for item in sequences]
            lengths = np.array(list(map(len, seq)))
            max_len = int(np.percentile(lengths, self.percentile))
            if f not in self.ignore_fields:
                max_len = min(int(np.percentile(lengths, self.percentile)), self.max_len)
            seq = torch.from_numpy(pad_sequences(seq, max_len, padding='post'))
            seq = seq.long()
            res[f] = seq

        if self.is_test:
            return res
        else:
            res["targets"] = torch.FloatTensor(labels)
            return res

In [6]:
bert_tokenizer = BertTokenizer.from_pretrained("../data/uncased_L-12_H-768_A-12")


def combined_len(title, body, answer):
    title_body = bert_tokenizer.tokenize(title + "," + body)
    ans = bert_tokenizer.tokenize(answer)
    tokens = ["[CLS]"] + title_body + ["[SEP]"] + ans + ["[SEP]"]
    return len(tokens)


def field_len(feature):
    tokens = bert_tokenizer.tokenize(feature)
    return len(tokens)


for df in (train_df, test_df):
    df["sequences_len"] = df.apply(lambda row: combined_len(row["question_title"], row["question_body"], row["answer"]), axis=1)
    df["title_len"] = df["question_title"].apply(field_len)
    df["body_len"] = df["question_body"].apply(field_len)
    df["answer_len"] = df["answer"].apply(field_len)

In [7]:
long_seqs_train_df = train_df[train_df["sequences_len"] > 512]
short_seqs_train_df = train_df[train_df["sequences_len"] <= 512]

In [8]:
def compute_spearmanr(trues, preds):
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        rhos.append(spearmanr(col_trues, col_pred).correlation)
    return np.mean(rhos)

## checking performance on long sequences

In [9]:
dataset = TokenizedFieldsDataset(
    long_seqs_train_df, 
    feature_cols=text_columns,
    target=targets,
    tokenizer_dir="../data/vocab.json",
)
loader = DataLoader(
    dataset=dataset,
    collate_fn=FieldsCollator(
        fields=text_columns,
        ignore_fields=["category", "host"],
        max_len=1500,
        percentile=90,
    ),
    batch_size=32,
    shuffle=False,
)

num_batches = len(loader)
preds = []
with torch.no_grad():
    for batch_idx, tensors in enumerate(loader):
        title = tensors["question_title"].to(device)
        body = tensors["question_body"].to(device)
        ans = tensors["answer"].to(device)
        cat = tensors["category"].to(device)
        host = tensors["host"].to(device)
        out = model(title, body, ans, cat, host)
        out = out.detach().cpu().numpy()
        preds.append(out)
        print(f'{batch_idx + 1:4d}/{num_batches}', end='\r')

preds = np.vstack(preds)
print(f'Test shapes - {preds.shape}')

Test shapes - (1628, 30)


In [10]:
print('Shapes', preds.shape, long_seqs_train_df[targets].values.shape)

Shapes (1628, 30) (1628, 30)


In [11]:
compute_spearmanr(long_seqs_train_df[targets].values, preds)

0.4250857888299656

In [12]:
dataset = TokenizedFieldsDataset(
    short_seqs_train_df, 
    feature_cols=text_columns,
    target=targets,
    tokenizer_dir="../data/vocab.json",
)
loader = DataLoader(
    dataset=dataset,
    collate_fn=FieldsCollator(
        fields=text_columns,
        ignore_fields=["category", "host"],
        max_len=1500,
        percentile=90,
    ),
    batch_size=32,
    shuffle=False,
)

num_batches = len(loader)
preds = []
with torch.no_grad():
    for batch_idx, tensors in enumerate(loader):
        title = tensors["question_title"].to(device)
        body = tensors["question_body"].to(device)
        ans = tensors["answer"].to(device)
        cat = tensors["category"].to(device)
        host = tensors["host"].to(device)
        out = model(title, body, ans, cat, host)
        out = out.detach().cpu().numpy()
        preds.append(out)
        print(f'{batch_idx + 1:4d}/{num_batches}', end='\r')

preds = np.vstack(preds)
print(f'Test shapes - {preds.shape}')

Test shapes - (4451, 30)


In [13]:
print('Shapes', preds.shape, short_seqs_train_df[targets].values.shape)

Shapes (4451, 30) (4451, 30)


In [14]:
compute_spearmanr(short_seqs_train_df[targets].values, preds)

0.4406158437019835