**TRAINING CONFIGS**

* Training max sequence length: 320
* Inference max sequence length: 320
* Preprocessing win_size: 200 (words)
* Batch size: 4
* Head: Arcface (0.5, 10.0, easy_margin=True, centers=1)
* Training SupportSet K: 3
* Cased
* Balanced Training Group Sampling
* Non-overlapping Support Group Sampling
* Support/Query groups are all unique groups.

In [1]:
# Number of whitespace-separated words per sliding window when test documents are chunked.
WIN_SIZE = 200
# Truncation length (max_length) handed to the tokenizer for every window.
SEQUENCE_LENGTH = 320

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under /kaggle/input/pretrainedrobertabase.
for dirname, _, filenames in os.walk('/kaggle/input/pretrainedrobertabase'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pretrainedrobertabase/config.json
/kaggle/input/pretrainedrobertabase/merges.txt
/kaggle/input/pretrainedrobertabase/vocab.json
/kaggle/input/pretrainedrobertabase/tf_model.h5
/kaggle/input/pretrainedrobertabase/tokenizer_config.json
/kaggle/input/pretrainedrobertabase/special_tokens_map.json


In [3]:
import tensorflow as tf

# Turn on memory growth for every visible GPU.
physical_devices = tf.config.list_physical_devices("GPU")
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

import pandas as pd
import gc
import json
import numpy as np
import random
from tqdm import tqdm
import transformers
from transformers import *
import re
from collections import Counter
import glob
from functools import partial
from multiprocessing import Pool
from sklearn.metrics.pairwise import cosine_similarity

# Seed TensorFlow, Python's `random` and NumPy's global generator with the same fixed value.
tf.random.set_seed(42)
random.seed(42)
np.random.seed(42)

transformers.__version__

  '"sox" backend is being deprecated. '


'4.4.2'

In [4]:
def generate_s_e_window_sliding(sample_len, win_size, step_size):
    """Return ``[start, end]`` index pairs for sliding windows over a sequence.

    Windows start at 0 and advance by ``step_size`` until one reaches or
    passes ``sample_len``. The final window is then pulled back so that it
    ends exactly at ``sample_len`` (its start is clamped at 0).

    Args:
        sample_len: Length of the sequence being windowed.
        win_size: Nominal window length.
        step_size: Distance between consecutive window starts.

    Returns:
        A list of ``[start, end]`` lists (end exclusive); always at least one.
    """
    windows = [[0, win_size]]
    while windows[-1][1] < sample_len:
        next_start = windows[-1][0] + step_size
        windows.append([next_start, next_start + win_size])

    # Shift the last window left by however far it overshoots the sequence end.
    last_window = windows[-1]
    overshoot = last_window[1] - sample_len
    last_window[0] = max(last_window[0] - overshoot, 0)
    last_window[1] = sample_len
    return windows

In [5]:
# Competition training CSV; later cells read its `dataset_label`, `dataset_title` and `cleaned_label` columns.
train_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv")

In [6]:
# Column-wise copies of the training annotations as plain Python lists.
clean_label = train_df.cleaned_label.tolist()
# Read the raw `dataset_label` column here; `cleaned_label` is already held by `clean_label`.
dataset_label = train_df.dataset_label.tolist()
dataset_title = train_df.dataset_title.tolist()

In [7]:
# Lower-cased, stripped unique values of each label-like column, pooled into one
# de-duplicated list of every known dataset name spelling in the training set.
temp_1, temp_2, temp_3 = (
    [x.lower().strip() for x in train_df[column].unique()]
    for column in ("dataset_label", "dataset_title", "cleaned_label")
)
all_train_labels = list(set([*temp_1, *temp_2, *temp_3]))

In [8]:
# Publication ids of the test set: file name of each test entry up to its first dot.
TEST_IDS = [
    test_path.split("/")[-1].split(".")[0]
    for test_path in glob.glob("/kaggle/input/coleridgeinitiative-show-us-the-data/test/**")
]

In [9]:
# Module-level window size (in words) read by process() below.
win_size = WIN_SIZE

def process(i):
    """Split one test publication into overlapping word windows.

    Loads ``test/{TEST_IDS[i]}.json`` (a list of sections, each carrying a
    ``"text"`` field), replaces newlines with spaces, splits each section on
    whitespace and cuts it into windows of ``win_size`` words with a step of
    ``int(0.75 * win_size)`` words.

    Args:
        i: Index into the module-level ``TEST_IDS`` list.

    Returns:
        dict with keys
            ``"id"``: the publication id repeated once per window,
            ``"text"``: the window strings,
            ``"label"``: the placeholder label ``"unknow"`` once per window,
            ``"unique_id"``: the publication id,
            ``"full_text"``: all section texts (newlines replaced by spaces)
            joined with single spaces and stripped.
    """
    doc_id = TEST_IDS[i]
    # The context manager guarantees the handle is closed, including in worker processes.
    with open(
        f"/kaggle/input/coleridgeinitiative-show-us-the-data/test/{doc_id}.json", "rt"
    ) as json_file:
        sections = json.load(json_file)

    label = "unknow"
    ids = []
    texts = []
    labels = []
    section_texts = []
    for section in sections:
        raw_text = section["text"].replace("\n", " ")
        words = raw_text.split()
        s_e = generate_s_e_window_sliding(len(words), win_size, int(0.75 * win_size))
        for (s, e) in s_e:
            texts.append(" ".join(words[s:e]).strip())
            ids.append(doc_id)
            labels.append(label)
        section_texts.append(raw_text)

    # Joining once avoids repeated string concatenation; after strip() the result
    # equals appending "<text> " per section and stripping.
    full_text = " ".join(section_texts).strip()

    return {
        "id": ids,
        "text": texts,
        "label": labels,
        "unique_id": doc_id,
        "full_text": full_text,
    }
        
# define map iterator
def iterator_data(items_list):
    """Lazily yield every element of ``items_list`` in its original order."""
    yield from items_list

# NOTE: this rebinds the name `iterator_data` from the generator function to a
# generator instance over the indices 0..len(TEST_IDS)-1.
iterator_data = iterator_data(range(len(TEST_IDS)))
# Pool of 8 worker processes used to run process() over the test documents.
p = Pool(8)

# partial() binds no extra arguments here; partial_fn behaves like process itself.
partial_fn = partial(process)
train_map = p.imap(
    partial_fn,
    tqdm(iterator_data, total=len(TEST_IDS), desc="[Preprocessing TestSet]"),
    chunksize=10,
)

# Drain the lazy imap iterator into a list of per-document result dicts.
results = []
for result in tqdm(train_map):
    results.append(result)

# Flatten: ids/texts/labels hold one entry per window, while unique_ids/full_texts
# hold one entry per document.
ids = []
texts = []
labels = []
unique_ids = []
full_texts = []
for result in tqdm(results):
    ids.extend(result["id"])
    texts.extend(result["text"])
    labels.extend(result["label"])
    unique_ids.append(result["unique_id"])
    full_texts.append(result["full_text"])
    
# One row per window; `group` and `title` are filled with constant placeholders (-1 and "").
test_df = pd.DataFrame()
test_df["id"] = ids
test_df["text"] = texts
test_df["label"] = labels
test_df["group"] = [-1] * len(ids)
test_df["title"] = [""] * len(ids)

# All results were consumed above; stop the pool from accepting further work.
p.close()

[Preprocessing TestSet]:   0%|          | 0/4 [00:00<?, ?it/s]
[Preprocessing TestSet]: 100%|██████████| 4/4 [00:00<00:00, 614.08it/s]
4it [00:00, 84.22it/s]
100%|██████████| 4/4 [00:00<00:00, 29799.67it/s]


**Data Loader**

In [10]:
import pandas as pd
from tensorflow.keras.utils import Sequence
import numpy as np
from sklearn.utils import shuffle
from tqdm import tqdm
from transformers import RobertaTokenizerFast
from tensorflow.keras.preprocessing.sequence import pad_sequences
import math


class QueryDataLoader(Sequence):
    """Serve consecutive, fixed-size slices of a query dataframe as Python lists.

    The dataframe must provide ``id``, ``text`` and ``label`` columns; missing
    values are replaced by empty strings before the columns are copied out.
    """

    def __init__(self, data, batch_size=32):
        self.batch_size = batch_size
        self.data = data.fillna("")
        self.batch_ids, self.batch_text, self.batch_label = (
            self.data[column].tolist() for column in ("id", "text", "label")
        )

    def __len__(self):
        """Number of batches needed to cover every row (last one may be short)."""
        return math.ceil(len(self.batch_text) / self.batch_size)

    def __getitem__(self, index):
        """Return ``(ids, texts, labels, classes)`` for batch ``index``.

        ``classes`` holds 1 for every non-empty label and 0 for empty ones.
        """
        window = slice(index * self.batch_size, (index + 1) * self.batch_size)
        batch_ids = self.batch_ids[window]
        text = self.batch_text[window]
        label = self.batch_label[window]
        classes = [0 if entry == "" else 1 for entry in label]
        return batch_ids, text, label, classes


class SupportQueryDataLoader(Sequence):
    """Tokenise and pad the batches produced by a ``QueryDataLoader``.

    Args:
        data: Dataframe of samples; NaNs are replaced by ``""``. When
            ``is_train`` is true it must provide ``group``, ``title``,
            ``text`` and ``label`` columns.
        tokenizer: Tokenizer called on each query text; must expose
            ``pad_token_id`` and support ``return_offsets_mapping``.
        training_steps: Value reported by ``__len__``.
        batch_size: Stored on the instance (batching itself is done by
            ``query_dataloader``).
        is_train: When true, per-group sample lists are built and reshuffled
            at the end of every epoch.
        query_dataloader: Loader whose ``__getitem__(index)`` returns
            ``(ids, texts, labels, classes)``.
        query_masked: Forwarded to ``_process_data`` (currently unused there).
        return_query_ids: When true, the batch's ``"ids"`` entry is filled.
        return_query_labels: Stored on the instance only.
    """

    def __init__(
        self,
        data,
        tokenizer,
        training_steps=500,
        batch_size=32,
        is_train=False,
        query_dataloader=None,
        query_masked=False,
        return_query_ids=False,
        return_query_labels=False,
    ):
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.data = data.fillna("")
        self.is_train = is_train
        self.len = training_steps
        self.query_dataloader = query_dataloader
        self.query_masked = query_masked
        self.return_query_ids = return_query_ids
        self.return_query_labels = return_query_labels

        # These must exist before on_epoch_end() / _create_group_data() touch
        # them; otherwise is_train=True fails with AttributeError right here.
        self.data_group = {}
        self.all_unique_group = []
        if self.is_train:
            self._create_group_data()

        self.on_epoch_end()

    def _create_group_data(self):
        """Index ``(title, text, label)`` tuples by their ``group`` value."""
        all_unique_group = list(self.data.group.unique())
        for group in all_unique_group:
            # Filter once per group instead of once per column.
            rows = self.data[self.data["group"] == group]
            self.data_group[group] = list(
                zip(list(rows.title), list(rows.text), list(rows.label))
            )

        self.all_unique_group = all_unique_group

    def on_epoch_end(self):
        """Reshuffle the samples of every group (training mode only)."""
        if self.is_train:
            for k in list(self.data_group.keys()):
                self.data_group[k] = shuffle(self.data_group[k])

    def __len__(self):
        """Number of steps per epoch, as configured by ``training_steps``."""
        return self.len

    def __getitem__(self, index):
        """Return the tokenised, padded query batch number ``index``.

        Returns:
            dict with int32 arrays ``"input_ids"``, ``"attention_mask"`` and
            ``"token_type_ids"`` (post-padded to the longest sequence in the
            batch) plus the list ``"ids"``, which is empty unless
            ``return_query_ids`` is true.
        """
        # step 1: fetch the raw query samples for this batch
        (
            query_ids,
            query_samples,
            query_labels,
            _query_classes,
        ) = self.query_dataloader.__getitem__(index)
        if self.return_query_ids is False:
            query_ids = None

        # step 2: tokenize every sample
        query_batch = {
            "input_ids": [],
            "attention_mask": [],
            "token_type_ids": [],
            "ids": [],
        }
        for i in range(len(query_samples)):
            out = self._process_data(
                query_samples[i], query_labels[i], self.query_masked
            )
            query_batch["input_ids"].append(out["input_ids"])
            query_batch["attention_mask"].append(out["attention_mask"])
            query_batch["token_type_ids"].append(out["token_type_ids"])
            if query_ids is not None:
                query_batch["ids"].append(query_ids[i])

        # step 3: pad to the longest sequence in the batch; input ids use the
        # tokenizer's pad id, the two mask-like fields are padded with 0
        query_batch["input_ids"] = pad_sequences(
            query_batch["input_ids"],
            padding="post",
            value=self.tokenizer.pad_token_id,
        )
        for k in ["attention_mask", "token_type_ids"]:
            query_batch[k] = pad_sequences(query_batch[k], padding="post", value=0)

        for k in ["input_ids", "attention_mask", "token_type_ids"]:
            query_batch[k] = np.array(query_batch[k]).astype(np.int32)

        return query_batch

    def _process_data(self, inp_string, label_string, masked_label=False):
        """Tokenise one text, truncated to ``SEQUENCE_LENGTH`` tokens.

        ``label_string`` and ``masked_label`` are accepted for interface
        compatibility but are not used.

        Returns:
            dict with ``"input_ids"``, ``"attention_mask"`` and an all-zero
            ``"token_type_ids"`` list of the same length as ``input_ids``.
        """
        input_tokenize = self.tokenizer(
            inp_string, return_offsets_mapping=True, max_length=SEQUENCE_LENGTH, truncation=True
        )
        results = {
            "input_ids": input_tokenize["input_ids"],
            "attention_mask": input_tokenize["attention_mask"],
            "token_type_ids": [0] * len(input_tokenize["input_ids"]),
        }
        return results

**MODELING**

In [11]:
def del_everything(model):
    """Best-effort release of Keras/TensorFlow state once a model is no longer needed.

    Args:
        model: The model (or wrapped callable) the caller has finished with.
    """
    tf.keras.backend.clear_session()
    # Only unbinds this function's local name; any reference held by the caller is unaffected.
    del model
    gc.collect()
    # NOTE(review): the session and graph handles are fetched and immediately
    # unbound; presumably meant to help free memory — confirm this has an effect.
    sess = tf.compat.v1.keras.backend.get_session()
    del sess
    graph = tf.compat.v1.get_default_graph()
    del graph

In [12]:
import tensorflow as tf
from transformers.optimization_tf import WarmUp, AdamWeightDecay
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import math


class MetricLearningModel(tf.keras.Model):
    """Transformer encoder plus a linear projection producing sequence, "mask" and "no-mask" embeddings.

    ``main_model`` is left as ``None`` by the constructor and must be assigned
    by the caller before the model is invoked.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # Backbone encoder; assigned externally after construction.
        self.main_model = None
        # Linear projection (no activation) applied to every token's hidden state.
        self.support_dense = tf.keras.layers.Dense(units=768, activation=None)
        self.config = config
        # Group size used to average embeddings when `sequence_labels` is given.
        self.K = 3

    def _compute_avg_embeddings(self, sequence_embeddings, attentions_mask, K=3):
        """Average mask-weighted token embeddings over the sequence axis.

        The weighted embeddings are averaged with ``reduce_mean`` over axis 1,
        i.e. divided by the full sequence length rather than by the sum of the
        mask. When ``K > 1`` the result is additionally reshaped to
        ``(-1, K, units)`` and averaged over the ``K`` axis.
        """
        embeddings = tf.reduce_mean(
            attentions_mask * sequence_embeddings, axis=1
        )  # [B * K, F]
        if K > 1:
            embeddings = tf.reshape(
                embeddings,
                (-1, K, self.support_dense.units),
            )
            embeddings = tf.reduce_mean(embeddings, axis=1)  # [B, F]
        return embeddings

    def call(
        self,
        inputs,
        training=False,
        sequence_labels=None,
        mask_embeddings=None,
        nomask_embeddings=None,
        use_only_mask=False
    ):
        """Run the encoder and derive embeddings in one of two modes.

        Args:
            inputs: Sequence whose first item is the input ids and second item
                the attention mask.
            training: Forwarded to the backbone.
            sequence_labels: Optional per-token labels; entries equal to -100
                are treated as ignored positions. When given, the labels act
                as the token mask.
            mask_embeddings: Reference "mask" embeddings; required when
                ``sequence_labels`` is ``None``.
            nomask_embeddings: Reference "no-mask" embeddings; required when
                ``sequence_labels`` is ``None``.
            use_only_mask: When true, only the similarity to
                ``mask_embeddings`` drives the token attentions.

        Returns:
            ``(sequence_embeddings, mask_embeddings, nomask_embeddings)`` when
            ``sequence_labels`` is given, otherwise the same three tensors
            followed by the per-token ``attentions``.
        """
        # NOTE(review): `[-2]` presumably selects the tuple of per-layer hidden
        # states (relies on the backbone returning hidden states and attentions) — confirm.
        output_hidden_states = self.main_model(input_ids=inputs[0], attention_mask=inputs[1], training=training)[-2]
        # Only the last entry is used, so the concat is over a single tensor.
        concat_hidden_states = tf.concat(
            output_hidden_states[-1:], axis=-1
        )  # [B * K, T, F]
        concat_hidden_states = self.support_dense(
            concat_hidden_states
        )  # [B * K, T, 768]
        # Projected state of the first token serves as the sequence embedding.
        sequence_embeddings = concat_hidden_states[:, 0, :]  # [B * K, 768]
        if sequence_labels is not None:
            sequence_labels = tf.cast(
                sequence_labels, dtype=concat_hidden_states.dtype
            )[..., None]
            # Ignored positions (-100) get weight 0 in the "mask" average ...
            mask_embeddings = self._compute_avg_embeddings(
                concat_hidden_states,
                tf.where(sequence_labels == -100, 0.0, sequence_labels),
                self.K,
            )
            # ... and also weight 0 in the "no-mask" average (1 - 1).
            nomask_embeddings = self._compute_avg_embeddings(
                concat_hidden_states,
                1.0 - tf.where(sequence_labels == -100, 1.0, sequence_labels),
                K=self.K,
            )
            return sequence_embeddings, mask_embeddings, nomask_embeddings
        else:
            attention_mask = tf.cast(inputs[1], concat_hidden_states.dtype)[
                ..., None
            ]  # [B, T, 1]
            # Cosine similarity of every token state to the two reference embeddings.
            normed_mask_embeddings = tf.nn.l2_normalize(mask_embeddings, axis=1)[..., None]
            normed_nomask_embeddings = tf.nn.l2_normalize(nomask_embeddings, axis=1)[..., None]
            normed_hidden_states = tf.nn.l2_normalize(concat_hidden_states, axis=-1)
            mask_cosine_similarity = tf.matmul(
                normed_hidden_states, normed_mask_embeddings
            )  # [B, T, 1]
            nomask_cosine_similarity = tf.matmul(
                normed_hidden_states, normed_nomask_embeddings
            )  # [B, T, 1]
            # Similarities are scaled by 10 and squashed into (0, 1).
            mask_attentions = tf.nn.sigmoid(10.0 * mask_cosine_similarity)  # [B, T, 1]
            nomask_attentions = tf.nn.sigmoid(10.0 * nomask_cosine_similarity)  # [B, T, 1]

            # average attention
            if use_only_mask:
                attentions = mask_attentions
            else:
                attentions = 0.5 * (mask_attentions + (1.0 - nomask_attentions))

            # Zero the attention on padded positions.
            attentions *= attention_mask

            # compute mask and nomask embeddings
            mask_embeddings = self._compute_avg_embeddings(
                concat_hidden_states,
                tf.where(attention_mask == 0, 0.0, attentions),
                K=1,
            )
            nomask_embeddings = self._compute_avg_embeddings(
                concat_hidden_states,
                1.0 - tf.where(attention_mask == 0, 1.0, attentions),
                K=1,
            )
            return sequence_embeddings, mask_embeddings, nomask_embeddings, attentions

**Inference**

In [13]:
def find_all_start_end(attention_values):
    """Locate runs of 1s in a 0/1 sequence.

    Args:
        attention_values: Indexable sequence of binary flags.

    Returns:
        A list of ``[start, end]`` pairs (end exclusive), one per run, in
        order of appearance. A run is opened by a 1, grown by each further 1
        and closed by a 0; any other value leaves the current state untouched.
    """
    spans = []
    inside_run = False
    for position in range(len(attention_values)):
        flag = attention_values[position]
        if flag == 1:
            if inside_run:
                spans[-1][1] += 1
            else:
                spans.append([position, position + 1])
                inside_run = True
        elif flag == 0 and inside_run:
            inside_run = False
    return spans

In [14]:
def compute_cosine_similarity(x1, x2):
    """Pairwise cosine similarity between the rows of ``x1`` and ``x2``.

    Both inputs are L2-normalised along axis 1 before the matrix product; the
    result is clipped to ``[-1.0, 1.0]``.
    """
    unit_x1, unit_x2 = (tf.nn.l2_normalize(batch, axis=1) for batch in (x1, x2))
    pairwise = tf.matmul(unit_x1, unit_x2, transpose_b=True)  # [B1, B2]
    return tf.clip_by_value(pairwise, -1.0, 1.0)

In [15]:
def run_inference(test_dataloader,
                  model, all_support_embeddings,
                  all_support_mask_embeddings,
                  all_support_nomask_embeddings, ner_threshold=[0.5, 0.7]):
    """Extract candidate dataset mentions from every query window.

    For each batch, ``N_TTA`` support embeddings per query are drawn at random
    (with replacement) from the pre-extracted support sets. The averaged
    mask / no-mask support embeddings condition the model's token attentions,
    which are thresholded into token spans and decoded back to text.

    Args:
        test_dataloader: Iterable of query batches; must expose ``tokenizer``
            and yield dicts with ``"input_ids"``, ``"attention_mask"`` and
            ``"ids"``.
        model: Callable returning ``(query_embeddings, mask_embeddings,
            nomask_embeddings, attention_values)``.
        all_support_embeddings: 2-D array of support sequence embeddings.
        all_support_mask_embeddings: 2-D array of support "mask" embeddings.
        all_support_nomask_embeddings: 2-D array of support "no-mask" embeddings.
        ner_threshold: Attention thresholds to evaluate. Predictions for the
            threshold equal to 0.7 go to ``preds``; predictions for every
            other threshold go to ``preds_low_confidence``.
            NOTE(review): the outputs only stay aligned with ``cosines`` when
            the list holds 0.7 plus exactly one other value.

    Returns:
        ``(ids, text_ids, inputs, cosines, preds, preds_low_confidence)``.
        ``text_ids`` and ``inputs`` are always empty lists. Each prediction is
        a ``"|"``-joined string of decoded spans at least 4 tokens long, or
        ``""`` when nothing passed the threshold.
    """
    preds = []
    preds_low_confidence = []
    cosines = []
    ids = []
    text_ids = []
    inputs = []
    N_TTA = 100
    HIGH_CONFIDENCE_TH = 0.7
    MIN_SPAN_TOKENS = 4

    tokenizer = test_dataloader.tokenizer
    # Take the embedding widths from the data instead of assuming 768.
    mask_dim = all_support_mask_embeddings.shape[-1]
    nomask_dim = all_support_nomask_embeddings.shape[-1]

    for query_batch in tqdm(test_dataloader):
        n_draws = query_batch["input_ids"].shape[0] * N_TTA
        # Three independent draws, kept in this order so a seeded NumPy
        # generator yields the same samples as before.
        support_embeddings = all_support_embeddings[
            np.random.choice(range(all_support_embeddings.shape[0]), size=n_draws)
        ]
        support_mask_embeddings = all_support_mask_embeddings[
            np.random.choice(range(all_support_mask_embeddings.shape[0]), size=n_draws)
        ]
        support_nomask_embeddings = all_support_nomask_embeddings[
            np.random.choice(range(all_support_nomask_embeddings.shape[0]), size=n_draws)
        ]
        # Average the N_TTA draws belonging to each query.
        support_mask_embeddings = np.mean(
            np.reshape(support_mask_embeddings, (-1, N_TTA, mask_dim)), axis=1
        )
        support_nomask_embeddings = np.mean(
            np.reshape(support_nomask_embeddings, (-1, N_TTA, nomask_dim)), axis=1
        )
        query_embeddings, query_mask_embeddings, query_nomask_embeddings, attention_values = model(
            [
                query_batch["input_ids"],
                query_batch["attention_mask"],
            ],
            training=False,
            sequence_labels=None,
            mask_embeddings=support_mask_embeddings,
            nomask_embeddings=support_nomask_embeddings,
        )  # [B, F]
        cosine = compute_cosine_similarity(query_embeddings, support_embeddings).numpy()
        # Mean similarity of each query to all sampled support embeddings.
        batch_cosines = np.mean(cosine, axis=1)
        ids.extend(query_batch["ids"])

        # Convert once per batch rather than once per (sample, threshold) pair.
        attention_np = attention_values.numpy()
        for k in range(len(batch_cosines)):
            token_attention = attention_np[k, :, 0]
            for TH in ner_threshold:
                binary_pred_at_th = token_attention >= TH
                if np.sum(binary_pred_at_th) > 0:
                    binary_pred_at_th = binary_pred_at_th.astype(np.int32)
                    start_end = find_all_start_end(binary_pred_at_th)
                    pred_candidates = []
                    for s_e in start_end:
                        if (s_e[1] - s_e[0]) >= MIN_SPAN_TOKENS:
                            pred_tokens = list(range(s_e[0], s_e[1]))
                            pred = tokenizer.decode(query_batch["input_ids"][k, ...][pred_tokens])
                            pred_candidates.append(pred)
                    pred = "|".join(pred_candidates)
                else:
                    pred = ""
                if TH == HIGH_CONFIDENCE_TH:
                    preds.append(pred)
                else:
                    preds_low_confidence.append(pred)
            cosines.append(batch_cosines[k])
    return ids, text_ids, inputs, cosines, preds, preds_low_confidence

In [16]:
def end2end(pretrained_path, checkpoint_path, test_df, ner_threshold=[0.5, 0.7]):
    """Build one model, load its checkpoint and run inference over ``test_df``.

    Args:
        pretrained_path: Name of the Kaggle input directory holding the
            backbone config and tokenizer files. Must contain ``"distil"``,
            ``"roberta"`` or ``"scibert"`` so a tokenizer class can be chosen.
        checkpoint_path: Name of the Kaggle input directory holding the
            ``*.h5`` weights and the three ``support_*embeddings.npy`` files.
        test_df: Dataframe of query windows (``id``, ``text``, ``label`` columns).
        ner_threshold: Attention thresholds forwarded to ``run_inference``.

    Returns:
        ``(ids, text_ids, inputs, cosines, preds, preds_low_confidence,
        tokenizer)``: the outputs of ``run_inference`` followed by the
        tokenizer that was used.

    Raises:
        ValueError: If ``pretrained_path`` matches no supported tokenizer family.
        FileNotFoundError: If the checkpoint directory contains no ``.h5`` file.
    """
    pretrained_dir = f"/kaggle/input/{pretrained_path}/"
    config = AutoConfig.from_pretrained(pretrained_dir)
    config.output_attentions = True
    config.output_hidden_states = True

    # The backbone is created from config only; its weights come from the checkpoint below.
    main_model = TFAutoModel.from_config(config=config)
    model = MetricLearningModel(config=config, name="metric_learning_model")
    model.main_model = main_model
    model.K = 3

    # load pre-extracted support embeddings
    checkpoint_path = f"/kaggle/input/{checkpoint_path}"
    all_support_embeddings = np.load(os.path.join(checkpoint_path, "support_embeddings.npy"))
    all_support_mask_embeddings = np.load(os.path.join(checkpoint_path, "support_mask_embeddings.npy"))
    all_support_nomask_embeddings = np.load(os.path.join(checkpoint_path, "support_nomask_embeddings.npy"))

    # create tokenizer and dataloader
    if "distil" in pretrained_path:
        tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_dir)
    elif "roberta" in pretrained_path:
        tokenizer = RobertaTokenizerFast.from_pretrained(pretrained_dir)
    elif "scibert" in pretrained_path:
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_dir, do_lower_case=False)
    else:
        # Fail with a clear message instead of an unbound `tokenizer` further down.
        raise ValueError(
            f"Cannot choose a tokenizer for pretrained_path={pretrained_path!r}; "
            "expected the name to contain 'distil', 'roberta' or 'scibert'."
        )

    query_dataloader = QueryDataLoader(test_df, batch_size=128)
    test_dataloader = SupportQueryDataLoader(
        test_df,
        tokenizer=tokenizer,
        batch_size=128,
        is_train=False,
        training_steps=len(query_dataloader),
        query_dataloader=query_dataloader,
        return_query_ids=True,
    )

    # build model with one real sample so the variables exist before load_weights
    query_batch = test_dataloader.__getitem__(0)
    model(
        [
            query_batch["input_ids"][:1, ...],
            query_batch["attention_mask"][:1, ...],
        ],
        training=True,
        sequence_labels=None,
        mask_embeddings=all_support_mask_embeddings[:1, ...],
        nomask_embeddings=all_support_nomask_embeddings[:1, ...],
    )  # [B, F]
    model.summary()
    weight_files = glob.glob(os.path.join(checkpoint_path, "*.h5"))
    if not weight_files:
        raise FileNotFoundError(f"No .h5 weights file found in {checkpoint_path}")
    model.load_weights(weight_files[0], by_name=True)

    # apply tf.function
    model = tf.function(model, experimental_relax_shapes=True)

    # run inference
    ids, text_ids, inputs, cosines, preds, preds_low_confidence = run_inference(
        test_dataloader,
        model,
        all_support_embeddings,
        all_support_mask_embeddings,
        all_support_nomask_embeddings,
        ner_threshold=ner_threshold,
    )

    # release model
    del_everything(model)

    return ids, text_ids, inputs, cosines, preds, preds_low_confidence, test_dataloader.tokenizer

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stop words as a set, looked up by remove_stopwords().
stop_words = set(stopwords.words('english')) 

def remove_stopwords(string):
    """Tokenise ``string`` and return it without tokens found in ``stop_words``.

    The remaining tokens are joined with single spaces and the result stripped.
    The stop-word comparison is case-sensitive.
    """
    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(string))
    return " ".join(kept_tokens).strip()

In [18]:
def check_special_token(string, tokenizer):
    """Return True when ``string`` contains none of the tokenizer's special tokens.

    Checks the pad, sep and cls tokens, in that order.
    """
    special_tokens = (tokenizer.pad_token, tokenizer.sep_token, tokenizer.cls_token)
    return not any(token in string for token in special_tokens)

In [19]:
def clean_text(txt, lower=True):
    """Collapse every run of non-alphanumeric characters into a single space.

    Args:
        txt: Any object; converted with ``str()`` first.
        lower: Lower-case the text before cleaning when truthy.
    """
    text = str(txt)
    if lower:
        text = text.lower()
    return re.sub('[^A-Za-z0-9]+', ' ', text)

In [20]:
def jaccard_similarity(str1, str2): 
    """Jaccard index of the two strings' lower-cased, space-separated token sets."""
    tokens_1, tokens_2 = (set(text.lower().split(" ")) for text in (str1, str2))
    shared = len(tokens_1 & tokens_2)
    return float(shared) / (len(tokens_1) + len(tokens_2) - shared)

In [21]:
def find_cased_pred(lower_start_idx, lower_end_idx, lower_string, cased_string, lower_pred):
    """Recover the original-case spelling of a match found in lower-cased text.

    When both strings have the same length the match offsets are reused
    directly. Otherwise a window of ``len(lower_pred)`` characters is slid
    around ``lower_start_idx`` in ``cased_string`` until its lower-cased form
    equals ``lower_pred``; if no window matches, ``lower_pred.upper()`` is
    returned.
    """
    if len(lower_string) == len(cased_string):
        return cased_string[lower_start_idx:lower_end_idx]

    search_radius = abs(len(lower_string) - lower_end_idx)
    pred_length = len(lower_pred)
    for offset in range(-search_radius - 1, search_radius + 1):
        window_start = lower_start_idx + offset
        candidate = cased_string[window_start:window_start + pred_length]
        if candidate.lower() == lower_pred:
            return candidate
    return lower_pred.upper()


def calculate_iou(se_0, se_1):
    """Intersection-over-union of two ``(start, end)`` spans.

    The intersection is not clamped at zero, so disjoint spans yield a
    negative value.
    """
    (s_0, e_0), (s_1, e_1) = se_0, se_1
    overlap = min(e_0, e_1) - max(s_0, s_1)
    union = (e_0 - s_0) + (e_1 - s_1) - overlap
    return overlap / union


def find_all_pred_in_text(normed_text_cased, all_unique_preds):
    """Find which candidate labels occur in a document and merge overlapping hits.

    Args:
        normed_text_cased: Raw document text; it is cleaned with
            ``clean_text(..., False)`` (case preserved) before matching.
        all_unique_preds: Iterable of lower-cased candidate label strings.

    Returns:
        ``np.unique`` of the labels that survive. A label is kept only if it
        appears as a space-delimited phrase in the lower-cased text and at
        least one occurrence is not entirely lower-case in the cased text.
        Where one hit is contained in, or overlaps (IoU >= 0.1), another hit,
        its label is replaced by the other hit's label.
    """
    normed_text_cased = clean_text(normed_text_cased, False)
    normed_text = normed_text_cased.lower()
    preds = []
    preds_indexs = []
    for pred in all_unique_preds:
        if (" " + pred + " " in normed_text) or (" " + pred + "," in normed_text) or (" " + pred + "." in normed_text):
            preds.append(pred)
    unique_preds = []  # one entry per recorded occurrence, parallel to preds_indexs
    preds = list(sorted(preds, key=len))
    for pred in preds:
        # Escape the label so it is matched literally even when it contains
        # regex metacharacters.
        matches = re.finditer(re.escape(pred), normed_text)
        for match in matches:
            start_index = match.start()
            end_index = match.end()
            pred_cased = find_cased_pred(start_index, end_index, normed_text, normed_text_cased, pred)
            # Skip occurrences written entirely in lower case.
            if pred_cased.islower() is False:
                preds_indexs.append([start_index, end_index])
                unique_preds.append(pred)
    # Collect ordered pairs (i, j) where hit i lies inside hit j or the two overlap.
    group_idxs = []
    for i in range(len(preds_indexs)):
        for j in range(len(preds_indexs)):
            if i != j:
                start_i, end_i = preds_indexs[i]
                start_j, end_j = preds_indexs[j]
                iou = calculate_iou(preds_indexs[i], preds_indexs[j])
                if (start_i <= end_j and end_i <= end_j and start_i >= start_j) or iou >= 0.1:
                    group_idxs.append([i, j])
    unique_preds = np.array(unique_preds)
    for group_idx in group_idxs:
        unique_preds[group_idx[0]] = unique_preds[group_idx[1]]
    return np.unique(unique_preds)

In [22]:
# remove acronym from prediction
def remove_acronym(preds):
    """Strip a trailing bracketed acronym from every predicted span, in place.

    Each entry of ``preds`` is a ``"|"``-joined string of spans. Spaces just
    inside brackets are removed first (``"( X )"`` becomes ``"(X)"``). A span
    whose last word contains ``"("`` or ``"["`` then loses that last word.
    Empty and whitespace-only spans are dropped.

    Args:
        preds: List of ``"|"``-joined prediction strings; modified in place.

    Returns:
        The same ``preds`` list.
    """
    bracket_fixes = (("( ", "("), (" )", ")"), ("[ ", "["), (" ]", "]"))
    for i in range(len(preds)):
        pred_i = preds[i]
        for spaced, tight in bracket_fixes:
            pred_i = pred_i.replace(spaced, tight)

        new_pred_i = []
        for pi in pred_i.split("|"):
            words = pi.split()
            if not words:
                # Blank span: nothing to keep. Handling it here replaces the
                # old blanket `except`, which left the whole entry unprocessed.
                continue
            if "(" in words[-1] or "[" in words[-1]:
                new_pred_i.append(" ".join(words[:-1]))
            else:
                new_pred_i.append(pi)
        preds[i] = "|".join(new_pred_i)
    return preds

In [23]:
def remove_overlap(preds, preds_low_confidence):
    """Drop low-confidence spans that contain a high-confidence span, in place.

    Both arguments are parallel lists of ``"|"``-joined span strings. For each
    index where both entries are non-empty, every low-confidence span that has
    any high-confidence span as a substring is removed.

    Args:
        preds: High-confidence predictions (read only).
        preds_low_confidence: Low-confidence predictions; modified in place.

    Returns:
        The same ``preds_low_confidence`` list.
    """
    for i in range(len(preds_low_confidence)):
        if preds[i] == "" or preds_low_confidence[i] == "":
            continue
        # Ignore empty fragments (e.g. from "a|" or "a||b"): "" is a substring
        # of every string and would otherwise discard every candidate.
        high_conf_spans = [p for p in preds[i].split("|") if p != ""]
        kept_spans = [
            p_low
            for p_low in preds_low_confidence[i].split("|")
            if not any(p in p_low for p in high_conf_spans)
        ]
        preds_low_confidence[i] = "|".join(kept_spans)
    return preds_low_confidence

In [24]:
def check_valid_low_confidence_pred(pred):
    """Heuristically decide whether a low-confidence span looks like a dataset name.

    A span is accepted only when all of the following hold:
      * its cleaned form does not start with an article ("a", "an", "the");
      * its cleaned form does not end with an article, preposition or
        conjunction from the fixed list below;
      * its first raw word starts with an upper-case letter and contains no
        bracket character;
      * its cleaned form contains one of the dataset-like keywords.

    Args:
        pred: Candidate span text.

    Returns:
        True when the span passes every check, otherwise False. Spans that are
        empty, or that clean down to nothing, return False.
    """
    clean_pred = clean_text(pred, True)
    keywords = ["study", "survey", "studies", "database", "dataset", "data system", "system data", "data set", "data base", "program"]
    words = pred.strip().split()
    clean_words = clean_pred.strip().split()
    # Guard against inputs such as "" / "   " / "()" that leave no words to index.
    if not words or not clean_words:
        return False

    if clean_words[0] in ["a", "an", "the"]:
        return False
    if clean_words[-1] in ["a", "an", "the", "in", "on", "of", "for", "and", "or"]:
        return False

    string_check = re.compile(r'[\(\)\[\]]')
    first_word = words[0]
    if first_word[0].isalpha() and first_word[0].isupper() and string_check.search(first_word) is None:
        return any(kw in clean_pred for kw in keywords)
    return False

In [25]:
# create text per id
raw_text_per_id = {}
clean_text_per_id = {}
all_unique_ids = unique_ids

for i in tqdm(range(len(all_unique_ids)), desc="Create raw text per id"):
    full_text = full_texts[i]
    # Key the guard on the document id; the builtin `id` is never a key of the
    # dict, so testing it made the check always true.
    if all_unique_ids[i] not in raw_text_per_id:
        raw_text_per_id[all_unique_ids[i]] = full_text
        clean_text_per_id[all_unique_ids[i]] = clean_text(full_text).strip()

Create raw text per id: 100%|██████████| 4/4 [00:00<00:00, 145.34it/s]


In [26]:
# Get all accepted preds
def get_accepted_preds(preds, preds_low_confidence, cosines, cosine_threshold, tokenizer):
    """Select the final set of cleaned dataset-name predictions for one model.

    High-confidence spans from windows whose cosine score reaches
    ``cosine_threshold`` are counted across windows; spans from the remaining
    windows are appended to that window's low-confidence entry
    (``preds_low_confidence`` is modified in place). A counted high-confidence
    span is accepted when it passes the surface filters and either occurs in
    at least 4 windows or passes ``check_valid_low_confidence_pred``. A
    low-confidence span must pass both the surface filters and
    ``check_valid_low_confidence_pred``.

    Returns:
        De-duplicated list of accepted spans after ``clean_text(...).strip()``.
    """

    def passes_surface_filters(candidate):
        # No "#" (word-piece marker), at least 3 words in raw / cleaned /
        # stop-word-free form, at least 10 characters, no special tokens.
        return (
            ("#" not in candidate)
            and len(clean_text(candidate).strip().split(" ")) >= 3
            and len(candidate.split(" ")) >= 3
            and len(remove_stopwords(candidate).split(" ")) >= 3
            and len(candidate) >= 10
            and check_special_token(candidate, tokenizer)
        )

    accepted_preds = []

    # Pass 1: high-confidence spans, counted once per window.
    high_conf_counts = Counter()
    for i in range(len(preds)):
        if cosines[i] >= cosine_threshold:
            high_conf_counts.update(np.unique(preds[i].split("|")))
        else:
            demoted = preds_low_confidence[i].split("|")
            demoted.extend(preds[i].split("|"))
            preds_low_confidence[i] = "|".join(demoted)

    for k, v in high_conf_counts.items():
        k = k.strip()
        if not passes_surface_filters(k):
            continue
        if v >= 4 or check_valid_low_confidence_pred(k):
            accepted_preds.append(clean_text(k).strip())

    # Pass 2: low-confidence spans (including the ones demoted above).
    low_conf_counts = Counter()
    for i in range(len(preds_low_confidence)):
        if cosines[i] >= -1.0:
            low_conf_counts.update(np.unique(preds_low_confidence[i].split("|")))

    for k, v in low_conf_counts.items():
        k = k.strip()
        if passes_surface_filters(k) and check_valid_low_confidence_pred(k):
            accepted_preds.append(clean_text(k).strip())

    return list(set(accepted_preds))

In [27]:
# Accepted predictions pooled across every model listed in PARAMS.
accepted_preds = []


# One tuple per model: (pretrained input dir, checkpoint input dir,
# attention thresholds for span extraction, cosine threshold for acceptance).
PARAMS = [
    ("pretrainedbiomedrobertabase", "coleridgeinitiativebiomedrobertabasev2", [0.5, 0.7], -0.1),
    ("scibertbasecased", "coleridgeinitiativescibertbasecasedv10", [0.5, 0.7], -0.7),
]

for i, param in enumerate(PARAMS):
    ids, text_ids, inputs, cosines, preds, preds_low_confidence, tokenizer = end2end(
        param[0], 
        param[1], 
        test_df,
        ner_threshold=param[2])

    # Post-processing order matters: acronyms are stripped from both lists before
    # low-confidence spans overlapping a high-confidence span are removed.
    preds = remove_acronym(preds)
    preds_low_confidence = remove_acronym(preds_low_confidence)
    preds_low_confidence = remove_overlap(preds, preds_low_confidence)
    accepted_preds.extend(get_accepted_preds(preds, preds_low_confidence, cosines, param[3], tokenizer))

Model: "metric_learning_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  590592    
_________________________________________________________________
tf_roberta_model (TFRobertaM multiple                  124645632 
Total params: 125,236,224
Trainable params: 125,236,224
Non-trainable params: 0
_________________________________________________________________


100%|██████████| 3/3 [00:16<00:00,  5.35s/it]


Model: "metric_learning_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  590592    
_________________________________________________________________
tf_bert_model (TFBertModel)  multiple                  109938432 
Total params: 110,529,024
Trainable params: 110,529,024
Non-trainable params: 0
_________________________________________________________________


100%|██████████| 3/3 [00:09<00:00,  3.16s/it]


In [28]:
# Drop duplicates that arise when several models accept the same label.
accepted_preds = list(set(accepted_preds))

**Make Submission**

In [29]:
accepted_preds

['nc coastal erosion study',
 'trends in international mathematics and science study',
 'consumer expenditure survey',
 'progress in international reading literacy study',
 'sea level rise risk management study',
 'coastal erosion study',
 'international mathematics and science study',
 'national health and nutrition examination survey national food']

In [30]:
# Map every document id to the list of accepted labels found in its raw text.
group_label_per_id = {}

# The candidate set is the same for every document, so compute it once.
unique_accepted_preds = np.unique(accepted_preds)

for i in tqdm(range(len(all_unique_ids))):
    full_text = raw_text_per_id[all_unique_ids[i]]
    merged_pred_labels = find_all_pred_in_text(full_text, unique_accepted_preds)
    group_label_per_id[all_unique_ids[i]] = list(merged_pred_labels)

100%|██████████| 4/4 [00:00<00:00, 125.91it/s]


In [31]:
def find_valid_ac(long_form, short_form):
    """Check whether ``short_form`` is a plausible acronym of ``long_form``.

    The long form is reduced to the initials of its whitespace-separated
    words. Two candidates are derived from the short form and tried in order:

    1. its first word followed by the initials of its remaining words;
    2. its first word alone.

    Each candidate is scanned right to left. Every alphabetic character must
    equal an initial at or to the left of the previously matched position;
    non-alphabetic characters are skipped. A matched initial is NOT consumed,
    so repeated letters (e.g. the "ss" in "timss") may match the same initial.
    A candidate is accepted only if the scan ends left of the last initial and
    ``short_form`` is not purely digits.

    Args:
        long_form: The full name, e.g. ``"consumer expenditure survey"``.
        short_form: The text found in brackets after the full name.

    Returns:
        ``short_form`` (candidate 1) or its first word (candidate 2), with one
        trailing non-alphanumeric character removed if present; ``None`` when
        neither candidate matches or when either input has no words.
    """
    long_initials = "".join(word[0] for word in long_form.split())
    short_words = short_form.split()

    # Without words on either side there is nothing to match. Returning early
    # avoids an IndexError from the indexing below.
    if not long_initials or not short_words:
        return None

    candidates = [
        short_words[0] + "".join(word[0] for word in short_words[1:]),
        short_words[0],
    ]
    last_long_index = len(long_initials) - 1

    for candidate_no, candidate in enumerate(candidates):
        long_index = last_long_index
        short_index = len(candidate) - 1

        while short_index >= 0:
            current_character = candidate[short_index]
            if not current_character.isalpha():
                # Digits and brackets never have to match an initial.
                short_index -= 1
                continue

            # Walk left through the initials until this character is found.
            while long_initials[long_index] != current_character:
                long_index -= 1
                if long_index < 0:
                    break

            short_index -= 1
            if long_index < 0:
                # Ran out of initials: this candidate cannot match.
                break

        if long_index >= 0 and (not short_form.isdigit()) and long_index < last_long_index:
            accepted = short_form if candidate_no == 0 else short_words[0]
            # Strip a single trailing non-alphanumeric character (e.g. a space or bracket).
            if not (accepted[-1].isalpha() or accepted[-1].isdigit()):
                accepted = accepted[:-1]
            return accepted

    return None

In [32]:
def clean_text_v2(txt):
    """Lower-case ``txt`` and collapse disallowed characters into single spaces.

    ``txt`` is converted with ``str()`` first. Every run of characters other
    than ASCII letters, digits, ``(``, ``)``, ``[`` and ``]`` is replaced by
    one space. The brackets are kept so that bracketed text can still be
    located with a regex afterwards.

    Args:
        txt: Any object; its ``str()`` representation is cleaned.

    Returns:
        The cleaned, lower-cased string.
    """
    # Raw string: the pattern is unchanged, but "\(" and "\[" are no longer
    # invalid escape sequences in a normal string literal.
    return re.sub(r'[^A-Za-z0-9\(\)\[\]]+', ' ', str(txt).lower())

In [33]:
def find_all_acronyms_candidates(group_label_per_id):
    """Append validated acronyms to each document's "|"-joined label string.

    For every key, the matching text in the global ``raw_text_per_id`` is
    cleaned with ``clean_text_v2``. For every non-empty label, the text that
    follows ``"<label> ("`` or ``"<label> ["`` up to the closing bracket is
    collected as an acronym candidate. A candidate is appended as
    ``"|<acronym>"`` when ``find_valid_ac`` accepts it and the result has at
    most two space-separated tokens.

    Args:
        group_label_per_id: dict mapping a document id to a "|"-joined string
            of labels. The dict is modified in place.

    Returns:
        The same dict object, with acronyms appended to its values.
    """
    for doc_id in group_label_per_id.keys():
        cleaned_text = clean_text_v2(raw_text_per_id[doc_id])
        # split() takes a snapshot, so appending to the value below does not
        # feed the new acronyms back into this loop.
        for label in group_label_per_id[doc_id].split("|"):
            if label == "":
                continue

            # Escape the label so a regex metacharacter in a prediction is
            # matched literally instead of changing or breaking the pattern.
            escaped_label = re.escape(label)
            bracketed = re.findall(rf"{escaped_label} \((.*?)\)", cleaned_text)
            bracketed.extend(re.findall(rf"{escaped_label} \[(.*?)\]", cleaned_text))

            # Ignore blank captures, then validate each distinct candidate once.
            distinct_candidates = np.unique(
                [text for text in bracketed if len(text.split()) >= 1]
            )
            for candidate in distinct_candidates:
                acronym = find_valid_ac(label, candidate)
                if acronym is not None and len(acronym.split(" ")) <= 2:
                    group_label_per_id[doc_id] += f"|{acronym}"
    return group_label_per_id

In [34]:
# Turn each document's label list into a single "|"-joined string of its
# distinct non-empty labels. Documents with no non-empty label get ''.
for doc_id in list(group_label_per_id):
    distinct_labels = np.unique(group_label_per_id[doc_id])
    non_empty_labels = [label for label in distinct_labels if label != '']
    group_label_per_id[doc_id] = "|".join(non_empty_labels)

In [35]:
# Append validated acronyms to each document's label string. The function
# updates the dict in place and returns that same dict.
group_label_per_id = find_all_acronyms_candidates(group_label_per_id)

In [36]:
# Build the per-document prediction lists (y_pred) and the matching ids (y_ids).
y_pred = []
y_ids = []
for doc_id, label_string in group_label_per_id.items():
    # De-duplicate and sort the raw labels, then clean each one into a plain
    # Python list. Writing cleaned values back into the fixed-width array from
    # np.unique would silently truncate any value longer than the widest
    # original label.
    distinct_labels = sorted(set(label_string.split("|")))
    y_pred.append([clean_text(label).strip() for label in distinct_labels])
    y_ids.append(doc_id)

In [37]:
# Join each document's predictions into one "|"-delimited string and trim
# surrounding whitespace.
y_pred_merged = ["|".join(doc_labels).strip() for doc_labels in y_pred]

In [38]:
# Assemble the two-column submission table and write it without the index column.
submission = pd.DataFrame({
    'Id': y_ids,
    'PredictionString': y_pred_merged,
})
submission.to_csv("submission.csv", index=False)

In [39]:
# Preview the first lines of the submission file written in the previous cell.
!head submission.csv

Id,PredictionString
8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,ces|consumer expenditure survey|national health and nutrition examination survey national food
2100032a-7c33-4bff-97ef-690822c43466,
2f392438-e215-4169-bebf-21ac4ff253e1,pirls|pirls 2006|progress in international reading literacy study|timss|timss 2007|trends in international mathematics and science study
3f316b38-1a24-45a9-8d8c-4e05a42257c6,coastal erosion study|nc coastal erosion study|sea level rise risk management study|slrrms
