In [1]:
import sys
sys.path.append("../code")
from dataset import build_dataset, build_dataloader
from config import DataArguments, TrainerArguments, ModelArguments

In [4]:
data_args = DataArguments()
training_args = TrainerArguments()
model_args = ModelArguments()

data_args.data_dir = "../data/"
data_args.asset_dir = "../assets/"
model_args.asset_dir = data_args.asset_dir
data_args.seed = training_args.seed
data_args.max_seq_len = model_args.max_seq_len

data_args.init_pct = 1

train_dataset, model_args.vocab_size, model_args.num_labels = build_dataset(data_args, "train")
# train_dataloader = build_dataloader(train_dataset, data_args)

[11/23/2021 14:31:45] INFO - dataset: Initialize Train Dataset.
[11/23/2021 14:31:45] INFO - dataset: Remove abstract.
[11/23/2021 14:31:45] INFO - dataset: Remove task_id
[11/23/2021 14:31:45] INFO - dataset: Use the full dataset, for train dataset of total 47250 papers.
[11/23/2021 14:31:45] INFO - dataset: Train dataset was successfully initialized.
[11/23/2021 14:31:47] INFO - preprocess: Successfully loaded Spacy Tokenizer, en_core_web_trf
[11/23/2021 14:31:47] INFO - dataset: Successfully loaded mapper file ..\assets\area2idx.json
100%|██████████| 48/48 [31:09<00:00, 38.94s/ba]
[11/23/2021 15:03:01] INFO - dataset: train dataset was properly preprocessed.


In [5]:
preprocessed_data_dir = "../data/tokenized_paperswtihcode/"

train_dataset.save_to_disk(f"{preprocessed_data_dir}/train")

In [None]:
train_dataset

In [6]:
valid_dataset, model_args.vocab_size, model_args.num_labels = build_dataset(data_args, "valid")
test_dataset, model_args.vocab_size, model_args.num_labels = build_dataset(data_args, "test")

[11/23/2021 15:48:15] INFO - dataset: Initialize Valid Dataset.
[11/23/2021 15:48:15] INFO - dataset: Remove abstract.
[11/23/2021 15:48:15] INFO - dataset: Remove task_id
[11/23/2021 15:48:15] INFO - dataset: Use the full dataset, for valid dataset of total 2625 papers.
[11/23/2021 15:48:15] INFO - dataset: Valid dataset was successfully initialized.
[11/23/2021 15:48:17] INFO - preprocess: Successfully loaded Spacy Tokenizer, en_core_web_trf
[11/23/2021 15:48:17] INFO - dataset: Successfully loaded mapper file ..\assets\area2idx.json
100%|██████████| 3/3 [01:45<00:00, 35.02s/ba]
[11/23/2021 15:50:05] INFO - dataset: valid dataset was properly preprocessed.
[11/23/2021 15:50:05] INFO - dataset: Initialize Test Dataset.
[11/23/2021 15:50:05] INFO - dataset: Remove abstract.
[11/23/2021 15:50:05] INFO - dataset: Remove task_id
[11/23/2021 15:50:05] INFO - dataset: Use the full dataset, for test dataset of total 2625 papers.
[11/23/2021 15:50:05] INFO - dataset: Test dataset was successf

In [8]:
valid_dataset.save_to_disk(f'{preprocessed_data_dir}/valid')
test_dataset.save_to_disk(f'{preprocessed_data_dir}/test')

In [15]:
import numpy as np
confidence_level = np.random.randint(low=0, high=1, size=(2625,))

In [98]:
from torch.utils.data.dataloader import DataLoader

valid_dl = DataLoader(valid_dataset, batch_size=16, pin_memory=True)

In [97]:
confidence_level = np.random.randint(0, 20, 10)
idx = confidence_level.argsort()
pool_idx = idx[:7]
acquired_idx = idx[7:]

In [20]:
def add_confidence_level(x, new_info_x):

    x["confidence_level"] = new_info_x
    return x

In [93]:
valid_dataset.add_column(name="confidence_level", column=confidence_level)

Dataset({
    features: ['input_ids', 'labels', 'confidence_level'],
    num_rows: 2625
})

In [42]:
import numpy as np
from sklearn.metrics import accuracy_score

logit = np.array([
    [0.1, 0.1, 0.8],
    [0.2, 0.7, 0.1],
])
ans = np.array([2, 1])

In [49]:
import math
import numpy as np
from scipy.stats import entropy

import torch


def check_torch(logits):

    if isinstance(logits, torch.tensor):
        logits = logits.numpy()
    return logits


def least_confidence(logits):

    logits = check_torch(logits)

    most_conf = np.nanmax(logits, axis=1)
    num_labels = logits.shape[1]
    numerator = num_labels * (1 - most_conf)
    denominator = num_labels - 1
    return numerator / denominator


def margin_of_confidence(logits: np.ndarray):

    logits = check_torch(logits)

    part = np.partition(-logits, 1, axis=1)
    margin = -part[:, 0] + part[:, 1]
    return margin

def entropy(logits, dim: int, keepdim: bool = False):
    return -torch.sum((torch.exp(logits) * logits).double(), dim=dim, keepdim=keepdim)



def logit_mean(logits, dim: int, keepdim: bool = False):
    r"""Computes $\log \left ( \frac{1}{n} \sum_i p_i \right ) =
    \log \left ( \frac{1}{n} \sum_i e^{\log p_i} \right )$.
    We pass in logits.
    """
    return torch.logsumexp(logits, dim=dim, keepdim=keepdim) - math.log(
        logits.shape[dim]
    )


def mutual_information(logits_B_K_C):
    sample_entropies_B_K = entropy(logits_B_K_C, dim=-1)
    entropy_mean_B = torch.mean(sample_entropies_B_K, dim=1)

    logits_mean_B_C = logit_mean(logits_B_K_C, dim=1)
    mean_entropy_B = entropy(logits_mean_B_C, dim=-1)

    mutual_info_B = mean_entropy_B - entropy_mean_B
    return mutual_info_B

    
def bald_acquisition(logits):

    return mutual_information(logits)

In [44]:
least_confidence(logit)

array([0.3 , 0.45])

In [45]:
margin_of_confidence(logit)

array([0.7, 0.5])

In [47]:
def entropy(logits):

    return (logits * np.log1p(logits)).sum(axis=1)

entropy(logit)

array([0.48929137, 0.41743511])

In [52]:
logit_t = torch.tensor(logit)
logit_t += 1

In [90]:
dir(train_dataset)

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_index_is_initialized',
 '_data',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_split',
 'add_column',
 'add_elasticsearch_index',
 'add_faiss_index',
 'add_faiss_index_from_external_arrays',
 'add_item',
 'align_labels_with_mapping',
 'builder_name',
 'cache_files',
 'cast',
 'cast_',
 'cast_column',


In [89]:
idx = np.random.randint(0, len(train_dataset), 2625)
train_dataset[idx]

{'input_ids': [[0,
   43043,
   5137,
   163,
   5003,
   11,
   272,
   1889,
   12,
   805,
   5423,
   2049,
   36197,
   13,
   7090,
   1960,
   12349,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   24514,
   28376,
   12,
   20930,
   28688,
   1938,
   15,
   32225,
   268,
   710,
   40289,
   35,
   2647,
   12,
   510,
   7878,
   1825,
   8,
   30750,
   12,

In [80]:
torch.permute(logits, (0, 1)).size()

AttributeError: module 'torch' has no attribute 'permute'

In [69]:
bald_acquisition(logits)

tensor([0.4476, 0.4476, 0.4476, 0.4476], dtype=torch.float64)

In [84]:
logits.permute((1, 0, 2))

torch.Size([2, 4, 3])