### Prepare the notebook to have access to all files

In [1]:
import os
import sys
# Make the parent of the working directory importable; it is treated as the
# project root (assumes the notebook is started from a direct sub-folder of it).
__filename__ = os.path.abspath(os.curdir)
__dirname__ = os.path.dirname(__filename__)
# only append when missing, so re-running the cell does not grow sys.path
if sys.path.count(__dirname__) == 0:
    sys.path.append(__dirname__)

### Visualization and processing libraries

In [2]:
import random
import pandas as pd
from tqdm import notebook
tqdmn = notebook.tqdm

### Debugging Tool

In [3]:
from importlib import reload

# Data Loading

In [4]:
from library.data_loader import get_test_datasets

In [5]:
# one placeholder per test data set; the values stay None until they are loaded
test_data = dict.fromkeys(("data_en_de", "data_en_fr", "data_en_tl"))

In [6]:
# replace every placeholder with what get_test_datasets returns for its key
for dataset_name in tuple(test_data):
    test_data[dataset_name] = get_test_datasets(dataset_name)

# Model Libraries

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
# import transformers
from transformers import BertTokenizer
# import datasets
from datasets import Dataset
# import the NEW method
import library.bert_mean

In [8]:
# re-import library.bert_mean so edits to the module are picked up without
# restarting the kernel (debugging aid)
reload(library.bert_mean)

<module 'library.bert_mean' from '/home/erikn/Documents/code/papers/Paper-2021-IPR/library/bert_mean.py'>

In [9]:
# run on the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [10]:
# load the pretrained tokenizer registered under the name
# 'bert-base-multilingual-cased'; it is used for both queries and documents
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [11]:
# file the model's state dict is loaded from (relative to the working directory)
PATH = '../data/models/bert_mean_sasaki18.pth'

In [12]:
# load the NEW method used for CLIR
model = library.bert_mean.BERT_MEAN()
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint that was saved from a GPU run also loads on a CPU-only machine
model.load_state_dict(torch.load(PATH, map_location=device))
# the notebook only evaluates: call eval() and move the model to the device
model = model.eval().to(device)

In [13]:
def format_evaluation(d):
    """Reduce an evaluation record to the columns shown in the result tables.

    Args:
        d: mapping that contains at least the keys "model", "q_lang",
            "d_lang", "P@1" and "MAP".

    Returns:
        A new dict with exactly those five entries, in that order.

    Raises:
        KeyError: if one of the five keys is missing from ``d``.
    """
    columns = ("model", "q_lang", "d_lang", "P@1", "MAP")
    return {column: d[column] for column in columns}

In [14]:
def evaluate(batch):
    """Tokenize a query and its documents and run the model on the result.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch: mapping with a 'query' and a 'documents' entry; each is handed
            to the tokenizer as-is.

    Returns:
        Whatever ``model`` returns when called with the six keyword tensors
        ``q_*`` / ``d_*`` (input ids, attention mask, token type ids).
    """
    tokenizer_options = dict(truncation=True, padding=True, return_tensors='pt')
    tensor_names = ('input_ids', 'attention_mask', 'token_type_ids')

    examples = {}
    # 'q_' entries are built from the query, 'd_' entries from the documents
    for prefix, source_key in (('q', 'query'), ('d', 'documents')):
        encoded = tokenizer(batch[source_key], **tokenizer_options)
        for tensor_name in tensor_names:
            # every tensor is moved to the device the model was placed on
            examples[f'{prefix}_{tensor_name}'] = encoded[tensor_name].to(device)

    return model(**examples)

In [15]:
def get_average_precision_at_k(params, model):
    """Compute P@1 and average precision for one query.

    The documents are ranked by the values in the first element of what
    ``evaluate(params)`` returns, in ascending order (smallest value first).
    NOTE(review): that element is treated as one distance per document, in the
    same order as ``params["relevance"]`` - confirm against the model's forward.

    Args:
        params: mapping with "query", "documents" and "relevance"; "relevance"
            is a list of numeric labels, one per document.
        model: unused here; ``evaluate`` works with the module-level model. The
            parameter is kept so existing callers keep working.

    Returns:
        dict with "P@1" (precision at rank 1) and "AveP" (average precision).
        "AveP" is 0.0 when the query has no relevant document; the plain
        formula would divide by zero there and yield NaN, which would turn any
        mean over queries into NaN as well.
    """
    # get parameters for calculation
    labels = torch.Tensor(params["relevance"])
    # run the model without tracking gradients
    with torch.no_grad():
        outputs = evaluate(params)
        # keep only the ranking scores, detached and on the CPU
        distances = outputs[0].detach().cpu()
        # drop the reference to the remaining outputs
        del outputs
    # release cached GPU memory between queries
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    # ascending order of the scores
    sort_indices = distances.argsort()
    # relevance labels in ranked order
    RelAtK = labels[sort_indices]
    # cumulative number of relevant documents down the ranking
    cum_labels = RelAtK.cumsum(dim=0)
    # precision at every rank k = 1..n
    ranks = torch.arange(1, labels.shape[0] + 1, dtype=cum_labels.dtype)
    PatK = cum_labels / ranks
    # Ground Truth Positives
    GTP = RelAtK.sum()
    # Average Precision for the given query, guarded against GTP == 0
    if GTP.item() == 0:
        AveP = 0.0
    else:
        AveP = (1 / GTP * (RelAtK * PatK).sum()).item()
    return { "P@1": PatK[0].item(), "AveP": AveP }

In [16]:
def lm_process_dataset(dataset, d_lang, model_type, model):
    """Evaluate every example of a data set and average the per-query metrics.

    For each example the documents and their relevance labels are reordered
    with one shared random permutation before they are scored.

    Args:
        dataset: iterable of mappings with "query", "documents" and "relevance".
        d_lang: document language tag copied into the result.
        model_type: model label copied into the result.
        model: forwarded to ``get_average_precision_at_k``.

    Returns:
        A one-element list holding a dict with "q_lang" (always "en"),
        "d_lang", "P@1", "MAP" and "model".

    Raises:
        ZeroDivisionError: if ``dataset`` yields no examples.
    """
    per_query_scores = []
    for example in tqdmn(dataset):
        query = example["query"]
        docs = example["documents"]
        rels = example["relevance"]

        # one permutation, applied to documents and relevance alike
        order = list(range(len(docs)))
        random.shuffle(order)

        params = {
            "query": query,
            "documents": [docs[position] for position in order],
            "relevance": [rels[order[slot]] for slot in range(len(rels))],
        }
        per_query_scores.append(get_average_precision_at_k(params, model))

    # averages over all queries of the data set
    mean_p_at_1 = sum(score["P@1"] for score in per_query_scores) / len(per_query_scores)
    mean_ave_p = sum(score["AveP"] for score in per_query_scores) / len(per_query_scores)

    summary = {
        "q_lang": "en",
        "d_lang": d_lang,
        "P@1": mean_p_at_1,
        "MAP": mean_ave_p,
        "model": model_type,
    }
    return [summary]

In [None]:
# score the model on the "data_en_de" test data; the result is a one-element
# list holding the P@1 and MAP averaged over all queries
bert_evaluation_de = lm_process_dataset(test_data["data_en_de"], "de", "mbert", model)

HBox(children=(FloatProgress(value=0.0, max=42021.0), HTML(value='')))

In [None]:
# show the aggregated "de" result as a one-row table
pd.DataFrame([format_evaluation(bert_evaluation_de[0])])

In [None]:
# score the model on the "data_en_fr" test data; the result is a one-element
# list holding the P@1 and MAP averaged over all queries
bert_evaluation_fr = lm_process_dataset(test_data["data_en_fr"], "fr", "mbert", model)

In [None]:
# show the aggregated "fr" result as a one-row table
pd.DataFrame([format_evaluation(bert_evaluation_fr[0])])

In [None]:
# score the model on the "data_en_tl" test data; the result is a one-element
# list holding the P@1 and MAP averaged over all queries
bert_evaluation_tl = lm_process_dataset(test_data["data_en_tl"], "tl", "mbert", model)

In [None]:
# show the aggregated "tl" result as a one-row table
pd.DataFrame([format_evaluation(bert_evaluation_tl[0])])

In [None]:
# gather the per-language results (de, fr, tl - in that order) into one list
bert_evaluation = [*bert_evaluation_de, *bert_evaluation_fr, *bert_evaluation_tl]

In [None]:
# one table row per evaluated language pair
pd.DataFrame(list(map(format_evaluation, bert_evaluation)))