In [20]:
import os
import sys
# get the project root
__filename__ = os.path.abspath(".")
__dirname__ = os.path.dirname(__filename__)
if __dirname__ not in sys.path:
    sys.path.append(__dirname__)

In [2]:

from importlib import reload

In [3]:
import json
from tqdm import notebook
tqdmn = notebook.tqdm

In [77]:
import pandas as pd
import torch
import random

# Loading the data sets

In [5]:
import re
import numpy as np

In [6]:
def open_training_set(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.readlines()

In [7]:
def format_row(string):
    splitted_values = re.split(r'\t+', string)
    
    if len(splitted_values) == 3:
        rel, query, document = splitted_values
        return { "query": query.strip(), "documents": document.strip(), "relevance": rel.strip() }
    else:
        return None

In [8]:
def format_dataset(array):
    return [format_row(row) for row in array]

In [64]:
def prepare_dataset(filepath):   
    file = open_training_set(filepath)
    file = format_dataset(file)
    
    dataset = {}
    dataset["query"] = [el["query"] for el in file if el]
    dataset["documents"] = [el["documents"] for el in file if el]
    dataset["relevance"] = [el["relevance"] for el in file if el]
    return dataset

In [29]:
def get_train_datasets(datatype):
    # prepare the dataset paths
    train_path = f"../../data/clir/sasaki18/{datatype}/train.txt"
    # load the datasets
    return prepare_dataset(train_path)

In [30]:
train_data = {
    "data_en_de": None,
    "data_en_fr": None,
    "data_en_tl": None,
}

In [31]:
for key in data.keys():
    train_data[key] = get_train_datasets(key)

## load the Word Embeddings

In [23]:
import library.text_embeds as TE

In [26]:
# prepare language mapping
language_mapping = {
    "de": "german",
    "en": "english",
    "fr": "french",
    "tl": "tagalog"
}

In [27]:
models = {}

In [28]:
for language in tqdmn(language_mapping.keys(), desc="Language Models: "):
    model_path = __dirname__ + '/data/fasttext/wiki.{}.align.vec'.format(language)
    # append the language model
    models[language] = TE.TextEmbedding(language_mapping[language], model_path)

HBox(children=(FloatProgress(value=0.0, description='Language Models: ', max=4.0, style=ProgressStyle(descript…




### Update the models with the training set vocabulary

In [32]:
models["de"].add_vocabulary_corpus(train_data["data_en_de"]["documents"])
models["fr"].add_vocabulary_corpus(train_data["data_en_fr"]["documents"])
models["tl"].add_vocabulary_corpus(train_data["data_en_tl"]["documents"])

# Load the Test Datasets

In [45]:
# import datasets
from datasets import Dataset

In [33]:
def get_test_datasets(datatype):
    # prepare the dataset paths
    test_path = f"../../data/clir/sasaki18/{datatype}/test1.txt"
    # load the datasets
    return prepare_dataset(test_path)

In [65]:
test_data = {
    "data_en_de": None,
    "data_en_fr": None,
    "data_en_tl": None,
}

In [66]:
for key in data.keys():
    test_data[key] = get_test_datasets(key)

In [67]:
batch_size = 40

In [68]:
def format_evaluation(d):
    return {
        "q_lang": d["q_lang"],
        "d_lang": d["d_lang"],
        "P@1": d["AveP@1"],
        "MAP": d["MAP"],
        "model": d["model"]
    }

## Evaluate WE with Weights

In [72]:
def get_average_precision_at_k_weight(params):    
    # get parameters
    query = params["query"]
    documents = params["documents"]
    relevance = params["relevance"]
    q_lang = params["q_lang"]
    d_lang = params["d_lang"]
    d_weight = params["d_weight"]
    
    # shuffle the indices
    index_shuffle = list(range(len(documents)))
    random.shuffle(index_shuffle)
    documents = [documents[index_shuffle[i]] for i in range(len(documents))]
    relevance = [relevance[index_shuffle[i]] for i in range(len(relevance))]
    
    # get query vector (normalized)
    q_vector = models[q_lang].text_embedding(query, weight="tf", normalize=True)
    q_vector = np.array(q_vector)
    
    # get documents vectors (normalized)
    d_vectors = [models[d_lang].text_embedding(d, weight=d_weight, normalize=True) for d in documents]
    d_vectors = np.array(d_vectors)
    
    #  get the cosine similarity between documents and vectors
    cosine_similarity = np.matmul(d_vectors, q_vector)
    sort_indices = np.argsort(cosine_similarity)[::-1]
    
    # sort the relevance values based on the similarity order
    sort_rel = np.array([relevance[ind] for ind in sort_indices])
    
    # get the cummulative sum over the whole relevance list
    cum_rel = np.cumsum(sort_rel)
    # calculate the precision at k value over the whole list
    PatK = [sort_rel[idx] / (idx + 1) for idx in range(0, len(relevance))]
    
    # Group Truth Positives
    GTP = sum(sort_rel)
    # Average Precision for given query
    AveP = 1 / GTP * sum(sort_rel * PatK)
    
    # return the Precision@1 and Average Precision
    return { "P@1": PatK[0], "AveP": AveP, "P@k": PatK }

In [79]:
def we_weight_process_dataset(dataset, d_lang):
    
    # setup the vocabulary of the models
    data = Dataset.from_dict(dataset)
    # prepare the dataset loader
    data = torch.utils.data.DataLoader(data, batch_size=batch_size)
    
    # set a placeholder for evaluation results
    evaluation = []
    
    for d_weight in ["tf", "idf", "tfidf"]:
        values = []
        for example in tqdmn(data, desc=f"WE model={d_weight}"):
            params = {
                "query": example["query"][0],
                "documents": example["documents"],
                "relevance": example["relevance"],
                "q_lang": "en",
                "d_lang": d_lang,
                "d_weight": d_weight
            }
            values.append(get_average_precision_at_k_weight(params))
        
        evaluation.append({
            "q_lang": "en",
            "d_lang": d_lang,
            "AveP@1": sum([v["P@1"] for v in values]) / len(values),
            "MAP": sum([v["AveP"] for v in values]) / len(values),
            "model": d_weight
        })
        
    # return the evaluation results
    return evaluation

In [80]:
we_weight_evaluation_de = we_weight_process_dataset(test_data["data_en_de"], "de")

HBox(children=(FloatProgress(value=0.0, description='WE model=tf', max=42021.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='WE model=idf', max=42021.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='WE model=tfidf', max=42021.0, style=ProgressStyle(descrip…




In [83]:
we_weight_evaluation_fr = we_weight_process_dataset(test_data["data_en_fr"], "fr")

HBox(children=(FloatProgress(value=0.0, description='WE model=tf', max=54196.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='WE model=idf', max=54196.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='WE model=tfidf', max=54196.0, style=ProgressStyle(descrip…




In [85]:
we_weight_evaluation_tl = we_weight_process_dataset(test_data["data_en_tl"], "tl")

HBox(children=(FloatProgress(value=0.0, description='WE model=tf', max=2359.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='WE model=idf', max=2359.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='WE model=tfidf', max=2359.0, style=ProgressStyle(descript…




In [86]:
we_weight = we_weight_evaluation_de + we_weight_evaluation_fr + we_weight_evaluation_tl

In [87]:
pd.DataFrame([format_evaluation(e) for e in we_weight])

Unnamed: 0,q_lang,d_lang,P@1,MAP,model
0,en,de,0.736655,0.835902,tf
1,en,de,0.743961,0.840405,idf
2,en,de,0.763785,0.854061,tfidf
3,en,fr,0.717581,0.823263,tf
4,en,fr,0.718023,0.822198,idf
5,en,fr,0.753598,0.845904,tfidf
6,en,tl,0.589657,0.704402,tf
7,en,tl,0.59432,0.711674,idf
8,en,tl,0.597287,0.710278,tfidf


## Evaluate WE with EMD

In [94]:
import ot

In [95]:
def wmd(model1, model2, text1, text2):
    # get the tokens and their appearances
    tokens1 = model1.tokenize(text1)
    tokens2 = model2.tokenize(text2)
    
    # get the token distributions
    values1 = [value for key, value in tokens1 if key in model1.embedding]
    values2 = [value for key, value in tokens2 if key in model2.embedding]
    
    if len(values1) == 0 or len(values2) == 0:
        return 100
    
    # get embeddings
    keys1 = [key for key, value in tokens1]
    keys2 = [key for key, value in tokens2]
    embeds_text1 = model1.word_embeddings(keys1)
    embeds_text2 = model2.word_embeddings(keys2)
    
    # calculate cost matrix
    cost_matrix = np.matmul(embeds_text1, embeds_text2.T)
    cost_matrix = np.ones(cost_matrix.shape) - cost_matrix
    
    dist_text1 = np.array(values1) / sum(values1)
    dist_text2 = np.array(values2) / sum(values2)
    
    return ot.emd2(dist_text1, dist_text2, cost_matrix)

In [99]:
def get_average_precision_at_k_emd(params):
    
    # get parameters
    query = params["query"]
    documents = params["documents"]
    relevance = params["relevance"]
    q_lang = params["q_lang"]
    d_lang = params["d_lang"]
    
    # shuffle the indices
    index_shuffle = list(range(len(documents)))
    random.shuffle(index_shuffle)
    
    s_documents = []
    s_relevance = []
    for i in index_shuffle:
        s_documents.append(documents[i])
        s_relevance.append(1 if int(relevance[i]) > 0 else 0)
    
    # get similarity
    similarity = np.array([wmd(models[q_lang], models[d_lang], query, d) for d in s_documents])
    sort_indices = np.argsort(similarity)
    
    # sort the relevance values based on the similarity order
    sort_rel = np.array([s_relevance[ind] for ind in sort_indices])
    
    # get the cummulative sum over the whole relevance list
    cum_rel = np.cumsum(sort_rel)
    # calculate the precision at k value over the whole list
    PatK = [sort_rel[idx] / (idx + 1) for idx in range(0, len(s_relevance))]
    
    # Group Truth Positives
    GTP = sum(sort_rel)
    # Average Precision for given query
    AveP = 1 / GTP * sum(sort_rel * PatK)
    
    # return the Precision@1 and Average Precision
    return { "P@1": PatK[0], "AveP": AveP, "P@k": PatK }

In [100]:
def we_emd_process_dataset(dataset, d_lang):
    
    # setup the vocabulary of the models
    data = Dataset.from_dict(dataset)
    # prepare the dataset loader
    data = torch.utils.data.DataLoader(data, batch_size=batch_size)
    
    values = []
    for example in tqdmn(data, desc=f"WE model=EMD"):
        params = {
            "query": example["query"][0],
            "documents": example["document"],
            "relevance": example["rel"],
            "q_lang": "en",
            "d_lang": d_lang,
        }
        values.append(get_average_precision_at_k_emd(params))
        
    return [{
        "q_lang": "en",
        "d_lang": d_lang,
        "MAP": sum([v["AveP"] for v in values]) / len(values),
        "AveP@1": sum([v["P@1"] for v in values]) / len(values),
        "model": "emd",
    }]
        
    # return the evaluation results
    return evaluation

In [102]:
we_emd_evaluation_de = we_emd_process_dataset(test_data["data_en_de"], "de")

HBox(children=(FloatProgress(value=0.0, description='WE model=EMD', max=42021.0, style=ProgressStyle(descripti…




In [104]:
we_emd_evaluation_fr = we_emd_process_dataset(test_data["data_en_fr"], "fr")

HBox(children=(FloatProgress(value=0.0, description='WE model=EMD', max=54196.0, style=ProgressStyle(descripti…




In [106]:
we_emd_evaluation_tl = we_emd_process_dataset(test_data["data_en_tl"], "tl")

HBox(children=(FloatProgress(value=0.0, description='WE model=EMD', max=2359.0, style=ProgressStyle(descriptio…




In [108]:
we_emd = we_emd_evaluation_de + we_emd_evaluation_fr + we_emd_evaluation_tl

In [109]:
pd.DataFrame([format_evaluation(e) for e in we_emd])

Unnamed: 0,q_lang,d_lang,P@1,MAP,model
0,en,de,0.867281,0.91934,emd
1,en,fr,0.828216,0.894646,emd
2,en,tl,0.606189,0.722897,emd
