# 6. Evaluation

How does our system perform matching tokens?

## Imports

In [32]:
from glob import glob
from tqdm.auto import tqdm
import json
import numpy as np
import os
import sklearn.metrics
import statistics
import itertools
import multiprocessing
import pandas as pd

from pathlib import Path
from scipy.spatial import cKDTree

In [3]:
from picto2vec.testset import TestSet

In [1]:
from constants import *

## Filtering lexunits

For some reason, I retrieved more senses than we should have. Don't ask. Therefore, I'll make a list of all allowed senses, and we'll see from there.

In [4]:
legal_lexunits = []
with open("test_senses.json", "rt") as reader:
    senses = json.loads(reader.read())

for record in senses:
    if record["lexunit"] is None or record["lexunit"] == False:
        continue
    
    legal_lexunits += record["lexunit"].split(",")

For each sense, we get all medoids and associate their representations with their head sense.

In [7]:
def create_vector_list(medoid_count, layer_index):
    vectors = []
    names = []
    
    layer_path = f"{VECTORS}/medoid_{medoid_count}/layer_{layer_index}/*"
    sense_file_list = glob(layer_path)
    
    for sense_file in tqdm(sense_file_list):
        if Path(sense_file).stem not in legal_lexunits:
            continue
        
        with open(sense_file, "rt") as reader:
            sense_vectors = json.loads(reader.read())
            if len(sense_vectors) != medoid_count:
                raise Exception(f"Sense vectors should come in groups of {medoid_count}")
                
            vectors += sense_vectors
                
            for i in list(range(0, medoid_count)):
                name_tuple = (Path(sense_file).stem, i)
                names.append(name_tuple)
                
    return np.array(vectors), names

In [8]:
def lexunit_to_record(lexunit):
    for record in senses:
        if record["lexunit"] is None or record["lexunit"] == False:
            continue
            
        if lexunit in record["lexunit"]:
            return record
    
    return None

In [9]:
def get_all_sense_lexunits(lexunit):
    return lexunit_to_record(lexunit)["lexunit"].split(",")

In [10]:
def lexunit_to_synset(lexunit):
    return lexunit_to_record(lexunit)["synset"]

## Actual evaluation

We define our medoids again, and the number of layers of BERT.

In [43]:
medoids = [3, 5, 7, 10]
layer_indices = list(range(0, 13))

We will test the system's performance using all possible combinations.

In [44]:
combinations = list(itertools.product(medoids, layer_indices))

Evaluation is done for a medoid count and a layer index. The evaluation function will load all test set representations (which are conveniently also already created, they are all representations beyond :30 in our dataset). We let the system predict the correct lexunit and compute some evaluation measures.

In [45]:
def evaluate(medoid_count, layer_index):
    candidates, names = create_vector_list(medoid_count, layer_index)
    candidate_tree = cKDTree(candidates)
    
    sense_files = glob(f"{SENSE_EX_VEC}/*")
    sense_files = list(filter(lambda sense_file: Path(sense_file).stem in legal_lexunits, sense_files))
    
    y_true = []
    y_pred = []
    labels = []
    certainty = []
    
    for sense_file in tqdm(sense_files):
        with open(sense_file, "rt") as reader:
            data = json.loads(reader.read())
    
        sense = TestSet(Path(sense_file).stem, data)
        
        #if sense.total_representation_count() <= 30:
        #    continue
            
        if sense.name not in legal_lexunits:
            continue
        
        synset = lexunit_to_synset(sense.name)
                    
        representations = sense.get_representations(layer_index)
        for representation in representations:
            # Add correct prediction and label to list
            y_true.append(synset)
            labels.append(synset)
            
            token_vector = np.array(representation)

            nearest_neighbor_indices = candidate_tree.query(token_vector, k=medoid_count)[1]
            results_lexunit_tuples = list(map(lambda neighbour_index: names[neighbour_index], nearest_neighbor_indices))
            results = list(map(lambda lexunit_tuple: (lexunit_to_synset(lexunit_tuple[0]), lexunit_tuple[1]), results_lexunit_tuples))
            
            sense_guesses = {}
            for sense_medoid, medoid_index in results:
                if not sense_medoid in sense_guesses:
                    sense_guesses[sense_medoid] = 0
                
                sense_guesses[sense_medoid] += 1
        
            # TODO: what if we have a tie?
            max_value = max(sense_guesses.values())
            guesses = [key for key, value in sense_guesses.items() if value == max_value]
            #print(guesses)
            
            if len(guesses) == 1:
                y_pred.append(guesses[0])
                certainty.append(max_value / medoid_count)
            else:
                y_pred.append("UNSURE")
                certainty.append(None)
                
    return y_true, y_pred, labels, certainty, (y_pred.count("UNSURE") / len(y_pred))

We tie everything together using multi processing.

In [46]:
pool = multiprocessing.Pool()
results = pool.starmap(evaluate, combinations)
pool.close()
pool.join()

We write all res

In [47]:
with open("results.json", "wt") as writer:
    writer.write(json.dumps(results))

In [48]:
df_rows = []
index = 0
for y_true, y_pred, labels, certainty, unsure_ratio in results:
    medoid_count, layer_index = combinations[index]
    
    f1 = sklearn.metrics.f1_score(y_true, y_pred, labels=labels, average="macro")
    filtered = list(filter(lambda entry: entry is not None, certainty))
    mean_certainty = 0
    if len(filtered) > 0:
        mean_certainty = statistics.mean(filtered)
    
    row = { "medoid_count": medoid_count,
            "layer_index": layer_index,
            "f1": f1,
            "mean_certainty": mean_certainty,
            "unsure_ratio": unsure_ratio }
    df_rows.append(row)
    index += 1
df = pd.DataFrame.from_dict(df_rows)

In [49]:
df.to_csv("results.csv", index=False)

In [220]:
y_true, y_pred, labels, certainty, unsure_ratio = evaluate(10, 0)

  0%|          | 0/8306 [00:00<?, ?it/s]

10230 10230


  0%|          | 0/1113 [00:00<?, ?it/s]

In [221]:
sklearn.metrics.f1_score(y_true, y_pred, labels=labels, average="macro")

0.7062645184960138

In [222]:
statistics.mean(list(filter(lambda entry: entry is not None, certainty)))

0.658324059189646

In [223]:
unsure_ratio

0.18769879800448655