# RETRIEVAL METRICS + EXAMPLES

Useful information sources:

METRICS FOR MULTI-CLASS CLASSIFICATION: AN OVERVIEW
https://arxiv.org/pdf/2008.05756.pdf

An Introduction to Information Retrieval
https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

Evaluation Metrics - INLS 509: Information Retrieval
https://ils.unc.edu/courses/2013_spring/inls509_001/lectures/10-EvaluationMetrics.pdf


In [21]:
import pandas as pd
import numpy as np

## Data to evaluate

In [22]:
# Identify the experiment whose retrieval results are evaluated in this notebook.
RUN = "mi3dor_trainsynth_testreal_resize"
FEATURE_EXTRACTOR = "vit_in21k_finetuned"
SPLIT = "traintest_full"

# Image-level retrieval results (pickled DataFrame) for this run.
retrieval_df_path = (
    f"./out/{RUN}/{FEATURE_EXTRACTOR}/{SPLIT}"
    "/retrieval_results/img_retrieval_df.pickle"
)
# Dataset table (CSV) for this run.
dataset_df_path = (
    f"out/{RUN}/{FEATURE_EXTRACTOR}/{SPLIT}"
    "/dataset.csv"
)

# The CSV is read first, then the pickle, so a failing pickle load still leaves
# `dataset_df` defined.
dataset_df = pd.read_csv(dataset_df_path)
img_retrieval_df = pd.read_pickle(retrieval_df_path)

# Preview both frames.
display(img_retrieval_df.head(), dataset_df.head())


UnpicklingError: pickle data was truncated

## Example Data

### minimal_example retrieval

In [None]:
# RUN="minimal_example"
# FEATURE_EXTRACTOR="vit_in21k"
# SPLIT="traintest_full"

# retrieval_df_path = f"./out/{RUN}/{FEATURE_EXTRACTOR}/{SPLIT}/retrieval_results/img_retrieval_df.pickle"
# dataset_df_path = f"out/{RUN}/{FEATURE_EXTRACTOR}/{SPLIT}/dataset.csv"

# dataset_df = pd.read_csv(dataset_df_path)
# img_retrieval_df = pd.read_pickle(retrieval_df_path)

# display(img_retrieval_df.head())
# display(dataset_df.head())

### Store Results as a row

We want to merge eval results from various runs for comparison

In [None]:
# One-row summary frame identifying this run; the metric columns computed in the
# following cells are appended to it with `assign`.
run_identity = {"run": RUN, "feature_extractor": FEATURE_EXTRACTOR, "split": SPLIT}
eval_df = pd.DataFrame([run_identity])
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full


## Confidence Score Function / Retrievals on mesh level (instead of renders from meshes)

A function that calculates confidence scores for object meshes from the image-level retrievals.

e.g. weighted by rank in the image retrieval, first occurrence, ...  

### Basic - First occurrence in retrieval

In [None]:

# Collapse every image-level retrieval list to mesh level: each mesh is kept only
# at its first occurrence in the list, together with the class predicted at that
# position (the lists are assumed to be ordered by rank, best match first).
mesh_pred_per_query = []
class_pred_per_query = []
for _, im_ret in img_retrieval_df.iterrows():
    # dicts preserve insertion order, so setdefault keeps the first class seen
    # for each mesh and the keys stay in retrieval order.
    first_class_by_mesh = {}
    for mesh, pred_class in zip(im_ret['y_pred_mesh'], im_ret['y_pred_class']):
        first_class_by_mesh.setdefault(mesh, pred_class)
    mesh_pred_per_query.append(np.array(list(first_class_by_mesh.keys())))
    class_pred_per_query.append(np.array(list(first_class_by_mesh.values())))

# Build the frame in one step. Writing cell by cell through chained
# `df[col].at[i] = ...` does not reach the frame when pandas Copy-on-Write is
# active, and an empty retrieval list no longer fails on 2-D column slicing.
mesh_retrieval_df = pd.DataFrame({
    'image': img_retrieval_df['image'],
    'y_true': img_retrieval_df['y_true'],
    'y_pred_class': pd.Series(class_pred_per_query, index=img_retrieval_df.index, dtype=object),
    'y_pred_mesh': pd.Series(mesh_pred_per_query, index=img_retrieval_df.index, dtype=object),
})



## Description / Justification
### The issue with PR-Curve, ROC-AUC, mAP
If we want to calculate the PR curve, ROC-AUC and general mAP without cutting the retrieval results at some value K, we must query **ALL** neighbours for each FAISS query to make sure we retrieve all relevant images. As this needs far too many computational resources for us, **we limit the image similarity search to the number of relevant items N_REL**
* Is there a FAISS search index that allows to retrieve all relevant images or even all neighbours for sure??

## Store constant values for this run

In [None]:
# How many relevant meshes does each class have?
# Group the dataset rows by class label.
group = dataset_df.groupby('class_label')
# Distinct, non-missing 'mesh' values per class; look up with N_REL_MESH[class_label].
# nunique() skips NaN by default, so it matches the former dropna().unique() count
# while avoiding a Python-level GroupBy.apply lambda.
N_REL_MESH = group['mesh'].nunique()
# Number of queries performed (one row of mesh_retrieval_df per query).
N_Q = len(mesh_retrieval_df.index)

## Accuracy

In [None]:
def acc_at_k(y_true, y_pred, k:int=1):
    """Top-k hit indicator for a single query.

    Args:
        y_true: Ground-truth label of the query.
        y_pred: Ranked sequence of predicted labels (first element = rank 1).
        k: Number of leading predictions to inspect.

    Returns:
        1 if ``y_true`` occurs at least once among the first ``k`` entries of
        ``y_pred``, otherwise 0. Empty predictions or ``k == 0`` yield 0.
    """
    top_k_hits = np.asarray(y_pred)[:k] == y_true
    # np.any handles empty input; the former builtin sum() produced a plain
    # Python bool there, which has no .astype and raised AttributeError.
    return int(np.any(top_k_hits))
    
# Count, per cutoff, the queries whose true class shows up among the top-k
# predicted classes, then divide by the number of queries.
top_k_hit_counts = {1: 0, 5: 0, 10: 0}
for true_label, pred_classes in zip(mesh_retrieval_df["y_true"], mesh_retrieval_df["y_pred_class"]):
    for cutoff in top_k_hit_counts:
        top_k_hit_counts[cutoff] += acc_at_k(true_label, pred_classes, cutoff)

score_at_1 = top_k_hit_counts[1]
score_at_5 = top_k_hit_counts[5]
score_at_10 = top_k_hit_counts[10]

acc_at_1, acc_at_5, acc_at_10 = score_at_1 / N_Q, score_at_5 / N_Q, score_at_10 / N_Q

# Append the accuracies to the run summary row.
eval_df = eval_df.assign(acc_at_1=acc_at_1, acc_at_5=acc_at_5, acc_at_10=acc_at_10)
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524


## Precision @ K

In [None]:
def avg_precision_at_k(y_true, y_pred, k, n_rel):
    """Average precision at cutoff ``k`` (AP@k) for a single query.

    Precision@i is summed over every rank ``i <= k`` that holds a relevant
    prediction (``y_pred[i-1] == y_true``) and the sum is divided by
    ``min(k, n_rel)``.

    Args:
        y_true: Ground-truth label of the query.
        y_pred: Ranked sequence of predicted labels (first element = rank 1).
            If it holds fewer than ``k`` entries, the missing ranks count as
            non-relevant.
        k: Cutoff rank.
        n_rel: Number of relevant items that exist for this query.

    Returns:
        The AP@k as a float. 0.0 when ``min(k, n_rel) <= 0`` or when there are
        no predictions (previously a division by zero).
    """
    normaliser = min(k, n_rel)
    if normaliser <= 0:
        return 0.0

    top_k = np.asarray(y_pred)[:k]
    if top_k.size == 0:
        return 0.0

    relevant = np.atleast_1d(top_k == y_true).astype(bool)
    # Running hit count divided by rank gives precision@i for every i at once.
    precision_at_rank = np.cumsum(relevant) / np.arange(1, relevant.size + 1)
    return float((1 / normaliser) * precision_at_rank[relevant].sum())


## Recall @ K

In [None]:
# def avg_recall_at_k(y_true, y_pred, k, n_rel):
#     """ Calculate precision@k for k=1 to k=max_k and.
#         Returns a list of precision@k for all k respectively.
#     """
#     y_pred_arr = np.array(y_pred)
#     y_pred_relevance = (y_pred_arr == y_true)[:k].astype(int)
#     p_at_k = np.array([(sum(y_pred_relevance[:i+1])/(i+1)) for i in range(0, k)])

#     # !NOTE: Is it okay to just fill up with zeros (wrong preds) if we have n_predicitons < K ? 
#     if p_at_k.shape[0] > y_pred_relevance.shape[0]:
#         diff = p_at_k.shape[0] - y_pred_relevance.shape[0]
#         y_pred_relevance = np.concatenate((y_pred_relevance, np.zeros(diff)))
    
#     # NOTE: we calc 1/n_rel instead 1/k because lower scores if all rel items are already retrieved makes no sense (!?)
#     avg_p = (1/min(k,n_rel)) * sum(p_at_k[y_pred_relevance.astype(bool)])
#     return avg_p

### mAP@1

In [None]:
K = 1
# AP@K for every query. The column is built in one assignment instead of chained
# `df[col].at[i] = ...` writes, which do not reach the frame when pandas
# Copy-on-Write is active (the column would then stay NaN).
mesh_retrieval_df[f'ap_at_{K}'] = [
    avg_precision_at_k(y_true, y_pred, k=K, n_rel=N_REL_MESH[y_true])
    for y_true, y_pred in zip(mesh_retrieval_df['y_true'], mesh_retrieval_df['y_pred_class'])
]

# mAP@K: mean of the per-query AP@K values.
ap_at_1 = mesh_retrieval_df[f'ap_at_{K}'].values
map_at_1 = ap_at_1.mean()

eval_df = eval_df.assign(map_at_1=map_at_1)
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10,map_at_1
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524,0.075714


### mAP@5

In [None]:
K = 5
# AP@K for every query. The column is built in one assignment instead of chained
# `df[col].at[i] = ...` writes, which do not reach the frame when pandas
# Copy-on-Write is active (the column would then stay NaN).
mesh_retrieval_df[f'ap_at_{K}'] = [
    avg_precision_at_k(y_true, y_pred, k=K, n_rel=N_REL_MESH[y_true])
    for y_true, y_pred in zip(mesh_retrieval_df['y_true'], mesh_retrieval_df['y_pred_class'])
]

# mAP@K: mean of the per-query AP@K values.
ap_at_5 = mesh_retrieval_df[f'ap_at_{K}'].values
map_at_5 = ap_at_5.mean()

eval_df = eval_df.assign(map_at_5=map_at_5)
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10,map_at_1,map_at_5
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524,0.075714,0.046243


### mAP@10

In [None]:
K = 10
# AP@K for every query. The column is built in one assignment instead of chained
# `df[col].at[i] = ...` writes, which do not reach the frame when pandas
# Copy-on-Write is active (the column would then stay NaN).
mesh_retrieval_df[f'ap_at_{K}'] = [
    avg_precision_at_k(y_true, y_pred, k=K, n_rel=N_REL_MESH[y_true])
    for y_true, y_pred in zip(mesh_retrieval_df['y_true'], mesh_retrieval_df['y_pred_class'])
]

# mAP@K: mean of the per-query AP@K values.
ap_at_10 = mesh_retrieval_df[f'ap_at_{K}'].values
map_at_10 = ap_at_10.mean()

eval_df = eval_df.assign(map_at_10=map_at_10)
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10,map_at_1,map_at_5,map_at_10
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524,0.075714,0.046243,0.035866


### mAP@100

In [None]:
K = 100
# AP@K for every query. The column is built in one assignment instead of chained
# `df[col].at[i] = ...` writes, which do not reach the frame when pandas
# Copy-on-Write is active (the column would then stay NaN).
mesh_retrieval_df[f'ap_at_{K}'] = [
    avg_precision_at_k(y_true, y_pred, k=K, n_rel=N_REL_MESH[y_true])
    for y_true, y_pred in zip(mesh_retrieval_df['y_true'], mesh_retrieval_df['y_pred_class'])
]

# mAP@K: mean of the per-query AP@K values.
ap_at_100 = mesh_retrieval_df[f'ap_at_{K}'].values
map_at_100 = ap_at_100.mean()

eval_df = eval_df.assign(map_at_100=map_at_100)
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10,map_at_1,map_at_5,map_at_10,map_at_100
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524,0.075714,0.046243,0.035866,0.016975


## Average R-Precision (ARP)

R-Precision is the precision @ k, where k is always the number of relevant documents R that can be retrieved for the query. Here we compute the average precision @ R for every query and report its mean over all queries.


In [None]:
# AP@R for every query, where the cutoff R is the number of relevant meshes of
# the query's true class. Computed once per query and stored with whole-column
# assignments; chained `df[col].at[i] = ...` writes do not reach the frame when
# pandas Copy-on-Write is active.
ap_at_r_per_query = [
    avg_precision_at_k(y_true, y_pred, k=N_REL_MESH[y_true], n_rel=N_REL_MESH[y_true])
    for y_true, y_pred in zip(mesh_retrieval_df['y_true'], mesh_retrieval_df['y_pred_class'])
]
mesh_retrieval_df['ap_at_r'] = ap_at_r_per_query
# 'ar_at_r' holds the same values as 'ap_at_r' (it was previously filled by an
# identical second call); the column is kept for downstream compatibility.
mesh_retrieval_df['ar_at_r'] = ap_at_r_per_query

ap_at_r = mesh_retrieval_df['ap_at_r'].values
# NOTE: For K=|Rel| neighbour queries, Precision and Recall are always the same (Break-Even Point)
map_at_r = ap_at_r.mean()

eval_df = eval_df.assign(map_at_r=map_at_r)
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10,map_at_1,map_at_5,map_at_10,map_at_100,map_at_r
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524,0.075714,0.046243,0.035866,0.016975,0.013191


## Average Normalized Discounted Cumulative Gain (nDCG)

In [None]:
def discountedCumulativeGain(pred_scores):
    """Discounted cumulative gain of a ranked list of relevance scores.

    Each score ``rel`` at 1-based rank ``r`` contributes
    ``(2**rel - 1) / log2(r + 1)``.

    Args:
        pred_scores: Relevance scores in ranking order (first element = rank 1).

    Returns:
        The summed DCG; 0 for an empty list.
    """
    # enumerate is 0-based, so rank r corresponds to idx + 1 and the discount
    # log2(r + 1) becomes log2(idx + 2).
    return sum((2**val - 1) / np.log2(idx + 2) for idx, val in enumerate(pred_scores))

def norm_dcg(pred_scores, idcg):
    """Normalized DCG: DCG of ``pred_scores`` divided by the ideal DCG.

    Args:
        pred_scores: Relevance scores in ranking order.
        idcg: DCG of the ideal ranking for the same query and cutoff.

    Returns:
        ``dcg / idcg``, or 0.0 when ``idcg`` is 0 (no relevant item can be
        ranked), instead of producing NaN or a ZeroDivisionError.
    """
    if idcg == 0:
        return 0.0
    return discountedCumulativeGain(pred_scores) / idcg



In [None]:
# Per-query nDCG over the whole retrieval list and at cutoffs 5, 10 and 100.
N_Q = len(mesh_retrieval_df.index)
# Column name -> cutoff; None means "no cutoff" (slice [:None] keeps everything).
ndcg_cutoffs = {'ndcg': None, 'ndcg_at_5': 5, 'ndcg_at_10': 10, 'ndcg_at_100': 100}
ndcg_per_query = {column: [] for column in ndcg_cutoffs}

for y_true, y_pred in zip(mesh_retrieval_df['y_true'], mesh_retrieval_df['y_pred_class']):
    # Binary relevance: 1 where the predicted class equals the true class.
    pred_scores = (np.array(y_pred) == y_true).astype(int)
    n_retrieved = len(pred_scores)
    # Ideal ranking: all relevant items first. Capped at the list length so a
    # class with more relevant meshes than retrieved items no longer asks
    # np.zeros for a negative size.
    n_ideal = min(N_REL_MESH[y_true], n_retrieved)
    idcg_scores = np.concatenate((np.ones(n_ideal), np.zeros(n_retrieved - n_ideal)))
    for column, cutoff in ndcg_cutoffs.items():
        ndcg_per_query[column].append(
            norm_dcg(pred_scores[:cutoff], discountedCumulativeGain(idcg_scores[:cutoff]))
        )

# Whole-column assignment; chained `df[col].at[i] = ...` writes do not reach the
# frame when pandas Copy-on-Write is active.
for column, values in ndcg_per_query.items():
    mesh_retrieval_df[column] = values

# Mean nDCG over all queries (0 when there are no queries).
avg_ndcg = sum(value / N_Q for value in ndcg_per_query['ndcg'])
avg_ndcg_at_5 = sum(value / N_Q for value in ndcg_per_query['ndcg_at_5'])
avg_ndcg_at_10 = sum(value / N_Q for value in ndcg_per_query['ndcg_at_10'])
avg_ndcg_at_100 = sum(value / N_Q for value in ndcg_per_query['ndcg_at_100'])

eval_df = eval_df.assign(avg_ndcg=avg_ndcg, avg_ndcg_at_5=avg_ndcg_at_5, avg_ndcg_at_10=avg_ndcg_at_10,avg_ndcg_at_100=avg_ndcg_at_100,)
display(eval_df.head())


Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10,map_at_1,map_at_5,map_at_10,map_at_100,map_at_r,avg_ndcg,avg_ndcg_at_5,avg_ndcg_at_10,avg_ndcg_at_100
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524,0.075714,0.046243,0.035866,0.016975,0.013191,0.1718,0.069841,0.066533,0.060589


## Mean Reciprocal Rank (MRR)

At which rank does a relevant item occur first

In [None]:
def reciprocal_rank(pred_scores):
    """Reciprocal rank of the best-scoring item in a ranked relevance list.

    Args:
        pred_scores: Relevance scores in ranking order (first element = rank 1).
            For binary scores the result is 1 / rank of the first relevant item.

    Returns:
        ``1 / rank`` of the first position holding the maximum score, or 0.0
        when the list is empty or holds no non-zero score. Without that guard,
        argmax on an all-zero list returns index 0 and the query would wrongly
        score 1.0.
    """
    scores = np.asarray(pred_scores)
    if scores.size == 0 or not np.any(scores):
        return 0.0
    # +1 because index starts at 0
    rank = np.argmax(scores) + 1
    return 1/rank

In [None]:
N_Q = len(mesh_retrieval_df.index)
# Reciprocal rank per query, computed once from the binary relevance vector
# (1 where the predicted class equals the true class).
reciprocal_ranks = [
    reciprocal_rank((np.array(y_pred) == y_true).astype(int))
    for y_true, y_pred in zip(mesh_retrieval_df['y_true'], mesh_retrieval_df['y_pred_class'])
]
# Whole-column assignment; chained `df[col].at[i] = ...` writes do not reach the
# frame when pandas Copy-on-Write is active.
mesh_retrieval_df['reciprocal_rank'] = reciprocal_ranks

# MRR: mean of the per-query reciprocal ranks (0 when there are no queries).
mrr = sum(rr / N_Q for rr in reciprocal_ranks)

eval_df = eval_df.assign(mrr=mrr)
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10,map_at_1,map_at_5,map_at_10,map_at_100,map_at_r,avg_ndcg,avg_ndcg_at_5,avg_ndcg_at_10,avg_ndcg_at_100,mrr
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524,0.075714,0.046243,0.035866,0.016975,0.013191,0.1718,0.069841,0.066533,0.060589,0.208152


# Save Mesh Retrieval Results
(and eval results for each query)

In [None]:
# Write the per-query mesh retrieval results (with per-query metric columns) to CSV.
# NOTE(review): the array-valued prediction columns are written as text; NumPy
# abbreviates long arrays with "..." by default, so confirm that no predictions
# are lost in the saved file.
mesh_retrieval_df_path = (
    f"./out/{RUN}/{FEATURE_EXTRACTOR}/{SPLIT}"
    "/retrieval_results/mesh_retrieval_df.csv"
)
mesh_retrieval_df.to_csv(path_or_buf=mesh_retrieval_df_path)
display(mesh_retrieval_df.head())

Unnamed: 0,image,y_true,y_pred_class,y_pred_mesh,ap_at_1,ap_at_5,ap_at_10,ap_at_100,ap_at_r,ar_at_r,ndcg,ndcg_at_5,ndcg_at_10,ndcg_at_100,reciprocal_rank
0,./data/datasets/mi3dor/test_real/images/radio/...,radio,"[keyboard, keyboard, keyboard, keyboard, keybo...","[keyboard_train_0067, keyboard_train_0057, key...",0.0,0.0,0.0,0.00095,0.00095,0.00095,0.085077,0.0,0.0,0.023772,0.02439
1,./data/datasets/mi3dor/test_real/images/radio/...,radio,"[keyboard, plant, bed, bed, plant, stairs, bed...","[keyboard_train_0031, plant_train_0078, bed_tr...",0.0,0.0,0.0,0.0,0.0,0.0,0.03373,0.0,0.0,0.0,0.006711
2,./data/datasets/mi3dor/test_real/images/radio/...,radio,"[stairs, stairs, bed, keyboard, keyboard, rifl...","[stairs_train_0068, stairs_train_0001, bed_tra...",0.0,0.0,0.0,0.006245,0.003166,0.003166,0.144173,0.0,0.0,0.074265,0.076923
3,./data/datasets/mi3dor/test_real/images/radio/...,radio,"[flower_pot, bed, bed, vase, wardrobe, wardrob...","[flower_pot_train_0022, bed_train_0169, bed_tr...",0.0,0.0,0.0,0.006714,0.00374,0.00374,0.120838,0.0,0.0,0.075412,0.0625
4,./data/datasets/mi3dor/test_real/images/radio/...,radio,"[plant, keyboard, keyboard, guitar, flower_pot...","[plant_train_0190, keyboard_train_0068, keyboa...",0.0,0.0,0.0,0.001349,0.000358,0.000358,0.049716,0.0,0.0,0.033252,0.022222


# Save eval results as csv

In [None]:
# Write the one-row metric summary of this run to CSV (the frame index is
# written too, as with the default `to_csv` settings).
eval_df_path = (
    f"./out/{RUN}/{FEATURE_EXTRACTOR}/{SPLIT}"
    "/retrieval_results/eval_results.csv"
)
eval_df.to_csv(path_or_buf=eval_df_path)
display(eval_df.head())

Unnamed: 0,run,feature_extractor,split,acc_at_1,acc_at_5,acc_at_10,map_at_1,map_at_5,map_at_10,map_at_100,map_at_r,avg_ndcg,avg_ndcg_at_5,avg_ndcg_at_10,avg_ndcg_at_100,mrr
0,mi3dor_trainsynth_testreal_resize,dino_vitb16_in1k,traintest_full,0.075714,0.188095,0.261524,0.075714,0.046243,0.035866,0.016975,0.013191,0.1718,0.069841,0.066533,0.060589,0.208152
