In [239]:
import json
import numpy as np
import pandas as pd

In [240]:
def load_scores(mat_path, matches_path):
    n_queries = len(json.load(open(matches_path)))
    mat = np.fromfile(mat_path, dtype=np.float32)[4:]
    n_targets = len(mat) // n_queries
    return mat.reshape(n_queries, n_targets)    

In [None]:
def load_files(match_path, distractor_mat_path, prob_mat_path, features_path):
    prob_features = json.load(open(features_path))
    distractor_mat = load_scores(distractor_mat_path, match_path)
    prob_mat = load_scores(prob_mat_path, match_path)
    np.fill_diagonal(prob_mat, prob_mat.min() - 1) # self dist is not interesting!
    prob_ids = np.array(prob_features['id'])
    return prob_mat, distractor_mat, prob_ids

In [None]:
def dist_fuse(mat_1, mat_2):
    return (mat_1 + mat_2) / 2

In [373]:
match_path = "results/2020-02-15/matches_facescrub_megaface_2020-02-15_1000000_1.json"
distractor_mat_path = "results/2020-02-15/otherFiles/facescrub_megaface_2020-02-15_1000000_1.bin"
prob_mat_path = "results/2020-02-15/otherFiles/facescrub_facescrub_2020-02-15.bin"
features_path = "results/2020-02-15/otherFiles/facescrub_features_2020-02-15"

prob_features = json.load(open(features_path))
distractor_mat = load_scores(distractor_mat_path, match_path)
prob_mat = load_scores(prob_mat_path, match_path)
np.fill_diagonal(prob_mat, prob_mat.min() - 1) # self dist is not interesting!
tot_mat = np.concatenate((prob_mat, distractor_mat), axis=1)
prob_ids = np.array(prob_features['id'])

In [214]:
def get_gallary():
    samples = pd.DataFrame([prob_ids, np.arange(len(prob_ids))]).T.groupby(0).apply(lambda x: x.sample(frac=.5).index[0]).sort_values()
    return samples.values, samples.index

In [374]:
rank1_(prob_mat, distractor_mat, prob_ids)

0.9674149740665954

In [None]:
def rank1_scores(prob_mat, distractor_mat, prob_ids):
    """ 
        is the target id in the top 1 (not including self image)
        (prob_mat assuming diag is nan)
        prob_ids are a ground truth of ids ordered probmat entries
    """
    
    out_max = np.max(distractor_mat, axis=1) # best out of distractor set
    np.fill_diagonal(prob_mat, prob_mat.min() - 1)
    tot_mat = np.concatenate((prob_mat, out_max.reshape(3506, 1)), axis=1) # working with a small matrix :)
    return np.max(tot_mat, axis=1)

In [371]:
def rank1_(prob_mat, distractor_mat, prob_ids):
    """ 
        is the target id in the top 1 (not including self image)
        (prob_mat assuming diag is nan)
        prob_ids are a ground truth of ids ordered probmat entries
    """
    
    out_max = np.max(distractor_mat, axis=1) # best out of distractor set
    tot_mat = np.concatenate((prob_mat, out_max.reshape(3506, 1)), axis=1) # working with a small matrix :)
    n_probs = prob_mat.shape[1]
    inf_ = tot_mat.min() - 1
    score = 0
    n_query = 0 
    for query_id in set(prob_ids):
        # probes from iden
        row_mask = list((query_id == prob_ids))
        curr_id_mat = tot_mat[row_mask, :]
        
        n_id_probs, n_probs_and_one_other = curr_id_mat.shape
        query_indices = np.argwhere(row_mask).T[0]
        for row_i, col_i in enumerate(query_indices):
            # nullify all other cols from indices
            col_mask = row_mask.copy()
            col_mask[col_i] = False
            col_mask.append(False)

            other_probs_sub = curr_id_mat[: , col_mask].copy()
            curr_id_mat[: , col_mask] = inf_

            # calc hits
            top_hits = np.delete(np.argmax(curr_id_mat, axis=1), row_i ,0)
            top_id = np.where(top_hits < n_probs, top_hits, 0)
            top_id = np.where(top_hits < n_probs, prob_ids[top_id], 'NaN')
            score += sum(top_id == query_id)
            n_query += len(top_id)

            # get them back
            curr_id_mat[:, col_mask] = other_probs_sub
            
    return score/n_query

In [380]:
def rank10_(prob_mat, distractor_mat, prob_ids):
    """ 
        is the target id in the top 1 (not including self image)
        (prob_mat assuming diag is nan)
        prob_ids are a ground truth of ids ordered probmat entries
    """
    
    tot_mat = np.concatenate((prob_mat, distractor_mat), axis=1) # working with a small matrix :)
    n_probs = prob_mat.shape[1]
    inf_ = tot_mat.min() - 1
    score = 0
    n_query = 0 
    for query_id in set(prob_ids):
        # probes from iden
        row_mask = list((query_id == prob_ids))
        curr_id_mat = tot_mat[row_mask, :]
        
        n_id_probs, n_probs_and_one_other = curr_id_mat.shape
        query_indices = np.argwhere(row_mask).T[0]
        for row_i, col_i in enumerate(query_indices):
            # nullify all other cols from indices
            col_mask = row_mask.copy()
            col_mask[col_i] = False
            col_mask.extend([False]*distractor_mat.shape[1])

            other_probs_sub = curr_id_mat[: , col_mask].copy()
            curr_id_mat[: , col_mask] = inf_

            # calc hits
            top_hits = np.argpartition(curr_id_mat, -10, axis=1)[:, -10:]
            # top hit might be wrong
            top_correct = np.where(top_hits < len(prob_ids), top_hits, np.nan).astype(np.int)
            # TODO from here
            score += sum([np.isin(true_id, prob_ids[top_hit_row]) for top_hit_row,true_id in zip(top_correct, prob_ids)])
            n_query += len(top_id)

            # get them back
            curr_id_mat[:, col_mask] = other_probs_sub
            
    return score/n_query

In [76]:
def rank10(tot_mat, prob_ids):
    """
        is the target id in the top 10 (not including self image)
        prob_mat assuming diag is nan
        prob_ids are a ground truth of ids ordered probmat entries
    """
    
    # works great from here do not touch :)
    top_hits = np.argpartition(tot_mat, -10, axis=1)[:, -10:]
    # top hit might be wrong
    top_correct = np.where(top_hits < len(prob_ids), top_hits, np.nan).astype(np.int)
    return sum([np.isin(true_id, prob_ids[top_hit_row]) for top_hit_row,true_id in zip(top_correct, prob_ids)]) / len(prob_ids)

In [381]:
rank10_(prob_mat, distractor_mat, np.array(prob_features['id']))

IndexError: index -9223372036854775808 is out of bounds for axis 0 with size 3506

In [None]:
def open_set_label_and_score(tot_mat, prob_ids):
    """
         What is the 2nd iden for the prob if he is not in set?
         What is it's score?
    """
    # works great from here do not touch :)
    trail = tot_mat.shape[1]-len(prob_ids)
    id_set = set(prob_ids)
    res_label = []
    res_score = []
    inf_ = tot_mat.min() - 1
    for curr_id in id_set:
        id_mask = (prob_ids == curr_id)
        curr_queries = tot_mat[id_mask]
        curr_queries[:, np.concatenate((id_mask, np.zeros(trail).astype(bool)))] = inf_
        
        top_scores_for_id_queries = np.max(curr_queries, axis=1)
        top_hits_for_id_queries = np.argmax(curr_queries, axis=1)
        top_correct = np.where(top_hits_for_id_queries < len(prob_ids), top_hits_for_id_queries, 0)
        top_2nd_probe_id_or_none = np.where(top_hits_for_id_queries < len(prob_ids), top_correct, np.nan).astype(np.int)
        res_label.extend(top_2nd_probe_id_or_none)
        res_score.extend(top_scores_for_id_queries)
    return res_label, res_score

In [None]:
def open_set_label_and_score_(prob_mat, distractor_mat, prob_ids):
    """
         What is the 2nd iden for the prob if he is not in set?
         What is it's score?
    """
    open_best_score = np.max(distractor_mat, axis=1)
    open_best_ind = np.argmax(distractor_mat, axis=1)
    tot_mat = np.concatenate((prob_mat, out_max.reshape(3506, 1)), axis=1) # working with a small matrix :)
    
    id_set = set(prob_ids)
    res_label = []
    res_score = []
    inf_ = tot_mat.min() - 1
    for curr_id in id_set:
        # probes from iden
        row_mask = list((curr_id == prob_ids))
        curr_id_mat = tot_mat[row_mask, :]
        
        n_id_probs, n_probs_and_one_other = curr_id_mat.shape
        query_indices = np.argwhere(row_mask).T[0]

        # nullify all query cols from indices
        col_mask = row_mask.copy()
        col_mask.append(False)

        other_probs_sub = curr_id_mat[: , col_mask].copy()
        curr_id_mat[: , col_mask] = inf_

        # calc hits
        top_hits = np.argmax(curr_id_mat, axis=1)
        top_hits = np.argmax(curr_id_mat, axis=1)
        top_id = np.where(top_hits < n_probs, top_hits, 0)
        top_id = np.where(top_hits < n_probs, prob_ids[top_id], open_best_ind)
        
        
        # get them back
        curr_id_mat[:, col_mask] = other_probs_sub



        id_mask = list((prob_ids == curr_id))
        curr_queries = tot_mat[id_mask]
        curr_queries[:, np.concatenate((id_mask, np.zeros(trail).astype(bool)))] = inf_
        
        top_scores_for_id_queries = np.max(curr_queries, axis=1)
        top_hits_for_id_queries = np.argmax(curr_queries, axis=1)
        top_correct = np.where(top_hits_for_id_queries < len(prob_ids), top_hits_for_id_queries, 0)
        top_2nd_probe_id_or_none = np.where(top_hits_for_id_queries < len(prob_ids), top_correct, np.nan).astype(np.int)
        res_label.extend(top_2nd_probe_id_or_none)
        res_score.extend(top_scores_for_id_queries)
    return res_label, res_score

In [None]:
labels, scores = open_set_label_and_score(tot_mat, np.array(prob_features['id']))

In [None]:
# TODO open set correlations between 2 models
"""
    for m1 in range(modelsNb):
        for m2 in range(m1+1, modelsNb):
            a = np.sum((open_set_1st_labels[m1] == open_set_1st_labels[m2]))
            b = len(query_label[0])
            print('Corr=', model_names[m1], model_names[m2], a, b, a/b)
"""

In [None]:
def fuse_models(probe_mat_0, distractor_mat_0, probe_mat1, distractor_mat_1, prob_ids):
    top1_scores_0 = top1_scores(probe_mat_0, distractor_mat_0, prob_ids)
    top1_scores_1 = top1_scores(probe_mat_1, distractor_mat_1, prob_ids)
    
    open_set_1st_labels_0, open_set_1st_scores_0  = open_set_label_and_score(probe_mat_0, distractor_mat_0, prob_ids)
    open_set_1st_labels_1, open_set_1st_scores_1 = open_set_label_and_score(probe_mat_1, distractor_mat_1, prob_ids)
    
    a = np.sum((open_set_1st_labels_0 == open_set_1st_labels_1))
    b = len(prob_ids)
    print('  Corr=%0.1f%%' % (100.0*a / b) )

    for target_FTR in target_FTRS:
        for TH in np.arange(0.3, 0.9, 0.00001):
            FTR = np.sum((open_set_1st_labels[m] == open_set_1st_labels[m+1]) & (open_set_1st_scores[m] > TH) & (open_set_1st_scores[m+1] > TH)) / b
            if math.isclose(FTR, target_FTR, abs_tol = 0.0005):
                TTR = np.sum((top1_scores[m] > TH) & (top1_scores[m+1] > TH)) / b
                print('   FTR=%0.1f%%, TTR=%0.1f%%' % (FTR * 100, TTR * 100))
                break