In [1]:
import numpy as np
import pandas as pd

import pickle
from collections import Counter
from scipy.stats import kendalltau
from ast import literal_eval

In [3]:
# Load the precomputed CNN Image Retrieval features (queries and database) from CSV.
# get_embeddings() reads the 'embedding' column of both frames; the query frame's
# 'image_name' column is reused when the per-query scores are written out.
query_embedding_df = pd.read_csv('../../Embeddings/CNN_Image_Retrieval/caltech101_700-query-features.csv')
db_embedding_df    = pd.read_csv('../../Embeddings/CNN_Image_Retrieval/caltech101_700-dataset-features.csv')

In [4]:
def get_embeddings(query_embeddings, dataset_embeddings):
    """Parse the stringified embeddings of two DataFrames into NumPy arrays.

    Parameters
    ----------
    query_embeddings : pandas.DataFrame
        Frame with an 'embedding' column whose cells are strings holding a
        Python literal (parsed with ``ast.literal_eval``). No other column
        is required.
    dataset_embeddings : pandas.DataFrame
        Frame with the same 'embedding' column layout.

    Returns
    -------
    tuple of numpy.ndarray
        ``(q_embeddings, db_embeddings)``; row ``i`` of each array is the
        parsed embedding of the ``i``-th row (positional order) of the
        corresponding frame.
    """
    print('Generating query embedding array')

    # Iterate the column once instead of materialising a row Series per
    # element with .iloc[i][...]; values come out in the same positional order.
    q_embeddings = np.array(
        [literal_eval(raw_embedding) for raw_embedding in query_embeddings['embedding']]
    )

    print('Generating dataset embedding_array')

    db_embeddings = []
    for item_id, raw_embedding in enumerate(dataset_embeddings['embedding']):
        # Progress marker every 1000 rows; parsing is the slow part.
        if item_id % 1000 == 0:
            print(item_id)
        db_embeddings.append(literal_eval(raw_embedding))

    db_embeddings = np.array(db_embeddings)

    return q_embeddings, db_embeddings


In [5]:
def intersection_over_union(items):
    """Return |intersection| / |union| over all collections in ``items``.

    ``items`` is an indexable, sliceable sequence of 1-D collections (for
    example the rows of a 2-D array). The first collection seeds both
    accumulators; every following one narrows the running intersection and
    widens the running union.
    """
    seed, remaining = items[0], items[1:]

    shared = np.array(seed)
    combined = np.array(seed)

    for candidate in remaining:
        shared = np.intersect1d(shared, candidate)
        combined = np.union1d(combined, candidate)

    return len(shared) / len(combined)


In [6]:
def unmask(query_emb, db_emb, num_removed_features = 100, num_iterations = 10, num_top_items = 10):
    """Score each query by how stable its top-k retrieval is under feature removal.

    For every query, the database rows are ranked by dot product with the
    query and the indexes of the best ``num_top_items`` rows are recorded.
    Between rankings, the element-wise product of the query with each top-k
    row is taken, every feature column of that product is correlated
    (Kendall tau) with the rank positions ``0..k-1``, and the
    ``num_removed_features`` columns with the highest tau are deleted from
    both the query and the working copy of the database.

    Parameters
    ----------
    query_emb : numpy.ndarray
        2-D array, one query per row.
    db_emb : numpy.ndarray
        2-D array, one database item per row, with the same number of
        columns as ``query_emb``. It is copied per query, never modified.
    num_removed_features : int
        How many feature columns to delete after each ranking.
    num_iterations : int
        How many rankings to record per query.
    num_top_items : int
        Size of the recorded top-k (fewer if the database is smaller).

    Returns
    -------
    numpy.ndarray
        One value per query: ``intersection_over_union`` of the top-k index
        sets recorded over all iterations.
    """
    num_queries  =  len(query_emb)

    scores = []

    for q_idx in range(num_queries):
        print("num query: {}".format(q_idx))
        temp_db_emb = db_emb.copy()
        current_query = query_emb[q_idx,:].copy()

        query_results = []

        for i in range(num_iterations):
            dot_prod = np.dot(current_query, temp_db_emb.T)

            # Retrieve the indexes of the top k results (highest dot product first).
            top_k_idx = np.argsort(-dot_prod)[:num_top_items]
            query_results.append(top_k_idx)

            # Nothing is ranked after the last iteration, so the (expensive)
            # correlation pass and the feature deletion would be wasted work.
            if i == num_iterations - 1:
                break

            # Per-feature contribution to the dot product, for the top-k rows
            # only; the rest of the database is never looked at here.
            sub_hadamard = temp_db_emb[top_k_idx] * current_query

            # Use the actual top-k length: it is shorter than num_top_items
            # when the database has fewer rows, and kendalltau requires
            # equal-length inputs.
            rank_positions = np.arange(len(top_k_idx))

            # Kendall tau of each feature column against the rank positions.
            correlations = np.array(
                [
                    kendalltau(sub_hadamard[:, feature_idx], rank_positions)[0]
                    for feature_idx in range(sub_hadamard.shape[1])
                ],
                dtype=float,
            )

            # kendalltau yields NaN for a constant column (e.g. a feature that
            # is zero for every top-k item). NaN sort keys make the order
            # arbitrary, so treat "no defined correlation" as the lowest
            # possible value: such features are removed last.
            correlations[np.isnan(correlations)] = -np.inf

            # Highest tau first; the stable sort keeps ties in feature-index order.
            # NOTE(review): descending tau against positions 0..k-1 removes first
            # the features whose contribution grows toward the bottom of the
            # top-k — confirm this is the intended direction.
            remove_feature_idx = np.argsort(-correlations, kind='stable')[:num_removed_features]

            # Remove the selected features from both the query and the database.
            current_query = np.delete(current_query, remove_feature_idx)
            temp_db_emb = np.delete(temp_db_emb, remove_feature_idx, axis = 1)

        query_results = np.array(query_results)
        scores.append(intersection_over_union(query_results))

    scores = np.array(scores)

    return scores


In [7]:
from itertools import product 
def grid_parameters(parameters):
    """Yield one ``{name: value}`` dict per point of the Cartesian grid.

    ``parameters`` maps each parameter name to an iterable of candidate
    values; every combination of one value per name is produced, in the
    order given by ``itertools.product``.
    """
    names = parameters.keys()
    for combination in product(*parameters.values()):
        yield {name: value for name, value in zip(names, combination)}

In [8]:
# Parameter grid passed to grid_parameters(): each key is a keyword argument of
# unmask() and maps to the list of values to try for it.
# Original setting is 100 / 15 / 100 (presumably in the key order below — TODO confirm).
params = {
'num_removed_features' : [100],
'num_iterations' : [15],
'num_top_items' : [100]}

In [9]:
# Parse the 'embedding' strings of both frames into NumPy arrays
# (first result: queries, second result: database items).
query_emb, db_emb = get_embeddings(query_embedding_df,db_embedding_df)

Generating query embedding array
Generating dataset embedding_array
0
1000
2000
3000
4000
5000
6000
7000
8000


In [10]:
# Run unmask() once per parameter combination and save the per-query scores.
# NOTE(review): the CSV path is the same for every combination, so with a grid of
# more than one combination each run overwrites the previous file.
for args in grid_parameters(params):
    print(args)
    scores = unmask(query_emb, db_emb, **args)
    # One score per query row, paired with that row's 'image_name' under the 'path' column.
    unmask_df = pd.DataFrame({'path' : query_embedding_df['image_name'] ,'score': scores})
    unmask_df.to_csv('../../Results/unmasking-cnnimageretrieval-caltech101_700.csv',index=False)
    # Next step: run /notebooks/Comparisons/Compare-Unmasking.ipynb

{'num_removed_features': 100, 'num_iterations': 15, 'num_top_items': 100}
num query: 0
num query: 1
num query: 2
num query: 3
num query: 4
num query: 5
num query: 6
num query: 7
num query: 8
num query: 9
num query: 10
num query: 11
num query: 12
num query: 13
num query: 14
num query: 15
num query: 16
num query: 17
num query: 18
num query: 19
num query: 20
num query: 21
num query: 22
num query: 23
num query: 24
num query: 25
num query: 26
num query: 27
num query: 28
num query: 29
num query: 30
num query: 31
num query: 32
num query: 33
num query: 34
num query: 35
num query: 36
num query: 37
num query: 38
num query: 39
num query: 40
num query: 41
num query: 42
num query: 43
num query: 44
num query: 45
num query: 46
num query: 47
num query: 48
num query: 49
num query: 50
num query: 51
num query: 52
num query: 53
num query: 54
num query: 55
num query: 56
num query: 57
num query: 58
num query: 59
num query: 60
num query: 61
num query: 62
num query: 63
num query: 64
num query: 65
num query: 6