# Instance-based Vote Count Prediction (VCP) for New Images

In [1]:
# Libraries
import os
import time
import timeit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from skimage.io import imread
import sklearn
from sklearn.decomposition import PCA

## Constants

In [2]:
# Paths
PROJECT_ROOT = os.path.join('..', '..')
RESULTS_FOLDER_PATH = os.path.join(PROJECT_ROOT, 'results')
SIM_MXS_FOLDER_PATH = os.path.join(RESULTS_FOLDER_PATH, 'matrices')
FEATS_FOLDER_PATH = os.path.join(RESULTS_FOLDER_PATH, 'features')
VOTES_FILE_PATH = os.path.join(RESULTS_FOLDER_PATH, 'votes_summary.csv')
IMAGES_FOLDER_PATH = os.path.join(PROJECT_ROOT, 'imgs')

In [3]:
# Execution parameters
DATA_ID = 'color_hist' # ['incv3_feats', 'incv1_feats', 'color_hist']
SIM_METRIC_ID = 'cosine' # ['euclid', 'cosine']
FULL_ID = DATA_ID + '_' + SIM_METRIC_ID
TIMER_ID = 'timeit' # ['timeit', 'time']
K_RANGE = [1, 3, 5, 10, 20] # list-like
PERF_METRIC_ID = 'rmse' # ['rmse', 'manhattan']
SEED = 42 # Highly recommended to leave it at 42

In [4]:
# Additional configuration
tictoc = timeit.default_timer if TIMER_ID == 'timeit' else time.time

## Loading Data

In [5]:
def gen_file_paths(feat_type, sim_type, feats_folder_path, sim_mx_folder_path):
    # Gen features file path
    feats_file_name = f'{feat_type}.csv'
    feats_file_path = os.path.join(feats_folder_path, feats_file_name)
    # Gen. sim. matrix. file path
    sim_mx_file_name = f'{feat_type}_{sim_type}_sim_matrix.csv'
    sim_mx_file_path = os.path.join(sim_mx_folder_path, sim_mx_file_name)
    return feats_file_path, sim_mx_file_path

In [6]:
# Features and similarity matrix files
FEATS_FILE_PATH, SIM_MX_FILE_PATH = gen_file_paths(DATA_ID, SIM_METRIC_ID, FEATS_FOLDER_PATH, SIM_MXS_FOLDER_PATH)

#### Features

In [7]:
feats_df = pd.read_csv(FEATS_FILE_PATH, index_col=0)
feats_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
1222__pool_table__0.9999995.jpg,178,51,43,49,37,40,54,57,57,54,...,8,5,12,9,13,14,12,12,7,51
1328__coil__0.99999607.jpg,47,39,66,118,112,134,143,164,194,230,...,97,114,127,188,211,172,121,90,61,186
134__zebra__0.9999949.jpg,0,0,1,1,4,4,7,5,12,8,...,34,17,40,14,25,12,2,4,2,13


#### Data (Sim. Matrix between images)

In [8]:
sim_mx_df = pd.read_csv(SIM_MX_FILE_PATH, index_col=0)
sim_mx_df.head(3)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,0.0,0.263312,0.88851,0.215065,0.708969,0.767865,0.593644,0.222727,0.321178,0.553927,...,0.70074,0.451223,0.756252,0.625707,0.478438,0.32144,0.365356,0.57988,0.406442,0.522883
1328__coil__0.99999607.jpg,0.263312,0.0,0.772453,0.206157,0.569966,0.711564,0.474664,0.143471,0.456048,0.385842,...,0.576,0.27986,0.58696,0.484427,0.384185,0.352028,0.156797,0.468385,0.323888,0.377533
134__zebra__0.9999949.jpg,0.88851,0.772453,0.0,0.847642,0.896885,0.874984,0.799678,0.882276,0.803249,0.855287,...,0.810574,0.835181,0.543752,0.838339,0.611558,0.871917,0.645131,0.813053,0.791438,0.744133


#### Votes

In [9]:
votes_df = pd.read_csv(VOTES_FILE_PATH, index_col=0)
votes_df.head(3)

Unnamed: 0,ig,lime,xrai,anchor,best
1222__pool_table__0.9999995.jpg,12,13,3,1,lime
1328__coil__0.99999607.jpg,17,4,3,2,ig
134__zebra__0.9999949.jpg,14,1,8,2,ig


Here's a sanity check for vote proportion in our the dataset. In the original XAI-CBR paper, vote proportion was like this:
- IG: 45%
- XRAI: 30%
- LIME: 18%
- ANCHOR: 7%

Also, IG was the most voted technique, at least by hard voting aggregation, with a majority of 62% images.


In [10]:
votes_per_technique = votes_df[['ig', 'xrai', 'lime', 'anchor']].sum()
total_votes = votes_per_technique.sum()
votes_per_technique / total_votes

ig        0.488315
xrai      0.271713
lime      0.183467
anchor    0.056505
dtype: float64

There's a slight variation of these proportions with respect to ones presented in the paper. It seems like some votes from XRAI and ANCHOR techniques drifted out to the IG technique. We'll check this out later, this should not be of great importance in the experiments of this notebook.

## Data Preprocessing

In [11]:
X = sim_mx_df.values # Values from sim. matrix
X_names = sim_mx_df.index.values # Names of every image
y = votes_df.values[:, :4] # Vote count for each imae
best = votes_df.values[:, -1] # Most voted technique for each image

In [12]:
print(X.shape, X_names.shape, y.shape, best.shape)

(198, 198) (198,) (198, 4) (198,)


#### Instance deletion
Stratified Subsampling cannot be performed onto the dataset because only one instance is best explained with ANCHOR. Due to the very small importance of that instance in the dataset, we will continue without that instance (i.e. we will find that instance and remove it from the dataset).

In [13]:
# At what index is the anchor instance located?
anchor_idxs = np.argwhere(best == 'anchor')[0]
anchor_idxs

array([155], dtype=int64)

In [14]:
# What's the name of that image and its associated technique?
X_names[anchor_idxs], best[anchor_idxs]

(array(['2411942__zebra__0.99999654.jpg'], dtype=object),
 array(['anchor'], dtype=object))

In [15]:
# Delete that instance from all data partitions (X, y, etc.)
X = np.delete(X, anchor_idxs, axis=0)
X = np.delete(X, anchor_idxs, axis=1) # Twice in sim. matrix (both rows and columns)
X_names = np.delete(X_names, anchor_idxs, axis=0)
y = np.delete(y, anchor_idxs, axis=0)
best = np.delete(best, anchor_idxs, axis=0)

In [16]:
print(X.shape, X_names.shape, y.shape, best.shape)

(197, 197) (197,) (197, 4) (197,)


## Splitting and Fold Creation

In [17]:
from sklearn.model_selection import StratifiedShuffleSplit as SSS
from sklearn.model_selection import ShuffleSplit as SS

In [18]:
# Change this constant to toogle stratified sampling on/off
STRATIFIED = True

In [19]:
# Perform split
splitter = None
if STRATIFIED: splitter = SSS(n_splits=5, test_size=0.2, random_state=42)
else: splitter = SS(n_splits=5, test_size=0.2, random_state=42)
splits = splitter.split(X, best)
splits = list(splits)

In [20]:
splits[0]

(array([192, 147, 177,  11, 140,  51, 127, 118, 172, 191,  62, 124, 115,
         80, 190, 142,  92,  69,  25,  14,  42,   3, 185,  90,  10,  76,
        176, 114,  44,  98, 166, 121,  79, 170,   1, 183,  28,  31, 155,
         75, 156, 101, 171,  13, 110, 122,  38,  27, 136,  20,   6,  56,
         35,  59, 139,  33,  78,  82,  21, 167, 117,  12,  49,  15,   5,
        152, 132,  81,  61, 163, 175,  91,   7, 174, 135,  74, 193, 129,
         60,  96,  50, 161, 159, 145, 126,  19,  65, 188,  73,  89, 133,
        179,  40,  86, 112,  26, 168, 189, 149,  94, 194,  18, 138, 169,
        102,  97,  71, 130,  53,  99, 148, 154,   8,  34, 182, 105,  55,
         95, 153,  72, 144,  77,  52,  30,   9,  37,   4,  93, 128, 137,
        195, 160, 111,  45, 164, 151,  29,  48,  70,  43,  57, 157,  39,
        141,  85, 150,  67,   0,  47, 113,  32,  17, 131, 180,  66, 100,
        186], dtype=int64),
 array([ 54, 187, 103,  23, 104, 108, 181,  64, 109, 134,  16, 146,   2,
        116, 106, 119, 

## Vote Count Prediction (using feat prototypes and vote prototypes)

# I NEED TO CALCULATE THE DISTANCES, NOT TAKE THEM!!!!!

In [21]:
def dist_to_prt(p1, p2, metric):
    if metric=='euclid': return np.linalg.norm(p1-p2)
    if metric=='cosine': return 1 - (np.dot(p1, p2)/(np.linalg.norm(p1) * np.linalg.norm(p2)))
    else: raise Exception(f'Unknown metric type : {metric}')

def calc_dist_to_train_imgs(test_img_feats, train_feats, metric):
    distances = []
    for train_img_feats in train_feats:
        dist = dist_to_prt(test_img_feats, train_img_feats, metric)
        distances.append(dist)
    return np.array(distances)

def get_nearest_instances_indices(dist_to_train_imgs, train_idxs, k):
    if k >= len(dist_to_train_imgs): nearest_train_idxs = train_idxs
    else:
        nearest_train_idxs = []
        for i in range(k): # K times...
            nearest_train_idx, min_dist = None, np.inf
            # ...iterate searching the nearest image
            for dist, train_idx in zip(dist_to_train_imgs, train_idxs):
                if train_idx in nearest_train_idxs: continue # ignore prev. found nearest clusters
                if dist < min_dist: nearest_train_idx, min_dist = train_idx, dist
            nearest_train_idxs.append(nearest_train_idx)
        nearest_train_idxs = np.array(nearest_train_idxs)
    return nearest_train_idxs

def get_indiv_vote_predictions(split, image_votes, image_feats, image_names, metric, k):
    vote_predictions = {}
    query_times = {}
    # Prepare data
    train_idxs = split[0]
    test_idxs = split[1]
    # For each test image...
    for test_img_idx in test_idxs:
        start = tictoc()
        # Get distances from test img to each train image
        dist_to_train_imgs = calc_dist_to_train_imgs(image_feats[test_img_idx], image_feats[train_idxs], metric) # X[test_img_idx, train_idxs]
        # Using those distances, find the nearest k clusters
        kn_train_idxs =  get_nearest_instances_indices(dist_to_train_imgs, train_idxs, k)
        # Aggregate the vote counts associated with those instances
        nearest_vote_counts = [image_votes[train_idx] for train_idx in kn_train_idxs]
        unrounded_vcp = np.average(np.array(nearest_vote_counts, np.float64), axis=0)
        vote_count_prediction = np.round(unrounded_vcp, 0).astype(np.int8) # int parsing needed?
        # Associate an image's name with its vote prediction
        test_img_name = image_names[test_img_idx]
        vote_predictions[test_img_name] = vote_count_prediction
        # Calculate query time
        end = tictoc()
        query_time = end - start
        query_times[test_img_name] = query_time
    return vote_predictions, query_times

def get_global_vote_predictions(splits, image_votes, image_feats, image_names, metric, k):
    '''Maps indices of test folds/splits with the vote count predictions generated'''
    global_vote_predictions = {}
    global_query_times = {}
    for split_idx in range(len(splits)):
        # Generate predictions and query times.
        vote_preds,query_times = get_indiv_vote_predictions(splits[split_idx], image_votes, image_feats, image_names, metric, k)
        # Create { split_idx: vote_predictions } and { split_idx : query_times } pairs
        global_vote_predictions[split_idx] = vote_preds
        global_query_times[split_idx] = query_times
    return global_vote_predictions, global_query_times

In [22]:
def batch_get_predictions(k_range):
    batch_predictions, batch_query_times = {}, {}
    for k in k_range:
        preds, query_times = get_global_vote_predictions(splits, y, feats_df.values, X_names, SIM_METRIC_ID, k)
        batch_predictions[k] = preds
        batch_query_times[k] = query_times
    return batch_predictions, batch_query_times

### Generating predictions

In [23]:
k_range = K_RANGE # Substitute if neccesary
batch_results = batch_get_predictions(k_range)
batch_global_predictions, batch_global_query_times = batch_results

In [24]:
# For k=5, in the test split #0, show generated predictions
batch_global_predictions[5][0]

{'2388889__hotdog__0.99999714.jpg': array([9, 3, 4, 0], dtype=int8),
 '2417881__zebra__0.9999945.jpg': array([5, 4, 4, 0], dtype=int8),
 '2403403__banana__0.9999926.jpg': array([9, 1, 4, 1], dtype=int8),
 '2381941__zebra__0.9999914.jpg': array([6, 3, 4, 1], dtype=int8),
 '2403741__zebra__0.99999523.jpg': array([7, 2, 3, 2], dtype=int8),
 '2404281__zebra__0.999998.jpg': array([7, 1, 4, 3], dtype=int8),
 '2416627__zebra__0.9999987.jpg': array([8, 2, 4, 2], dtype=int8),
 '2391964__flamingo__1.0.jpg': array([6, 4, 3, 1], dtype=int8),
 '2404583__umbrella__0.99999297.jpg': array([5, 3, 5, 1], dtype=int8),
 '2409637__four-poster__0.99999464.jpg': array([7, 3, 4, 1], dtype=int8),
 '2380669__parking_meter__0.9999993.jpg': array([6, 2, 5, 0], dtype=int8),
 '2411196__crane__0.9999995.jpg': array([6, 2, 3, 1], dtype=int8),
 '134__zebra__0.9999949.jpg': array([6, 3, 4, 1], dtype=int8),
 '2405905__traffic_light__0.99999535.jpg': array([7, 3, 5, 1], dtype=int8),
 '2404127__zebra__0.9999933.jpg': arra

In [25]:
# For k=5, in the test split #0, show query times
batch_global_query_times[5][0]

{'2388889__hotdog__0.99999714.jpg': 0.007143499999999747,
 '2417881__zebra__0.9999945.jpg': 0.007079299999999122,
 '2403403__banana__0.9999926.jpg': 0.0071256999999995685,
 '2381941__zebra__0.9999914.jpg': 0.007161099999999365,
 '2403741__zebra__0.99999523.jpg': 0.007027900000000642,
 '2404281__zebra__0.999998.jpg': 0.007281800000001226,
 '2416627__zebra__0.9999987.jpg': 0.007002500000000467,
 '2391964__flamingo__1.0.jpg': 0.007065300000000718,
 '2404583__umbrella__0.99999297.jpg': 0.0069512000000013785,
 '2409637__four-poster__0.99999464.jpg': 0.00710589999999911,
 '2380669__parking_meter__0.9999993.jpg': 0.007008400000000137,
 '2411196__crane__0.9999995.jpg': 0.007530500000001439,
 '134__zebra__0.9999949.jpg': 0.006993700000000658,
 '2405905__traffic_light__0.99999535.jpg': 0.00834040000000158,
 '2404127__zebra__0.9999933.jpg': 0.008501799999999449,
 '2406857__zebra__0.9999894.jpg': 0.008614599999999584,
 '2414277__zebra__0.9999908.jpg': 0.006957899999999739,
 '2385298__parking_meter

## Metric Evaluation

### Vote Accuracy (RMSE, Manhattan, etc...)

In [26]:
def calc_vote_distance(p1, p2, metric):
    if metric == 'rmse': return np.sum(np.square(p1 - p2))
    elif metric == 'manhattan': return np.sum(np.abs(p1 - p2))
    else: raise Exception(f'Unknown performance metric : {metric}')

def eval_indiv_vote_predictions(vote_predictions, perf_metric):
    vote_distances = []
    for img_name, vote_pred in vote_predictions.items():
        # Fetch real votes and compare with vote predictions
        real_votes = votes_df.loc[img_name].values[:4]
        distance = calc_vote_distance(real_votes, vote_pred, perf_metric)
        vote_distances.append(distance)
    vote_distances = np.array(vote_distances)
    if perf_metric=='rmse': metrics = {'rmse': round(np.sqrt(np.average(vote_distances)), 2)}
    elif perf_metric=='manhattan': metrics = {'manhattan': (np.average(vote_distances), 2)}
    else: raise Exception(f'Unknown performance metric : {perf_metric}')
    return metrics

def eval_global_vote_predictions(global_vote_predictions, perf_metric):
    global_metrics = {}
    # Calculate metrics for each split
    for split_idx, vote_predictions in global_vote_predictions.items():
        global_metrics[split_idx] = eval_indiv_vote_predictions(vote_predictions, perf_metric)
    # Aggregate metrics for all splits
    global_metrics['global'] = {}
    for metric_type in global_metrics[0].keys():
        metrics_per_type = [metrics[metric_type] for split_key, metrics in global_metrics.items() if split_key != 'global']
        avgd_metrics_per_type = np.round(np.average(np.array(metrics_per_type), axis=0), 2)
        global_metrics['global'][metric_type] = avgd_metrics_per_type
    return global_metrics

### Query Times

In [27]:
def eval_indiv_query_times(query_times):
    times = [query_time for _, query_time in query_times.items()]
    avg_time = np.average(np.array(times))
    return avg_time

def eval_global_query_times(global_query_times):
    global_metrics = {}
    for split_idx, query_times in global_query_times.items():
        global_metrics[split_idx] = eval_indiv_query_times(query_times)
    avg_times = [avg_time for _, avg_time in global_metrics.items()]
    global_metrics['global'] = np.average(np.array(avg_times))
    return global_metrics

### Main Code

In [28]:
def evaluate_results(gbl_vote_preds, gbl_query_times, perf_metric):
    vote_metrics = eval_global_vote_predictions(gbl_vote_preds, perf_metric)
    query_metrics = eval_global_query_times(gbl_query_times)
    return {
        'vote_metrics': vote_metrics,
        'query_metrics': query_metrics
    }

In [29]:
def batch_evaluate_results(batch_gbl_vote_preds, batch_gbl_query_times, perf_metric):
    batch_results = {}
    for k_idx in batch_gbl_vote_preds.keys():
        results = evaluate_results(batch_gbl_vote_preds[k_idx], batch_gbl_query_times[k_idx], perf_metric)
        batch_results[k_idx] = results
    return batch_results

### Compute Both Vote Distances and Query Times

In [30]:
batch_global_results = batch_evaluate_results(batch_global_predictions, batch_global_query_times, perf_metric=PERF_METRIC_ID)

#### Display Results

In [31]:
def display_batch_results(batch_gbl_results, full_id):
    print('DISPLAYING RESULTS')
    print(f'FOR {full_id}')
    print('-------------------------')
    for k_idx, gbl_results in batch_gbl_results.items():
        seconds = gbl_results["query_metrics"]["global"]
        microseconds = round(seconds * 10**6)
        print(f'FOR K = {k_idx}')
        print(f'Avg. Accuracy : {gbl_results["vote_metrics"]["global"]}')
        print(f'Avg. Query Time : {microseconds} microseconds')
        print('-------------------------------------')

In [32]:
# Watch out! Query times can change!
display_batch_results(batch_global_results, FULL_ID)

DISPLAYING RESULTS
FOR color_hist_cosine
-------------------------
FOR K = 1
Avg. Accuracy : {'rmse': 6.39}
Avg. Query Time : 6934 microseconds
-------------------------------------
FOR K = 3
Avg. Accuracy : {'rmse': 5.04}
Avg. Query Time : 6944 microseconds
-------------------------------------
FOR K = 5
Avg. Accuracy : {'rmse': 4.7}
Avg. Query Time : 7056 microseconds
-------------------------------------
FOR K = 10
Avg. Accuracy : {'rmse': 4.48}
Avg. Query Time : 7368 microseconds
-------------------------------------
FOR K = 20
Avg. Accuracy : {'rmse': 4.39}
Avg. Query Time : 8232 microseconds
-------------------------------------


Additional note: Average query times are similar when using both time and timeit.

In [33]:
tictoc

<function time.perf_counter>