# Cluster-based Vote Count Prediction (VCP) for New Images

In [1]:
# Libraries
import os
import time
import timeit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from skimage.io import imread
import sklearn
from sklearn.decomposition import PCA

## Constants

In [2]:
# Paths
PROJECT_ROOT = os.path.join('..', '..')
RESULTS_FOLDER_PATH = os.path.join(PROJECT_ROOT, 'results')
SIM_MXS_FOLDER_PATH = os.path.join(RESULTS_FOLDER_PATH, 'matrices')
FEATS_FOLDER_PATH = os.path.join(RESULTS_FOLDER_PATH, 'features')
IMAGES_FOLDER_PATH = os.path.join(PROJECT_ROOT, 'imgs')

In [129]:
# Execution parameters
DATA_ID = 'incv3'
SIM_METRIC_ID = 'euclid'
FULL_ID = DATA_ID + '-' + SIM_METRIC_ID
VOTE_PRT_TYPE = 'avg-votes' # ['avg-votes']
FEAT_PRT_TYPE = 'avg-feats' # ['avgf-feats', 'pca']
VOTE_PRED_TYPE = 'prt-based' # ['prt-based', 'prtless', 'pca-proj']
# K_RANGE = [1, 3, 5, 7, 11] # list-like
SEED = 42 # Highly recommended to leave it at 42

## Loading Data

In [4]:
# Similarity matrix
SIM_MX_FILE_PATH = os.path.join('..', '..', 'results', 'matrices', 'incv3_feats_euclid_sim_matrix.csv')

In [5]:
# Casted votes for each image
VOTES_FILE_PATH = os.path.join('..', '..', 'results', 'votes_summary.csv')

In [6]:
# Image features
FEATS_FILE_PATH = os.path.join('..', '..', 'results', 'features', 'incv3_feats.csv')

#### Data (Sim. Matrix between images)

In [7]:
sim_mx_df = pd.read_csv(SIM_MX_FILE_PATH, index_col=0)
sim_mx_df.head(3)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,0.0,24.89917,22.871903,25.031346,23.751015,25.26349,23.017009,23.726696,26.78418,23.866859,...,23.893972,28.554533,8.689397,23.648353,23.595179,24.177,23.629427,26.545261,27.091866,23.200351
1328__coil__0.99999607.jpg,24.89917,0.0,17.500986,18.181155,17.596692,19.09994,17.112792,17.488145,20.532969,17.789077,...,17.571292,23.570307,21.901239,16.987964,17.437562,19.139305,17.123877,19.577529,20.261407,16.637508
134__zebra__0.9999949.jpg,22.871903,17.500986,0.0,17.436307,7.257223,7.579433,6.056684,16.007572,19.907645,6.56977,...,5.660697,22.369222,19.397503,15.556074,16.074305,16.874761,16.384309,17.144394,17.840449,14.699963


#### Votes

In [8]:
votes_df = pd.read_csv(VOTES_FILE_PATH, index_col=0)
votes_df.head(3)

Unnamed: 0,ig,lime,xrai,anchor,best
1222__pool_table__0.9999995.jpg,12,13,3,1,lime
1328__coil__0.99999607.jpg,17,4,3,2,ig
134__zebra__0.9999949.jpg,14,1,8,2,ig


Here's a sanity check for vote proportion in our the dataset. In the original XAI-CBR paper, vote proportion was like this:
- IG: 45%
- XRAI: 30%
- LIME: 18%
- ANCHOR: 7%

Also, IG was the most voted technique, at least by hard voting aggregation, with a majority of 62% images.


In [9]:
votes_per_technique = votes_df[['ig', 'xrai', 'lime', 'anchor']].sum()
total_votes = votes_per_technique.sum()
votes_per_technique / total_votes

ig        0.488315
xrai      0.271713
lime      0.183467
anchor    0.056505
dtype: float64

There's a slight variation of these proportions with respect to ones presented in the paper. It seems like some votes from XRAI and ANCHOR techniques drifted out to the IG technique. We'll check this out later, this should not be of great importance in the experiments of this notebook.

#### Features

In [10]:
feats_df = pd.read_csv(FEATS_FILE_PATH, index_col=0)
feats_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
1222__pool_table__0.9999995.jpg,0.830638,0.109613,0.027107,0.056101,0.123813,0.364547,0.354803,0.331838,0.018715,0.060355,...,1.167887,0.901976,0.245693,0.714952,0.24512,0.052343,0.192633,0.7122,0.008294,0.290266
1328__coil__0.99999607.jpg,0.291597,0.385156,0.047256,0.071147,0.002932,0.182395,0.019829,0.046854,0.122627,0.061063,...,0.029534,0.404313,0.059962,0.778392,0.700241,0.813751,0.42485,0.686327,0.696242,0.441459
134__zebra__0.9999949.jpg,1.088536,0.071769,0.176484,0.168589,0.050061,0.311524,0.057534,0.080351,0.880801,0.554862,...,0.007061,0.055804,0.03767,0.016626,0.224138,0.075299,0.143866,0.851481,0.390153,0.008928


## Data Preprocessing

In [11]:
X = sim_mx_df.values # Values from sim. matrix
X_names = sim_mx_df.index.values # Names of every image
y = votes_df.values[:, :4] # Vote count for each imae
best = votes_df.values[:, -1] # Most voted technique for each image

In [12]:
print(X.shape, X_names.shape, y.shape, best.shape)

(198, 198) (198,) (198, 4) (198,)


#### Instance deletion
Stratified Subsampling cannot be performed onto the dataset because only one instance is best explained with ANCHOR. Due to the very small importance of that instance in the dataset, we will continue without that instance (i.e. we will find that instance and remove it from the dataset).

In [13]:
# At what index is the anchor instance located?
anchor_idxs = np.argwhere(best == 'anchor')[0]
anchor_idxs

array([155], dtype=int64)

In [14]:
# What's the name of that image and its associated technique?
X_names[anchor_idxs], best[anchor_idxs]

(array(['2411942__zebra__0.99999654.jpg'], dtype=object),
 array(['anchor'], dtype=object))

In [15]:
# Delete that instance from all data partitions (X, y, etc.)
X = np.delete(X, anchor_idxs, axis=0)
X = np.delete(X, anchor_idxs, axis=1) # Twice in sim. matrix (both rows and columns)
X_names = np.delete(X_names, anchor_idxs, axis=0)
y = np.delete(y, anchor_idxs, axis=0)
best = np.delete(best, anchor_idxs, axis=0)

In [16]:
print(X.shape, X_names.shape, y.shape, best.shape)

(197, 197) (197,) (197, 4) (197,)


## Splitting and Fold Creation

In [17]:
from sklearn.model_selection import StratifiedShuffleSplit as SSS
from sklearn.model_selection import ShuffleSplit as SS

In [18]:
# Change this constant to toogle stratified sampling on/off
STRATIFIED = True

In [19]:
# Perform split
splitter = None
if STRATIFIED: splitter = SSS(n_splits=5, test_size=0.2, random_state=42)
else: splitter = SS(n_splits=5, test_size=0.2, random_state=42)
splits = splitter.split(X, best)
splits = list(splits)

In [20]:
splits[0]

(array([192, 147, 177,  11, 140,  51, 127, 118, 172, 191,  62, 124, 115,
         80, 190, 142,  92,  69,  25,  14,  42,   3, 185,  90,  10,  76,
        176, 114,  44,  98, 166, 121,  79, 170,   1, 183,  28,  31, 155,
         75, 156, 101, 171,  13, 110, 122,  38,  27, 136,  20,   6,  56,
         35,  59, 139,  33,  78,  82,  21, 167, 117,  12,  49,  15,   5,
        152, 132,  81,  61, 163, 175,  91,   7, 174, 135,  74, 193, 129,
         60,  96,  50, 161, 159, 145, 126,  19,  65, 188,  73,  89, 133,
        179,  40,  86, 112,  26, 168, 189, 149,  94, 194,  18, 138, 169,
        102,  97,  71, 130,  53,  99, 148, 154,   8,  34, 182, 105,  55,
         95, 153,  72, 144,  77,  52,  30,   9,  37,   4,  93, 128, 137,
        195, 160, 111,  45, 164, 151,  29,  48,  70,  43,  57, 157,  39,
        141,  85, 150,  67,   0,  47, 113,  32,  17, 131, 180,  66, 100,
        186], dtype=int64),
 array([ 54, 187, 103,  23, 104, 108, 181,  64, 109, 134,  16, 146,   2,
        116, 106, 119, 

## Clustering (using DBSCAN)

In [21]:
clusterable_params = []

In [22]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [23]:
def get_sim_mx_subset(sim_mx_values, filter_idxs):
    return sim_mx_values.take(filter_idxs, axis=0).take(filter_idxs, axis=1)

In [24]:
def fit_dbscan_sim_mx(data, min_samples, eps_values, 
               min_no_clusters=5, max_no_clusters=np.inf,
               min_clust_instances=None, min_clust_instances_pct=0.85,
               max_clust_instances=np.inf):
    '''Performs several DBSCAN clustering runs according to a parameter search grid (using m and epsilon).
    Also, verifies different clustering conditions and calculates sil. score of all valid clustering runs.'''
    # Condition precalculation
    if min_clust_instances_pct: # If % was defined
        min_clust_instances = round(data.shape[0] * min_clust_instances_pct)
    elif not min_clust_instances: # Else, if nominal amount was not specified
        min_clust_instances = 100
    # Code
    scores, clusters, instances = [], [], []
    for m in min_samples:
        row_scores, row_clusters, row_instances = [], [], []
        for e in eps_values:
            db = DBSCAN(min_samples=m, eps=e, metric='precomputed').fit(data)
            # Get only non anomalous instances and indices
            non_a = db.labels_ != -1 # [False, ..., False] if all are outliers
            non_a_idxs = np.argwhere(non_a==True)
            non_a_idxs = non_a_idxs.reshape(non_a_idxs.shape[0]) # flatten matrix to 1D
            # Calculate conditions
            n_clusters = len(np.unique(db.labels_[non_a])) # 0 if all are outliers
            n_instances = len(db.labels_[non_a]) # 0 if all are outliers
            # Apply conditions
            valid_n_clusters = n_clusters >= min_no_clusters and n_clusters <= max_no_clusters
            valid_n_cl_instances = n_instances >= min_clust_instances and n_instances <= max_clust_instances
            if (valid_n_clusters and valid_n_cl_instances):
                # Calculate silhouette score without noise points (i.e. anomalous instances)
                non_a_sim_mx = get_sim_mx_subset(data, non_a_idxs)
                score = silhouette_score(non_a_sim_mx, db.labels_[non_a], metric='precomputed')
            else:
                score = None
            # Store results
            row_scores.append(score)
            row_clusters.append(n_clusters)
            row_instances.append(n_instances)
        # Store row results
        scores.append(row_scores)
        clusters.append(row_clusters)
        instances.append(row_instances)
    # Prepare and return values
    ms_axis = pd.Index(min_samples, name='Min_samples')
    eps_axis = pd.Index(eps_values, name='Epsilon')
    df_scores = pd.DataFrame(scores, index=ms_axis, columns=eps_axis)
    df_clusters = pd.DataFrame(clusters, index=ms_axis, columns=eps_axis)
    df_instances = pd.DataFrame(instances, index=ms_axis, columns=eps_axis)
    return df_scores, df_clusters, df_instances

In [25]:
def print_results(m, eps, scores_df, instances_df, clusters_df):
    '''Given clustering results and specific values for m and epsilon parameters,
    displays further info. about the clustering obtained using those parameters.
    For interactive and visual use only'''
    score = round(scores_df.loc[m][eps], 4)
    instances = instances_df.loc[m][eps]
    clusters = clusters_df.loc[m][eps]
    print(f'DBSCAN using parameters m={m} and eps={eps} yields the next clustering results:')
    print()
    print(f'- Sil. score: {score}')
    print(f'- {instances} clustered instances into {clusters} clusters')
    print(f'- Avg. of {round(instances/clusters, 2)} instances per cluster')

In [26]:
# What's the equivalence in instances of different clustered proportions of the dataset?
for i in np.arange(0.7, 0.95, 0.05):
    print(f'{round(i, 2)}% : about {round(X[splits[0][0]].shape[0] * i, 2)} instances')

0.7% : about 109.9 instances
0.75% : about 117.75 instances
0.8% : about 125.6 instances
0.85% : about 133.45 instances
0.9% : about 141.3 instances


#### Split #0

In [27]:
X_split_0 = get_sim_mx_subset(X, splits[0][0])
X_split_0.shape

(157, 157)

In [28]:
dfs, dfc, dfi = fit_dbscan_sim_mx(X_split_0, range(2, 8), range(10, 18))
dfs

Epsilon,10,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,,0.525697,0.51783,0.496012,0.469331,0.349045,0.333957,0.284066
3,,,,0.493045,0.467718,0.342455,0.328979,
4,,,,,0.467807,0.344684,0.401042,
5,,,,,0.467807,0.344684,0.401042,
6,,,,,,0.340594,,
7,,,,,,,,


In [29]:
print_results(2, 11, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=11 yields the next clustering results:

- Sil. score: 0.5257
- 134 clustered instances into 15 clusters
- Avg. of 8.93 instances per cluster


In [30]:
clusterable_params.append([2, 11, 0])

#### Split #1

In [31]:
X_split_1 = get_sim_mx_subset(X, splits[1][0])
X_split_1.shape

(157, 157)

In [32]:
dfs, dfc, dfi = fit_dbscan_sim_mx(X_split_1, range(2, 8), range(10, 18))
dfs

Epsilon,10,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,,0.552471,0.543729,0.509018,0.512489,0.384647,0.346903,0.364139
3,,,,,0.510243,0.405753,0.34257,0.407832
4,,,,,,0.412814,0.421066,0.407832
5,,,,,,0.412814,0.421066,0.407832
6,,,,,,,,
7,,,,,,,,


In [33]:
print_results(2, 11, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=11 yields the next clustering results:

- Sil. score: 0.5525
- 133 clustered instances into 16 clusters
- Avg. of 8.31 instances per cluster


In [34]:
clusterable_params.append([2, 11, 1])

#### Split #2

In [35]:
X_split_2 = get_sim_mx_subset(X, splits[2][0])
X_split_2.shape

(157, 157)

In [36]:
dfs, dfc, dfi = fit_dbscan_sim_mx(X_split_2, range(2, 8), range(10, 18))
dfs

Epsilon,10,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,,0.534224,0.524801,0.501986,0.399634,0.362913,0.340106,0.292511
3,,,,0.499575,0.395962,0.369744,0.420016,
4,,,,,0.412159,0.369744,0.420016,
5,,,,,0.412159,0.373198,0.420016,
6,,,,,,0.365788,,
7,,,,,,,,


In [37]:
print_results(2, 11, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=11 yields the next clustering results:

- Sil. score: 0.5342
- 133 clustered instances into 15 clusters
- Avg. of 8.87 instances per cluster


In [38]:
clusterable_params.append([2, 11, 2])

#### Split #3

In [39]:
X_split_3 = get_sim_mx_subset(X, splits[3][0])
X_split_3.shape

(157, 157)

In [40]:
dfs, dfc, dfi = fit_dbscan_sim_mx(X_split_3, range(2, 8), range(10, 18))
dfs

Epsilon,10,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,,0.535411,0.528561,0.506424,0.389468,0.348317,0.341853,0.301573
3,,,,0.501141,0.405223,0.344428,0.41796,
4,,,,,,0.344428,0.41796,
5,,,,,,0.341843,,
6,,,,,,,,
7,,,,,,,,


In [41]:
print_results(2, 11, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=11 yields the next clustering results:

- Sil. score: 0.5354
- 134 clustered instances into 15 clusters
- Avg. of 8.93 instances per cluster


In [42]:
clusterable_params.append([2, 11, 3])

#### Split #4

In [43]:
X_split_4 = get_sim_mx_subset(X, splits[4][0])
X_split_4.shape

(157, 157)

In [44]:
dfs, dfc, dfi = fit_dbscan_sim_mx(X_split_4, range(2, 8), range(10, 18))
dfs

Epsilon,10,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,,0.529118,0.5229,0.518945,0.478969,0.416594,0.358288,0.344744
3,,,,,0.483984,0.429327,0.35474,
4,,,,,,0.429938,0.426047,
5,,,,,,0.429938,0.426047,
6,,,,,,,0.426047,
7,,,,,,,,


In [45]:
print_results(2, 11, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=11 yields the next clustering results:

- Sil. score: 0.5291
- 133 clustered instances into 15 clusters
- Avg. of 8.87 instances per cluster


In [46]:
clusterable_params.append([2, 11, 4])

#### Clusterable parameters for each split

In [47]:
# [m, epsilon, split_idx]
clusterable_params

[[2, 11, 0], [2, 11, 1], [2, 11, 2], [2, 11, 3], [2, 11, 4]]

## Clustering Results

In [48]:
def get_indiv_clustering_results(params, split, image_names):
    '''Returns a dictionary mapping the name of an image with an integer
    representing the cluster it belongs to'''
    # Fetch data
    train_idxs = split[0]
    sim_mx_subset = get_sim_mx_subset(X, train_idxs)
    img_names = image_names[train_idxs]
    # Perform clustering
    dbscan = DBSCAN(min_samples=params[0], eps=params[1], metric='precomputed')
    dbscan = dbscan.fit(sim_mx_subset)
    # Generate {img_name : label} mapping
    name_label_map = {name: label for name, label in zip(img_names, dbscan.labels_)}
    return name_label_map

def get_global_clustering_results(params_set, splits, image_names):
    '''Returns a dictionary mapping the split index of every parameter set
    in the 'params_set' arg. with the clustering results generated with that parameter set'''
    results = {}
    for params in params_set:
        split_idx = params[2]
        # Create { split_idx: cluster_labels} pair
        results[split_idx] = get_indiv_clustering_results(params, splits[split_idx], image_names)
    return results

In [49]:
global_cl_results = get_global_clustering_results(clusterable_params, splits, X_names)

In [50]:
global_cl_results

{0: {'4573__barrel__0.9999974.jpg': -1,
  '2411372__parking_meter__0.999995.jpg': 0,
  '2415910__zebra__0.9999962.jpg': 1,
  '2380017__zebra__0.9999995.jpg': 1,
  '2410410__ski__0.99999356.jpg': 2,
  '2387305__traffic_light__1.0.jpg': 3,
  '2408884__zebra__0.9999913.jpg': 1,
  '2406581__zebra__0.9999939.jpg': 1,
  '2415102__zebra__0.9999876.jpg': 1,
  '4534__viaduct__0.9999877.jpg': -1,
  '2391862__broccoli__0.99999714.jpg': 4,
  '2408592__goose__0.999998.jpg': -1,
  '2405479__traffic_light__0.9999939.jpg': 3,
  '2396034__remote_control__0.9999856.jpg': -1,
  '4339__manhole_cover__0.99999416.jpg': -1,
  '2410779__parking_meter__0.99999917.jpg': -1,
  '2401383__slug__0.9999933.jpg': -1,
  '2392579__zebra__0.9999969.jpg': 1,
  '2382183__pizza__0.99998593.jpg': 5,
  '2380319__broccoli__0.9999957.jpg': -1,
  '2385461__zebra__0.99998415.jpg': 1,
  '2377471__pizza__0.9999988.jpg': 5,
  '2417421__parking_meter__0.9999999.jpg': 0,
  '2401217__traffic_light__0.9999895.jpg': 3,
  '2379489__parki

In [53]:
# Sanity check: Number of elements should be the same as clusters detected in clustering phase
for split_idx in global_cl_results.keys(): print(len(np.unique(list(global_cl_results[split_idx].values())))-1)

15
16
15
15
15


It was 15, 16, 15, 15, 15. Clustering results is fine.

## Clustering Prototypes

In our experiment, we want to predict the vote count for a new image, based on the proximity it has to the avaliable clusters. These clusters are composed of many data points, so the proximity of a new data point to a cluster can be measured in different ways, like taking the distance between the new point and the nearest clustered point in the dataset.   
However, this approach can be biased when new poins get associated to the cluster taking in account the nearest point of a cluster instead of the overall position of a cluster. To avoid this, for each cluster we calculate a "prototype", a data point which is the centroid of all the data points in a cluster. This way, we can measure the distance to the general position of a cluster in a more confident way.

### Vote prototypes (Solution of cases)

In [54]:
# TODO: SI there another to calculate vote prototypes than to just average the votes
# casted for every image in a specific cluster? 

def gen_indiv_vote_prototypes(cl_result, votes_df, vote_prt_type, ignore_noise=True):
    '''Maps the integer labels of every cluster with its vote prototype'''
    # Separate image votes according to the clusters they belong to
    votes_by_cluster = {}
    for img_name, cl_idx in cl_result.items():
        if ignore_noise and cl_idx == -1: continue # ignore noise cluster
        img_votes = votes_df.loc[img_name].values[:-1] # fetch vote columns (but not "best technique" column)
        if cl_idx not in votes_by_cluster.keys(): votes_by_cluster[cl_idx] = [img_votes]
        else: votes_by_cluster[cl_idx].append(img_votes)
    # For each cluster, calculate their vote prototype
    vote_prts_by_cluster = {}
    for cl_idx, cl_votes in votes_by_cluster.items():
        unrounded_prt = np.average(np.array(cl_votes, 'uint8'), axis=0) # load with uint8 to prevent unwanted floating points
        # Create { cluster_idx : vote_prototype } pairs
        vote_prts_by_cluster[cl_idx] = np.array(np.round(unrounded_prt), 'int')
    return vote_prts_by_cluster
    
def get_global_vote_prototypes(cl_results, votes_df, vote_prt_type, ignore_noise=True):
    '''Maps indices of folds/splits used to generate clusterings with the vote prototypes
    of the different clusters generated'''
    global_vote_prototypes = {}
    for i, cl_result in cl_results.items():
        # Create { split_idx : vote_prototypes_per_cluster } pairs
        global_vote_prototypes[i] = gen_indiv_vote_prototypes(cl_result, votes_df, vote_prt_type, ignore_noise)
    return global_vote_prototypes

### Feature Prototypes (Description of cases)

#### Default Feature Prototyping (Average of features)

In [55]:
def gen_averaged_feat_prototypes(cl_result, feats_df, ignore_noise=True):
    # Separate image votes according to the clusters they belong to
    feats_by_cluster = {}
    for img_name, cl_idx in cl_result.items():
        if ignore_noise and cl_idx == -1: continue # ignore noise cluster
        img_feats = feats_df.loc[img_name].values # fetch features of image
        if cl_idx not in feats_by_cluster.keys(): feats_by_cluster[cl_idx] = [img_feats]
        else: feats_by_cluster[cl_idx].append(img_feats)
    # For each cluster, calculate their feature prototype
    feat_prts_by_cluster = {}
    for cl_idx, cl_feats in feats_by_cluster.items():
        unrounded_prt = np.average(np.array(cl_feats), axis=0)
        feat_prts_by_cluster[cl_idx] = unrounded_prt
        # print(feat_prts_by_cluster[cl_idx])
    return feat_prts_by_cluster

#### PCA Feature Prototyping

In [56]:
def gen_pca_feat_prototypes(cl_result, images_folder_path, ignore_noise=True):
    # For each cluster:
        # Load images (using img_name in cl_result and images folder path)
        # Flatten images and condense them into a matrix
        # Apply PCA over the matrix
        # Store the trained PCA object as a feature prototype with a "cluster_idx" key
    # return pca_objs_per_cluster
    pass

#### General code

In [113]:
def gen_indiv_feat_prototypes(cl_result, feat_prt_type, ignore_noise=True, **kwargs):
    '''Maps the integer labels of every cluster with its feature prototype'''
    # Execute feature prototyping code according to selected prototyping type
    if feat_prt_type == 'avg-feats':
        feat_prts_per_cluster = gen_averaged_feat_prototypes(cl_result,
                                                             feats_df = kwargs['feats_df'],
                                                             ignore_noise=ignore_noise)
    elif feat_prt_type == 'pca':
        feat_prts_per_cluster = gen_pca_feat_prototypes(cl_result,
                                                        images_folder_path = kwargs['images_folder_path'],
                                                        ignore_noise=ignore_noise)
    else:
        raise Exception(f'Unknown feature prot. type : {feat_prt_type}')
    return feat_prts_per_cluster
    
def get_global_feat_prototypes(cl_results, feat_prt_type, ignore_noise=True, **kwargs):
    '''Maps indices of folds/splits used to generate clusterings with the feat. prototypes
    of the different clusters generated'''
    global_feat_prototypes = {}
    for split_idx, cl_result in cl_results.items():
        # Create { split_idx : feat_prototypes_per_cluster } pairs
        global_feat_prototypes[split_idx] = gen_indiv_feat_prototypes(cl_result, feat_prt_type, ignore_noise, **kwargs)
    return global_feat_prototypes

In [112]:
# Warning: This code access variables defined previously without referencing them in its arguments.
def get_prototypes(vote_prt_type, feat_prt_type):
    # Calculate vote prototypes
    vote_prts = None
    if vote_prt_type=='avg-votes':
        vote_prts = get_global_vote_prototypes(global_cl_results, votes_df, vote_prt_type)
    else: raise Exception(f'Unknown vote prot. type : {vote_prt_type}')
    # Calculate feature prototypes
    feat_prts = None
    if feat_prt_type=='avg-feats':
        feat_prts = get_global_feat_prototypes(global_cl_results, FEAT_PRT_TYPE, feats_df=feats_df)
    elif feat_prt_type=='pca':
        feat_prts = get_global_feat_prototypes(global_cl_results, FEAT_PRT_TYPE, images_folder_path=IMAGES_FOLDER_PATH)
    else: raise Exception(f'Unknown feature prot. type : {feat_prt_type}')
    # Return prototypes
    return vote_prts, feat_prts

### Calculate both prototypes

In [61]:
global_vote_prototypes, global_feat_prototypes = get_prototypes(VOTE_PRT_TYPE, FEAT_PRT_TYPE)

In [62]:
# Sanity check: No. of elements should be the same as no. of clusters detected in clustering phase
global_vote_prototypes[3]

{0: array([5, 4, 4, 0]),
 1: array([7, 2, 5, 1]),
 2: array([8, 2, 3, 2]),
 3: array([ 8, 10,  2,  0]),
 4: array([7, 2, 6, 0]),
 5: array([8, 2, 2, 1]),
 6: array([7, 4, 2, 0]),
 7: array([9, 2, 1, 2]),
 8: array([6, 5, 3, 0]),
 9: array([6, 3, 4, 0]),
 10: array([6, 5, 2, 1]),
 11: array([11,  2,  6,  0]),
 12: array([8, 0, 4, 0]),
 13: array([5, 2, 5, 0]),
 14: array([8, 2, 2, 0])}

In [63]:
# Sanity check: No. of elements should be the same as no. of clusters detected in clustering phase
if global_feat_prototypes is not None:
    print(global_feat_prototypes[3]) # float64

{0: array([0.0824641 , 0.21308363, 0.20724915, ..., 0.48475512, 0.29120767,
       0.23847589]), 1: array([1.37623495, 0.10536327, 0.06911659, ..., 0.7257839 , 0.24584723,
       0.11383936]), 2: array([0.05132507, 0.38199278, 0.25109835, ..., 0.17329019, 0.86275237,
       0.06237488]), 3: array([0.71722362, 0.09781803, 0.02666388, ..., 0.5039281 , 0.01320606,
       0.45912951]), 4: array([0.38268466, 0.20299782, 0.58064171, ..., 0.18404097, 0.06476327,
       0.16339689]), 5: array([0.08024365, 0.1204319 , 0.12581458, ..., 0.12010398, 0.03365019,
       0.41241854]), 6: array([0.33168829, 0.29847374, 0.37979215, ..., 0.4577658 , 0.16855236,
       0.18571813]), 7: array([0.05421157, 0.70195364, 0.04683721, ..., 1.08174687, 0.55553089,
       0.15072539]), 8: array([0.18724329, 0.09193769, 0.45493382, ..., 1.33935047, 0.21779195,
       0.2928563 ]), 9: array([0.48219949, 0.23743852, 0.23221617, ..., 0.29231778, 0.59580591,
       0.10933533]), 10: array([0.03560127, 0.19514388, 0.29

## Vote Count Prediction (using feat prototypes and vote prototypes)

#### Helper functions

In [103]:
def dist_to_prt(p1, p2, metric):
    if metric=='euclid': return np.linalg.norm(p1-p2)
    if metric=='cosine': return 1 - (np.dot(p1, p2)/(np.linalg.norm(p1) * np.linalg.norm(p2)))
    else: raise Exception(f'Unknown metric type : {metric}')

### Prototype-based VCP

In [114]:
def get_distances_to_feat_prts(test_img_name, feats_df, gbl_feat_prts, split_idx, metric, **kwargs):
    # Prepare data
    test_feats = feats_df.loc[test_img_name].values
    feat_prts = gbl_feat_prts[split_idx]
    # Calculate distances
    distances_per_feat_prt = {}
    for prt_idx, prt_feats in feat_prts.items():
        distances_per_feat_prt[prt_idx] = dist_to_prt(test_feats, prt_feats, metric)
    return distances_per_feat_prt

### PCA-based VCP

In [66]:
def get_pca_proj_distances(test_img_name, images_folder_path, gbl_feat_prts, split_idx, **kwargs):
    pass

### Prototypeless VCP

In [67]:
def get_img_idxs_per_cluster(cl_results, ignore_noise=True):
    img_idxs_per_cluster = {}
    for img_name, cl_idx in cl_results.items():
        if cl_idx==-1 and ignore_noise: continue # ignore noise cluster
        img_idx = np.argwhere(X_names == img_name)[0][0]
        if cl_idx not in img_idxs_per_cluster.keys():
            img_idxs_per_cluster[cl_idx] = [img_idx]
        else:
            img_idxs_per_cluster[cl_idx].append(img_idx)
    return img_idxs_per_cluster

def get_intercluster_distances(test_img_name, feats_df, cl_results, split_idx, metric, **kwargs):
    pass

### Main code

In [106]:
def get_nearest_prototypes_indices(dist_to_prototypes, k):
    '''Given a dictionary between feature prototypes indices and the distance to a point,
    return indices of nearest prototypes (i.e. those with the lowest distances).
    Indices of prototypes are NOT guaranteed to be ordered in terms of closeness'''
    # Return existing prototypes if k is higher than no. of prototypes
    if k >= len(dist_to_prototypes): return list(dist_to_prototypes.keys())
    # Else, find nearest prototypes
    nearest_prts_idxs = []
    # Iterate k times...
    for i in range(k):
        nearest_prt_idx, min_dist = None, np.inf
        # ...searching the next nearest prototype
        for prt_idx, dist in dist_to_prototypes.items():
            if prt_idx in nearest_prts_idxs: continue # ignore prev. found nearest prototypes
            if dist < min_dist: nearest_prt_idx, min_dist = prt_idx, dist
        nearest_prts_idxs.append(nearest_prt_idx)
    return nearest_prts_idxs

def gen_prediction(test_img_name, vote_prts, k, vote_pred_type, **kwargs):
    # Calculate distances to each cluster/prototype depending on vote_pred_type
    if vote_pred_type == 'prt-based':
        dist_to_prototypes = get_distances_to_feat_prts(test_img_name, **kwargs)
    elif vote_pred_type == 'pca-based':
        dist_to_prototypes = get_pca_proj_distances(test_img_name, **kwargs)
    elif vote_pred_type == 'prtless':
        dist_to_prototypes = get_intercluster_distances(test_img_name, **kwargs)
    else:
        raise Exception(f'Unknown vote prediction type : {vote_pred_type}')
    # Find k nearest prototypes
    kn_prototypes_idxs =  get_nearest_prototypes_indices(dist_to_prototypes, k)
    # Aggregate the vote prototypes of the clusters associated with those distances
    nearest_vote_prototypes = [vote_prts[kn_prt_idx] for kn_prt_idx in kn_prototypes_idxs]
    unrounded_vcp = np.average(np.array(nearest_vote_prototypes), axis=0)
    rounded_vcp = np.round(unrounded_vcp, 0).astype(np.int8)
    vote_count_prediction = rounded_vcp
    return vote_count_prediction

In [69]:
def get_indiv_vote_predictions(split, vote_prts, image_names, k, vote_pred_type, **kwargs):
    '''Maps the name of every test image with its generated vote count prediction'''
    vote_predictions = {}
    query_times = {}
    # Prepare data
    test_idxs = split[1]
    # For each test image...
    for test_img_idx in test_idxs:
        # Register query computation time
        start = timeit.default_timer()
        # Generate and store prediction (along with the image's name)
        kwargs['image_names'] = image_names
        test_img_name = image_names[test_img_idx]
        vote_count_prediction = gen_prediction(test_img_name, vote_prts, k, vote_pred_type, **kwargs)
        vote_predictions[test_img_name] = vote_count_prediction
        # Calculate and store query computation time
        end = timeit.default_timer()
        query_time = end - start
        query_times[test_img_name] = query_time
    return vote_predictions, query_times

def get_global_vote_predictions(splits, gbl_vote_prts, image_names, k, vote_pred_type, **kwargs):
    '''Maps indices of test folds/splits with the vote count predictions generated'''
    global_vote_predictions = {}
    global_query_times = {}
    for split_idx in gbl_vote_prts.keys():
        # Generate predictions and query times.
        kwargs['split_idx'] = split_idx # Some functions use this parameter
        vote_preds, query_times = get_indiv_vote_predictions(splits[split_idx], gbl_vote_prts[split_idx], image_names, k, vote_pred_type, **kwargs)
        # Create { split_idx: vote_predictions } and { split_idx : query_times } pairs
        global_vote_predictions[split_idx] = vote_preds
        global_query_times[split_idx] = query_times
    return global_vote_predictions, global_query_times

In [117]:
# Warning: This code access variables defined previously without referencing them in its arguments.
def get_predictions(k, vote_pred_type, sim_metric_id, **kwargs):
    predictions = None
    if vote_pred_type == 'prt-based':
        predictions = get_global_vote_predictions(splits, global_vote_prototypes, X_names, k, vote_pred_type,
                                                  feats_df=feats_df, gbl_feat_prts=global_feat_prototypes, metric=sim_metric_id)
    elif vote_pred_type == 'pca-based':
        predictions = get_global_vote_predictions(splits, global_vote_prototypes, X_names, k, vote_pred_type,
                                                 images_folder_path=IMAGES_FOLDER_PATH, gbl_pca_objs=global_feat_prototypes)
    elif vote_pred_type == 'prtless':
        predictions = get_global_vote_predictions(splits, global_vote_prototypes, X_names, k, vote_pred_type,
                                                  feats_df=feats_df, gbl_cl_results=global_cl_results, metric=sim_metric_id)
    else:
        raise Exception(f'Unknown vote prediction type : {vote_pred_type}')
    return predictions

##### A little function to evaluate different values for k

In [142]:
def batch_get_predictions(k_range, **kwargs):
    batch_predictions, batch_query_times = {}, {}
    for k in k_range:
        preds, query_times = get_predictions(k, **kwargs)
        batch_predictions[k] = preds
        batch_query_times[k] = query_times
    return batch_predictions, batch_query_times

### Generating predictions

In [120]:
# Sanity check / Remainder of constants
print(VOTE_PRED_TYPE, SIM_ID)

prt-based euclid


In [143]:
k_range = [1, 3, 5, 6, 11]
batch_results = batch_get_predictions(k_range, vote_pred_type=VOTE_PRED_TYPE, sim_metric_id=SIM_METRIC_ID)
batch_global_predictions, batch_global_query_times = batch_results

In [146]:
# For k=5, in the cluster #0, show generated predictions
batch_global_predictions[5][0]

{'2388889__hotdog__0.99999714.jpg': array([7, 3, 3, 1], dtype=int8),
 '2417881__zebra__0.9999945.jpg': array([8, 3, 3, 1], dtype=int8),
 '2403403__banana__0.9999926.jpg': array([7, 4, 2, 1], dtype=int8),
 '2381941__zebra__0.9999914.jpg': array([8, 3, 3, 1], dtype=int8),
 '2403741__zebra__0.99999523.jpg': array([8, 3, 3, 1], dtype=int8),
 '2404281__zebra__0.999998.jpg': array([8, 3, 3, 1], dtype=int8),
 '2416627__zebra__0.9999987.jpg': array([8, 3, 3, 1], dtype=int8),
 '2391964__flamingo__1.0.jpg': array([8, 3, 3, 1], dtype=int8),
 '2404583__umbrella__0.99999297.jpg': array([7, 3, 3, 1], dtype=int8),
 '2409637__four-poster__0.99999464.jpg': array([8, 3, 3, 1], dtype=int8),
 '2380669__parking_meter__0.9999993.jpg': array([8, 3, 3, 1], dtype=int8),
 '2411196__crane__0.9999995.jpg': array([8, 3, 3, 1], dtype=int8),
 '134__zebra__0.9999949.jpg': array([8, 3, 3, 1], dtype=int8),
 '2405905__traffic_light__0.99999535.jpg': array([7, 3, 3, 1], dtype=int8),
 '2404127__zebra__0.9999933.jpg': arra

In [147]:
# For k=5, in the cluster #0, show query times
batch_global_query_times[5][0]

{'2388889__hotdog__0.99999714.jpg': 0.0009245999999620835,
 '2417881__zebra__0.9999945.jpg': 0.0009155000007012859,
 '2403403__banana__0.9999926.jpg': 0.0010788999998112558,
 '2381941__zebra__0.9999914.jpg': 0.0014029000003574765,
 '2403741__zebra__0.99999523.jpg': 0.0009532000003673602,
 '2404281__zebra__0.999998.jpg': 0.0009143999996013008,
 '2416627__zebra__0.9999987.jpg': 0.0009102999993046978,
 '2391964__flamingo__1.0.jpg': 0.000902200000382436,
 '2404583__umbrella__0.99999297.jpg': 0.0009089000004678383,
 '2409637__four-poster__0.99999464.jpg': 0.0009509999999863794,
 '2380669__parking_meter__0.9999993.jpg': 0.0009226000001945067,
 '2411196__crane__0.9999995.jpg': 0.000929300000279909,
 '134__zebra__0.9999949.jpg': 0.0010785999993458972,
 '2405905__traffic_light__0.99999535.jpg': 0.0030346000003191875,
 '2404127__zebra__0.9999933.jpg': 0.0009722000004330766,
 '2406857__zebra__0.9999894.jpg': 0.0009160999998130137,
 '2414277__zebra__0.9999908.jpg': 0.0009008999995785416,
 '2385298

## Metric Evaluation

### Vote Accuracy (RMSE, Manhattan, etc...)

In [64]:
def calc_distance(p1, p2, metric):
    if metric == 'rmse': return np.sum(np.square(p1 - p2))
    elif metric == 'manhattan': return np.sum(np.abs(p1 - p2))
    else: print('Unknown metric type')
        
def eval_indiv_vote_preds(vote_predictions, metric):
    vote_distances = []
    for img_name, vote_pred in vote_predictions.items():
        # Fetch real votes and compare with vote predictions
        real_votes = votes_df.loc[img_name].values[:4]
        distance = calc_distance(real_votes, vote_pred, metric)
        vote_distances.append(distance)
    vote_distances = np.array(vote_distances)
    if metric=='rmse': metrics = {'rmse': round(np.sqrt(np.average(vote_distances)), 2)}
    elif metric=='manhattan': metrics = {'manhattan': (np.average(vote_distances), 2)}
    else:
        metrics = {
            'average': round(np.average(vote_distances), 2),
            'std. dev.': round(np.std(vote_distances), 2),
            'range': [round(np.min(vote_distances), 2), round(np.max(vote_distances), 2)],
        }
    return metrics

def eval_global_vote_preds(global_vote_predictions, metric):
    global_metrics = {}
    # Calculate metrics for each split
    for split_idx, vote_predictions in global_vote_predictions.items():
        global_metrics[split_idx] = eval_indiv_vote_preds(vote_predictions, metric)
    # Aggregate metrics for all splits
    global_metrics['global'] = {}
    for metric_type in global_metrics[0].keys():
        metrics_per_type = [metrics[metric_type] for split_key, metrics in global_metrics.items() if split_key != 'global']
        avgd_metrics_per_type = np.round(np.average(np.array(metrics_per_type), axis=0), 2)
        global_metrics['global'][metric_type] = avgd_metrics_per_type
    return global_metrics

### Query Times

In [65]:
def eval_indiv_query_times(query_times):
    times = [query_time for _, query_time in query_times.items()]
    avg_time = np.average(np.array(times))
    return avg_time

def eval_global_query_times(global_query_times):
    global_metrics = {}
    for split_idx, query_times in global_query_times.items():
        global_metrics[split_idx] = eval_indiv_query_times(query_times)
    avg_times = [avg_time for _, avg_time in global_metrics.items()]
    global_avg_time = np.average(np.array(avg_times))
    return global_avg_time

In [66]:
METRIC = 'rmse'
global_vote_metrics_k1 = eval_global_vote_preds(global_vote_predictions_k1, metric=METRIC)
global_vote_metrics_k3 = eval_global_vote_preds(global_vote_predictions_k3, metric=METRIC)
global_vote_metrics_k5 = eval_global_vote_preds(global_vote_predictions_k5, metric=METRIC)
global_vote_metrics_k7 = eval_global_vote_preds(global_vote_predictions_k7, metric=METRIC)
global_vote_metrics_k9 = eval_global_vote_preds(global_vote_predictions_k9, metric=METRIC)
global_vote_metrics_k11 = eval_global_vote_preds(global_vote_predictions_k11, metric=METRIC)

In [67]:
print(global_vote_metrics_k1['global'])
print(eval_global_query_times(gqts_k1))

{'rmse': 4.91}
0.0004279884999998984


In [68]:
print(global_vote_metrics_k3['global'])
print(eval_global_query_times(gqts_k3))

{'rmse': 4.55}
0.00033947499999982257


In [69]:
print(global_vote_metrics_k5['global'])
print(eval_global_query_times(gqts_k5))

{'rmse': 4.81}
0.00034090300000013226


In [70]:
print(global_vote_metrics_k7['global'])
print(eval_global_query_times(gqts_k7))

{'rmse': 4.72}
0.00038593250000001683


In [71]:
print(global_vote_metrics_k9['global'])
print(eval_global_query_times(gqts_k9))

{'rmse': 4.57}
0.00037741649999997403


In [72]:
print(global_vote_metrics_k11['global'])
print(eval_global_query_times(gqts_k11))

{'rmse': 4.55}
0.00039934300000037836


At some point, RMSE was 4.91, 4.55, 4.81, 4.72, 4.57, 4.55!!!!!!