# Vote Count Prediction for new Images

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn

## Loading Data

In [2]:
FEATS_FILE_PATH = os.path.join('..', 'features', 'incv1_feats.csv')
VOTES_FILE_PATH = os.path.join('..', 'results', 'votes_summary.csv')

#### Data (Features)

In [3]:
feats_df = pd.read_csv(FEATS_FILE_PATH, index_col=0)
feats_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
1222__pool_table__0.9999995.jpg,0.882798,0.896023,0.123852,0.257982,0.03605,0.108023,0.633841,0.457301,1.684949,0.285681,...,0.422634,0.346122,0.111589,1.441579,0.198722,0.246648,0.295942,0.56095,0.058328,0.117393
1328__coil__0.99999607.jpg,0.483815,0.134309,0.021849,0.367267,0.08925,0.007518,0.069921,0.219347,0.08926,0.046694,...,0.049852,0.00414,0.199223,0.718976,0.0,0.0,0.0,0.159411,0.012007,0.001601
134__zebra__0.9999949.jpg,0.291067,0.375913,0.217742,1.269691,0.384181,0.07647,0.66207,0.662391,0.827774,0.115826,...,0.018289,0.0,0.000775,0.903884,0.589769,0.016957,0.418493,0.00535,0.004198,0.18546


#### Votes

In [4]:
votes_df = pd.read_csv(VOTES_FILE_PATH, index_col=0)
votes_df.head(3)

Unnamed: 0,ig,lime,xrai,anchor,best
1222__pool_table__0.9999995.jpg,12,13,3,1,lime
1328__coil__0.99999607.jpg,17,4,3,2,ig
134__zebra__0.9999949.jpg,14,1,8,2,ig


Here's a sanity check for the vote proportions in our dataset. In the original XAI-CBR paper, the vote proportions were as follows:
- IG: 45%
- XRAI: 30%
- LIME: 18%
- ANCHOR: 7%

Also, IG was the most voted technique, at least by hard voting aggregation, with a majority of 62% images.


In [5]:
# Share of all votes received by each technique. The denominator is the
# total number of votes in the table, computed from the data instead of
# being hard-coded, so the proportions always add up to 1.
votes_per_technique = votes_df[['ig', 'lime', 'xrai', 'anchor']].sum()
votes_per_technique / votes_per_technique.sum()

ig        0.488315
lime      0.183467
xrai      0.271713
anchor    0.056505
dtype: float64

There's a slight imbalance in these proportions with respect to the ones presented in the paper. It seems like some votes from the XRAI and ANCHOR techniques drifted to the IG technique. We'll check this later; it should not be of great importance for the experiments in this notebook.

### Data Preprocessing

In [6]:
# Feature matrix and image names come from the features table.
X = feats_df.values
X_names = feats_df.index.values
# Pick the vote columns by name rather than by position: slicing
# `votes_df.values` goes through a mixed-type frame (the 'best' column holds
# strings), which turns the vote counts into an object array.
# NOTE(review): X and y are paired by row position, so this assumes both CSV
# files list the images in the same order -- TODO confirm.
VOTE_COLUMNS = ['ig', 'lime', 'xrai', 'anchor']
y = votes_df[VOTE_COLUMNS].to_numpy()
best = votes_df['best'].to_numpy()

In [7]:
print(X.shape, X_names.shape, y.shape, best.shape)

(198, 1024) (198,) (198, 4) (198,)


#### Instance deletion
Stratified subsampling cannot be performed on the dataset because only one instance is best explained with ANCHOR. Given the very small importance of that instance in the dataset, we will continue without it (i.e. we will find that instance and remove it from the dataset).

In [8]:
# Positions of every instance whose winning technique is ANCHOR.
# flatnonzero returns all matches as a flat index array (and an empty array
# when there is none), whereas argwhere(...)[0] kept only the first match and
# raised IndexError on no match.
anchor_idxs = np.flatnonzero(best == 'anchor')
anchor_idxs

array([155], dtype=int64)

In [9]:
X_names[anchor_idxs], best[anchor_idxs]

(array(['2411942__zebra__0.99999654.jpg'], dtype=object),
 array(['anchor'], dtype=object))

In [10]:
# Delete that instance from all data partitions (X, y, etc.)
X = np.delete(X, anchor_idxs, axis=0)
X_names = np.delete(X_names, anchor_idxs, axis=0)
y = np.delete(y, anchor_idxs, axis=0)
best = np.delete(best, anchor_idxs, axis=0)

In [11]:
print(X.shape, X_names.shape, y.shape, best.shape)

(197, 1024) (197,) (197, 4) (197,)


## Splitting and Fold Creation

In [12]:
from sklearn.model_selection import StratifiedShuffleSplit as SSS
from sklearn.model_selection import ShuffleSplit as SS

#### TODO: Should I perform stratified subsampling or standard subsampling?

In [13]:
STRATIFIED = True

In [14]:
# Build the 5 train/test splits (80/20), stratified on the winning technique
# when STRATIFIED is set, plain shuffling otherwise.
splitter_cls = SSS if STRATIFIED else SS
splitter = splitter_cls(n_splits=5, test_size=0.2, random_state=42)
# Materialise the generator so the splits can be indexed and reused
splits = list(splitter.split(X, best))

In [15]:
splits[0]

(array([192, 147, 177,  11, 140,  51, 127, 118, 172, 191,  62, 124, 115,
         80, 190, 142,  92,  69,  25,  14,  42,   3, 185,  90,  10,  76,
        176, 114,  44,  98, 166, 121,  79, 170,   1, 183,  28,  31, 155,
         75, 156, 101, 171,  13, 110, 122,  38,  27, 136,  20,   6,  56,
         35,  59, 139,  33,  78,  82,  21, 167, 117,  12,  49,  15,   5,
        152, 132,  81,  61, 163, 175,  91,   7, 174, 135,  74, 193, 129,
         60,  96,  50, 161, 159, 145, 126,  19,  65, 188,  73,  89, 133,
        179,  40,  86, 112,  26, 168, 189, 149,  94, 194,  18, 138, 169,
        102,  97,  71, 130,  53,  99, 148, 154,   8,  34, 182, 105,  55,
         95, 153,  72, 144,  77,  52,  30,   9,  37,   4,  93, 128, 137,
        195, 160, 111,  45, 164, 151,  29,  48,  70,  43,  57, 157,  39,
        141,  85, 150,  67,   0,  47, 113,  32,  17, 131, 180,  66, 100,
        186], dtype=int64),
 array([ 54, 187, 103,  23, 104, 108, 181,  64, 109, 134,  16, 146,   2,
        116, 106, 119, 

## Clustering

In [16]:
clusterable_params = []

In [17]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [18]:
def fit_dbscan(data, min_samples, eps_values,
               min_no_clusters=2, max_no_clusters=np.inf,
               min_clust_instances=None, min_clust_instances_pct=0.85,
               max_clust_instances=np.inf, metric='euclidean'):
    '''Grid-searches DBSCAN over min_samples x eps and scores each combination.

    For every (min_samples, eps) pair, DBSCAN is fitted on `data`; instances
    labelled -1 (noise) are discarded and the silhouette score is computed on
    the remaining ones, but only if the clustering satisfies the constraints
    on the number of clusters and on the number of clustered instances.

    Args:
        data: 2D array of instances to cluster.
        min_samples: iterable of DBSCAN `min_samples` values (rows of the result).
        eps_values: iterable of DBSCAN `eps` values (columns of the result).
        min_no_clusters, max_no_clusters: inclusive bounds on the number of
            non-noise clusters.
        min_clust_instances: minimum number of non-noise instances. When given
            (truthy), it takes precedence over `min_clust_instances_pct`.
        min_clust_instances_pct: fraction of `data` that must be clustered;
            used when `min_clust_instances` is not given. If both are falsy,
            the minimum falls back to 100 instances.
        max_clust_instances: maximum number of non-noise instances.
        metric: distance metric passed to DBSCAN and to the silhouette score.

    Returns:
        Tuple (df_scores, df_clusters, df_instances) of DataFrames indexed by
        'Min_samples' with 'Epsilon' columns. Scores of rejected combinations
        are NaN.
    '''
    # Resolve the minimum number of clustered instances. An explicit count
    # wins; previously it was silently overridden by the (truthy) default pct.
    if min_clust_instances:
        pass
    elif min_clust_instances_pct:
        min_clust_instances = round(data.shape[0] * min_clust_instances_pct)
    else:
        min_clust_instances = 100

    scores, clusters, instances = [], [], []
    for m in min_samples:
        row_scores, row_clusters, row_instances = [], [], []
        for e in eps_values:
            db = DBSCAN(min_samples=m, eps=e, metric=metric).fit(data)
            # Keep only non-anomalous instances (all False if all are outliers)
            non_a = db.labels_ != -1
            clustered_labels = db.labels_[non_a]
            n_clusters = len(np.unique(clustered_labels))  # 0 if all are outliers
            n_instances = len(clustered_labels)  # 0 if all are outliers
            valid_n_clusters = min_no_clusters <= n_clusters <= max_no_clusters
            valid_n_cl_instances = min_clust_instances <= n_instances <= max_clust_instances
            if valid_n_clusters and valid_n_cl_instances:
                score = silhouette_score(data[non_a], clustered_labels, metric=metric)
            else:
                # NaN rather than None: pandas turns None into NaN in float
                # columns anyway, and an explicit NaN keeps a column float
                # even when every combination in it is rejected.
                score = np.nan
            row_scores.append(score)
            row_clusters.append(n_clusters)
            row_instances.append(n_instances)
        scores.append(row_scores)
        clusters.append(row_clusters)
        instances.append(row_instances)

    # One DataFrame per measurement, all sharing the same axes
    ms_axis = pd.Index(min_samples, name='Min_samples')
    eps_axis = pd.Index(eps_values, name='Epsilon')
    df_scores = pd.DataFrame(scores, index=ms_axis, columns=eps_axis)
    df_clusters = pd.DataFrame(clusters, index=ms_axis, columns=eps_axis)
    df_instances = pd.DataFrame(instances, index=ms_axis, columns=eps_axis)
    return df_scores, df_clusters, df_instances

In [19]:
def print_results(m, eps, scores_df, instances_df, clusters_df):
    '''Prints a summary of the clustering obtained with one parameter pair.

    Args:
        m: `min_samples` value (row label of the three DataFrames).
        eps: epsilon value (column label of the three DataFrames).
        scores_df: silhouette scores, as returned by fit_dbscan.
        instances_df: number of clustered instances per parameter pair.
        clusters_df: number of clusters per parameter pair.
    '''
    # Single .loc lookup instead of chained indexing
    score = scores_df.loc[m, eps]
    instances = instances_df.loc[m, eps]
    clusters = clusters_df.loc[m, eps]
    # A rejected parameter pair may carry None as score; report it as nan
    # instead of failing inside round().
    score = float('nan') if score is None else round(score, 4)
    # No clusters at all means there is no meaningful average to report
    avg_per_cluster = round(instances / clusters, 2) if clusters else float('nan')
    print(f'DBSCAN using parameters m={m} and eps={eps} yields the next clustering results:')
    print()
    print(f'- Sil. score: {score}')
    print(f'- {instances} clustered instances into {clusters} clusters')
    print(f'- Avg. of {avg_per_cluster} instances per cluster')

In [20]:
X[splits[0][0]].shape[0] * 0.85 # fit_dbscan rounds this threshold, so at least 133 clustered instances are needed

133.45

#### Split #0

In [21]:
dfs, dfc, dfi = fit_dbscan(X[splits[0][0]], range(2, 6), range(11, 18))
dfs

Epsilon,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,,,0.376409,0.309939,0.226041,0.16975,0.205307
3,,,,0.306802,0.254193,0.207208,0.201926
4,,,,,0.262017,,
5,,,,,0.262017,,


In [22]:
print_results(2, 13, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=13 yields the next clustering results:

- Sil. score: 0.3764
- 138 clustered instances into 16 clusters
- Avg. of 8.62 instances per cluster


In [23]:
clusterable_params.append([2, 13, 0])

#### Split #1

In [24]:
dfs, dfc, dfi = fit_dbscan(X[splits[1][0]], range(2, 7), range(11, 18))
dfs

Epsilon,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,,0.405127,0.388931,0.329026,0.233712,0.214392,0.279872
3,,,,0.324064,0.235581,0.347616,0.336508
4,,,,,0.235581,0.347616,0.336508
5,,,,,0.24272,0.347616,0.336508
6,,,,,0.234807,,


In [25]:
print_results(2, 12, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=12 yields the next clustering results:

- Sil. score: 0.4051
- 135 clustered instances into 19 clusters
- Avg. of 7.11 instances per cluster


In [26]:
clusterable_params.append([2, 12, 1])

#### Split #2

In [27]:
dfs, dfc, dfi = fit_dbscan(X[splits[2][0]], range(2, 7), range(11, 18))
dfs

Epsilon,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,,0.418787,0.41248,0.277547,0.230598,0.24841,0.244132
3,,,0.4102,0.269285,0.257609,,
4,,,,0.269285,0.257609,,
5,,,,0.275717,0.260715,,
6,,,,,0.254598,,


In [28]:
print_results(2, 12, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=12 yields the next clustering results:

- Sil. score: 0.4188
- 136 clustered instances into 17 clusters
- Avg. of 8.0 instances per cluster


In [29]:
clusterable_params.append([2, 12, 2])

#### Split #3

In [30]:
dfs, dfc, dfi = fit_dbscan(X[splits[3][0]], range(2, 7), range(11, 18))
dfs

Epsilon,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,,0.403343,0.323821,0.278651,0.210186,0.184646,0.296871
3,,,,0.273266,0.243847,,
4,,,,0.270225,0.243847,,
5,,,,,0.242323,,
6,,,,,0.251365,,


In [31]:
print_results(2, 12, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=12 yields the next clustering results:

- Sil. score: 0.4033
- 135 clustered instances into 19 clusters
- Avg. of 7.11 instances per cluster


In [32]:
clusterable_params.append([2, 12, 3])

#### Split #4

In [33]:
dfs, dfc, dfi = fit_dbscan(X[splits[4][0]], range(2, 7), range(11, 18))
dfs

Epsilon,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,,,0.372388,0.274406,0.216593,0.162722,
3,,,0.398123,0.27267,0.262001,,
4,,,,0.277303,0.262001,,
5,,,,,0.265082,,
6,,,,,0.274071,,


In [34]:
print_results(3, 13, dfs, dfi, dfc)

DBSCAN using parameters m=3 and eps=13 yields the next clustering results:

- Sil. score: 0.3981
- 133 clustered instances into 12 clusters
- Avg. of 11.08 instances per cluster


In [35]:
clusterable_params.append([3, 13, 4])

#### Clusterable parameters for each split

In [36]:
clusterable_params

[[2, 13, 0], [2, 12, 1], [2, 12, 2], [2, 12, 3], [3, 13, 4]]

## Clustering Results

In [37]:
def get_indiv_clustering_results(params):
    '''Clusters the training instances of one split with DBSCAN.

    `params` is a [min_samples, eps, split_idx] triple. Returns a dictionary
    mapping the name of each training image of that split to the label of
    the cluster it was assigned to.'''
    min_samples, eps, split_idx = params[0], params[1], params[2]
    # Training part of the requested split (always taken from X, not feats_df)
    train_idxs = splits[split_idx][0]
    fitted = DBSCAN(min_samples=min_samples, eps=eps).fit(X[train_idxs])
    # Pair every image name with the label DBSCAN gave to its instance
    return dict(zip(X_names[train_idxs], fitted.labels_))

def get_global_clustering_results(params_set):
    '''Runs the clustering for every parameter set in `params_set`.

    Returns a dictionary mapping the position of each parameter set to the
    clustering results generated with it.'''
    return {
        position: get_indiv_clustering_results(param_set)
        for position, param_set in enumerate(params_set)
    }

In [38]:
cl_results = get_global_clustering_results(clusterable_params)

In [39]:
# A little sanity check...
# The number of distinct non-noise labels of each split should be the same as
# the number of clusters detected in the clustering phase. The noise label
# (-1) is removed explicitly, so the count is right even for a split without
# noise, and every available split is checked.
for split_result in cl_results.values():
    print(len(set(split_result.values()) - {-1}))

16
19
17
19
12


## Clustering Prototypes

In our experiment, we want to predict the vote count for a new image based on its proximity to the available clusters. These clusters are composed of many data points, so the proximity of a new data point to a cluster can be measured in different ways, such as taking the distance between the new point and the nearest clustered point in the dataset.   
However, this approach can be biased, because new points get associated with a cluster based on its nearest point rather than on the overall position of the cluster. To avoid this, we calculate a "prototype" for each cluster: a data point which is the centroid of all the data points in that cluster. This way, we can measure the distance to the general position of a cluster more reliably.

In [40]:
def gen_indiv_cl_prototypes(cl_result, ignore_noise=True):
    '''Computes one prototype (centroid) per cluster.

    `cl_result` maps image names to cluster labels. The features of each
    image are looked up in `feats_df`, grouped by label and averaged. Images
    labelled -1 (noise) are skipped unless `ignore_noise` is False.
    Returns a {cluster label: prototype} dictionary.'''
    members_by_cluster = {}
    for img_name, cl_idx in cl_result.items():
        if cl_idx == -1 and ignore_noise:
            continue  # leave the noise "cluster" out
        members_by_cluster.setdefault(cl_idx, []).append(feats_df.loc[img_name].values)
    # The prototype of a cluster is the feature-wise average of its members
    return {
        cl_idx: np.average(members, axis=0)
        for cl_idx, members in members_by_cluster.items()
    }
    
def get_global_cl_prototypes(cl_results, ignore_noise=True):
    '''Computes the cluster prototypes of every clustering result.

    `cl_results` maps a key to a clustering result; the returned dictionary
    maps the same key to the prototypes generated from that result.'''
    return {
        key: gen_indiv_cl_prototypes(cl_result, ignore_noise=ignore_noise)
        for key, cl_result in cl_results.items()
    }

In [41]:
cls_prototypes = get_global_cl_prototypes(cl_results)

In [42]:
cls_prototypes[3]

{0: array([0.22467551, 0.11280407, 0.21147409, ..., 0.38322743, 0.25229493,
        0.89975796]),
 1: array([0.51009891, 0.98290047, 0.26727166, ..., 0.02832537, 0.03044627,
        0.0297337 ]),
 2: array([0.49321438, 2.33297098, 0.36776565, ..., 0.41373079, 0.01659491,
        0.57865075]),
 3: array([0.66188401, 0.93429005, 0.11197158, ..., 0.88486853, 0.21662746,
        0.12339265]),
 4: array([0.64520924, 0.08138089, 0.06071533, ..., 0.43601647, 0.08461746,
        0.51307747]),
 5: array([0.12109569, 0.45035109, 1.30916047, ..., 0.29143412, 1.8673538 ,
        0.2971926 ]),
 6: array([0.58922106, 0.23616834, 0.32533013, ..., 0.49715182, 0.29967682,
        0.46152148]),
 7: array([0.71469863, 0.22950385, 0.15569085, ..., 0.20414911, 0.17806031,
        0.49833884]),
 8: array([0.20415816, 1.58828342, 0.25630073, ..., 0.84833775, 0.44009239,
        0.46605633]),
 9: array([0.19464745, 0.25747797, 0.13683305, ..., 0.73445879, 0.00406807,
        0.08590543]),
 10: array([0.114571

## Vote Count Prediction

In [43]:
np.average(np.array(list(cls_prototypes[0].values())), axis=0)

array([0.35703897, 0.43481843, 0.38619751, ..., 0.37878752, 0.40115807,
       0.27475377])

In [44]:
def euclid_dist(point1, point2):
    '''Returns the Euclidean distance between two points.'''
    offset = point1 - point2
    squared_length = np.sum(np.square(offset))
    return np.sqrt(squared_length)

def find_nearest_prts_idxs(instance, prototypes, k=3):
    '''Returns the indexes of the k prototypes nearest to an instance.

    Args:
        instance: the point to compare against the prototypes.
        prototypes: sequence of prototype points.
        k: number of nearest prototypes to return.

    Returns:
        A list with the indexes (positions in `prototypes`) of the k nearest
        prototypes, nearest first; equidistant prototypes keep their original
        relative order. Returns None when k is larger than the number of
        prototypes, and an empty list when k is not positive.
    '''
    distances = [euclid_dist(instance, prt) for prt in prototypes]
    if k > len(distances): return None
    if k <= 0: return []
    # A stable sort keeps the lowest index first among ties, matching a
    # repeated "first strict minimum" search. It also covers k == len(distances),
    # which used to fail with a NameError on a misspelled variable.
    nearest_first = np.argsort(distances, kind='stable')
    return nearest_first[:k].tolist()

def get_indiv_vote_distances(prototypes, split_idx, k=3):
    '''Measures how far each test image of a split is from its nearest prototypes.

    Args:
        prototypes: {cluster label: prototype} dictionary for the split.
        split_idx: index of the split whose test instances are evaluated.
        k: number of nearest prototypes averaged for each test image.

    Returns:
        A {image name: distance} dictionary, where the distance is measured
        between the test instance and the average of its k nearest prototypes.

    Raises:
        ValueError: if the k nearest prototypes cannot be determined (the
            neighbour search returns None when k exceeds the number of
            prototypes).
    '''
    # Prepare data
    test_idxs = splits[split_idx][1]
    test_instances = X[test_idxs]
    test_img_names = X_names[test_idxs]
    prototypes = np.array(list(prototypes.values()))
    # For each test image, store the distance to the average of its nearest prototypes...
    distances = {}
    for instance, img_name in zip(test_instances, test_img_names):
        nearest_prototypes_idxs = find_nearest_prts_idxs(instance, prototypes, k=k)
        if nearest_prototypes_idxs is None:
            # Indexing with None would add an axis instead of selecting rows
            # and silently produce a meaningless distance.
            raise ValueError(
                f'Cannot select the {k} nearest prototypes out of {len(prototypes)} available')
        avg_prototype = np.average(prototypes[nearest_prototypes_idxs], axis=0)
        distances[img_name] = euclid_dist(instance, avg_prototype)
    return distances

def get_global_vote_distances(all_prototypes, k=3):
    '''Computes the test-image distances of every split.

    `all_prototypes` maps a split index to the prototypes of that split; the
    returned dictionary maps the same index to the distances computed for it.'''
    return {
        split_idx: get_indiv_vote_distances(local_prototypes, split_idx, k=k)
        for split_idx, local_prototypes in all_prototypes.items()
    }

In [45]:
global_vote_distances = get_global_vote_distances(cls_prototypes)

In [46]:
global_vote_distances

{0: {'2388889__hotdog__0.99999714.jpg': 15.086866221971677,
  '2417881__zebra__0.9999945.jpg': 8.44093593218563,
  '2403403__banana__0.9999926.jpg': 12.326415877836208,
  '2381941__zebra__0.9999914.jpg': 9.06846658904337,
  '2403741__zebra__0.99999523.jpg': 9.54811721409925,
  '2404281__zebra__0.999998.jpg': 8.636046251648438,
  '2416627__zebra__0.9999987.jpg': 8.01400750330919,
  '2391964__flamingo__1.0.jpg': 12.22487732683062,
  '2404583__umbrella__0.99999297.jpg': 10.21449717570772,
  '2409637__four-poster__0.99999464.jpg': 14.262922569652927,
  '2380669__parking_meter__0.9999993.jpg': 8.390590637183786,
  '2411196__crane__0.9999995.jpg': 13.588600816801142,
  '134__zebra__0.9999949.jpg': 9.732536579840666,
  '2405905__traffic_light__0.99999535.jpg': 10.148997761291337,
  '2404127__zebra__0.9999933.jpg': 10.757450041404939,
  '2406857__zebra__0.9999894.jpg': 8.01455082893586,
  '2414277__zebra__0.9999908.jpg': 7.778445648758404,
  '2385298__parking_meter__0.9999865.jpg': 10.94959846

#### TODO: Should distance be vote-based (i.e. nominal) or vote proportion-based (i.e. relative)?

## Metric Evaluation

In [157]:
def eval_indiv_rmse_vote_dist(vote_distances):
    '''Summarises the distances of one split.

    `vote_distances` maps image names to distances. Returns a dictionary with
    the average, the standard deviation and the [min, max] range of those
    distances, all rounded to 2 decimals.'''
    dist_values = np.asarray(list(vote_distances.values()))
    mean_dist = round(dist_values.mean(), 2)
    std_dist = round(dist_values.std(), 2)
    lowest = round(dist_values.min(), 2)
    highest = round(dist_values.max(), 2)
    return {
        'average': mean_dist,
        'std. dev.': std_dist,
        'range': [lowest, highest],
    }

def eval_global_vote_dist(global_vote_distances, mode='rmse'):
    '''Evaluates the distances of every split and aggregates them.

    Args:
        global_vote_distances: {split key: {image name: distance}} dictionary.
        mode: evaluation mode; only 'rmse' is implemented.

    Returns:
        A dictionary with the metrics of each split under its own key, plus a
        'global' entry holding every metric averaged over the splits and
        rounded to 2 decimals ('range' is kept as a [min, max] list). The
        'global' entry is empty when there are no splits.

    Raises:
        NotImplementedError: if `mode` is anything other than 'rmse'
            (technique-wise vote distances are not available yet).
    '''
    if mode != 'rmse':
        # Previously this path left the per-split metrics undefined
        raise NotImplementedError(f"Unsupported evaluation mode: {mode!r}")
    # Calculate metrics for each split
    per_split_metrics = {
        split_key: eval_indiv_rmse_vote_dist(vote_distances)
        for split_key, vote_distances in global_vote_distances.items()
    }
    global_metrics = dict(per_split_metrics)
    # Aggregate metrics for all splits. Metric names are read from the first
    # split available, whatever its key is.
    aggregated = {}
    if per_split_metrics:
        metric_types = list(next(iter(per_split_metrics.values())).keys())
        for metric_type in metric_types:
            metrics_per_type = [metrics[metric_type] for metrics in per_split_metrics.values()]
            avgd_metrics_per_type = np.round(np.average(np.array(metrics_per_type), axis=0), 2)
            if metric_type == 'range': avgd_metrics_per_type = list(avgd_metrics_per_type)
            aggregated[metric_type] = avgd_metrics_per_type
    global_metrics['global'] = aggregated
    return global_metrics

In [158]:
global_vote_metrics = eval_global_vote_dist(global_vote_distances)

In [159]:
global_vote_metrics

{0: {'average': 10.52, 'std. dev.': 2.06, 'range': [7.78, 17.29]},
 1: {'average': 10.87, 'std. dev.': 2.28, 'range': [7.39, 16.41]},
 2: {'average': 11.02, 'std. dev.': 2.37, 'range': [7.98, 16.32]},
 3: {'average': 11.39, 'std. dev.': 2.87, 'range': [8.03, 18.92]},
 4: {'average': 11.02, 'std. dev.': 2.66, 'range': [7.82, 18.7]},
 'global': {'average': 10.96, 'std. dev.': 2.45, 'range': [7.8, 17.53]}}