# Clustering Analysis

## PreAnalysis

### Loading Data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [9]:
# Best params: CSV files holding the best DBSCAN parameter sets, one file per
# feature type. Paths are relative to the notebook's working directory.
RESULTS_PATH = os.path.join('..', 'results')
HIST_BEST = os.path.join(RESULTS_PATH, 'hist_best_params.csv')  # histogram features
LATENT_BEST = os.path.join(RESULTS_PATH, 'latent_best_params.csv')  # latent features

In [18]:
# Data features: CSV tables with one row per image (image file name followed
# by the feature values, as shown in the previews where they are loaded).
FEATURES_PATH = os.path.join('..', 'features')
# NOTE(review): 'incv1' presumably stands for Inception-v1 embeddings - confirm.
LATENT_PATH = os.path.join(FEATURES_PATH, 'incv1_feats.csv')
HIST_PATH = os.path.join(FEATURES_PATH, 'color_hist.csv')

In [None]:
# Best techniques: header-less, ';'-separated file with one
# (image id, class, score, technique) record per row.
# Relies on RESULTS_PATH from the "Best params" cell having been run first.
BEST_TECHNIQUES = os.path.join(RESULTS_PATH, 'all.csv')

We load the best parameters found for each feature type and combine them into a single DataFrame for easy inspection.

In [114]:
# Read the best-parameter tables for the histogram and latent feature sets.
hist_params = pd.read_csv(HIST_BEST)
latent_params = pd.read_csv(LATENT_BEST)

In [115]:
# Stack both tables row-wise. ignore_index=True renumbers the rows 0..n-1, so
# the row position can be used later as the id of each parameter set.
dbscan_best_params = pd.concat([hist_params, latent_params], axis=0, ignore_index=True)
dbscan_best_params

Unnamed: 0,m,e,data,scaled,similarity,sscore,clusters,instances
0,2,0.16,hist,False,cosine,0.44,2,147
1,2,0.19,hist,False,cosine,0.37,3,163
2,4,11.8,latent,False,euclid,0.46,9,140
3,3,0.2,latent,False,cosine,0.68,11,144


We load the original features so we can cluster them using the best params

In [19]:
# Latent feature table: first data column is the image file name, the
# remaining columns are the feature values (see the preview below).
latent_feats = pd.read_csv(LATENT_PATH)
latent_feats.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1222__pool_table__0.9999995.jpg,0.882798,0.896023,0.123852,0.257982,0.03605,0.108023,0.633841,0.457301,1.684949,...,0.422634,0.346122,0.111589,1.441579,0.198722,0.246648,0.295942,0.56095,0.058328,0.117393
1,1328__coil__0.99999607.jpg,0.483815,0.134309,0.021849,0.367267,0.08925,0.007518,0.069921,0.219347,0.08926,...,0.049852,0.00414,0.199223,0.718976,0.0,0.0,0.0,0.159411,0.012007,0.001601
2,134__zebra__0.9999949.jpg,0.291067,0.375913,0.217742,1.269691,0.384181,0.07647,0.66207,0.662391,0.827774,...,0.018289,0.0,0.000775,0.903884,0.589769,0.016957,0.418493,0.00535,0.004198,0.18546


In [20]:
# Histogram feature table: same layout as the latent one, i.e. image file
# name first, then the histogram values (see the preview below).
hist_feats = pd.read_csv(HIST_PATH)
hist_feats.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,1222__pool_table__0.9999995.jpg,178,51,43,49,37,40,54,57,57,...,8,5,12,9,13,14,12,12,7,51
1,1328__coil__0.99999607.jpg,47,39,66,118,112,134,143,164,194,...,97,114,127,188,211,172,121,90,61,186
2,134__zebra__0.9999949.jpg,0,0,1,1,4,4,7,5,12,...,34,17,40,14,25,12,2,4,2,13


We load the best interpretation technique selected for each image and convert the table into a dictionary that we can easily query by image name.

In [89]:
# The file has no header row and uses ';' as separator, so columns are named
# 0..3. dtype='object' disables numeric type inference; presumably this keeps
# the id and score text unchanged so file names can be rebuilt from it - confirm.
techniques = pd.read_csv(BEST_TECHNIQUES, sep=';', header=None, dtype='object')
techniques.head(3)

Unnamed: 0,0,1,2,3
0,1222,pool_table,0.9999995,lime
1,1328,coil,0.99999607,ig
2,134,zebra,0.9999949,xrai


In [177]:
def gen_name_technique_tuples(x):
    """Turn one techniques record into a ``[file_name, technique]`` pair.

    ``x`` is indexed with the keys 0..3: id, class name, score, technique.
    The file name is ``<id>__<class name>__<score>.jpg``. The id and score
    are converted with ``str``; the class name must already be a string.
    """
    name_parts = (str(x[0]), x[1], str(x[2]))
    file_name = '__'.join(name_parts) + '.jpg'
    return [file_name, x[3]]

In [178]:
# Build one [file_name, technique] pair per row of `techniques` (axis=1 passes
# each row to the function) and preview the first three pairs.
foo = techniques.apply(gen_name_technique_tuples, axis=1)
foo.values[:3]

array([list(['1222__pool_table__0.9999995.jpg', 'lime']),
       list(['1328__coil__0.99999607.jpg', 'ig']),
       list(['134__zebra__0.9999949.jpg', 'xrai'])], dtype=object)

In [179]:
# Lookup: image file name -> technique.
# NOTE(review): in the saved outputs `foo` has 3070 pairs but this map ends up
# with 202 keys, and repeated names carry different techniques (e.g. rows 0 and
# 3065). The comprehension silently keeps the LAST pair per name - confirm that
# this is the intended way to resolve duplicates.
name_tech_map = {name: tech for name, tech in foo.values}

## Clustering

In [108]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [120]:
# Lookup from the `data` value of a best-params row to its feature table.
feats_map = dict(hist=hist_feats, latent=latent_feats)

In [125]:
# Lookup from the `similarity` value of a best-params row to the metric name
# handed to DBSCAN through its `metric` argument.
sim_metric_map = dict(euclid='euclidean', cosine='cosine')

In [121]:
# Raw rows in the positional layout that get_clustering_results reads:
# [0] m (min_samples), [1] e (eps), [2] data, [4] similarity.
dbscan_best_params.values

array([[2, 0.16, 'hist', False, 'cosine', 0.44, 2, 147],
       [2, 0.19, 'hist', False, 'cosine', 0.37, 3, 163],
       [4, 11.8, 'latent', False, 'euclid', 0.46, 9, 140],
       [3, 0.2, 'latent', False, 'cosine', 0.68, 11, 144]], dtype=object)

In [163]:
def get_clustering_results(params, feats_types, metric_types):
    """Run one DBSCAN clustering per parameter set.

    Parameters
    ----------
    params : iterable of indexable rows
        Each row is read by position: ``[0]`` -> ``min_samples``,
        ``[1]`` -> ``eps``, ``[2]`` -> key into ``feats_types``,
        ``[4]`` -> key into ``metric_types``. Other positions are not used.
    feats_types : mapping
        Key -> feature table exposing ``.values``. Column 0 of the values
        holds the image name; the remaining columns are passed to DBSCAN.
    metric_types : mapping
        Key -> metric name passed to DBSCAN as ``metric``.

    Returns
    -------
    dict
        ``{row position in params: {image name: cluster label}}``.
    """
    clusterings = {}
    for position, row in enumerate(params):
        # Split the selected table into image names and feature vectors.
        table = feats_types[row[2]].values
        names, vectors = table[:, 0], table[:, 1:]
        # Configure and fit the clusterer for this parameter set.
        model = DBSCAN(
            metric=metric_types[row[4]],
            min_samples=row[0],
            eps=row[1],
        ).fit(vectors)
        # Pair every image name with the label it was assigned.
        clusterings[position] = dict(zip(names, model.labels_))
    return clusterings

In [164]:
# Cluster with every best-params row; res[i] maps image name -> cluster label
# for row i of dbscan_best_params.
res = get_clustering_results(dbscan_best_params.values, feats_map, sim_metric_map)

In [165]:
# Distinct labels produced by parameter set 3 (-1 is the noise label).
np.unique(list(res[3].values()))

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [148]:
# Show the parameter table again for comparison with the clustering output.
dbscan_best_params

Unnamed: 0,m,e,data,scaled,similarity,sscore,clusters,instances
0,2,0.16,hist,False,cosine,0.44,2,147
1,2,0.19,hist,False,cosine,0.37,3,163
2,4,11.8,latent,False,euclid,0.46,9,140
3,3,0.2,latent,False,cosine,0.68,11,144


Param set #2 gives the most varied clustering so far. We need a way to enforce a minimum number of instances in every cluster, because right now every clustering obtained has one cluster with many instances, while the rest of the clusters contain only two or three instances each.

In [172]:
# Full image name -> label mapping for parameter set 2 (long output).
res[2]

{'1222__pool_table__0.9999995.jpg': -1,
 '1328__coil__0.99999607.jpg': -1,
 '134__zebra__0.9999949.jpg': 0,
 '2377471__pizza__0.9999988.jpg': 1,
 '2377620__zebra__0.9999882.jpg': 0,
 '2377698__zebra__0.9999999.jpg': 0,
 '2378170__zebra__0.9999902.jpg': 0,
 '2378358__park_bench__0.99999833.jpg': -1,
 '2378523__banana__0.99999785.jpg': 2,
 '2379086__zebra__0.9999975.jpg': 0,
 '2379489__parking_meter__0.9999989.jpg': 3,
 '2380017__zebra__0.9999995.jpg': 0,
 '2380019__zebra__0.9999926.jpg': 0,
 '2380189__zebra__0.9999993.jpg': 0,
 '2380319__broccoli__0.9999957.jpg': 4,
 '2380447__bullet_train__0.9999869.jpg': -1,
 '2380669__parking_meter__0.9999993.jpg': 3,
 '2380865__traffic_light__0.99999714.jpg': -1,
 '2380905__gondola__0.9999888.jpg': -1,
 '2380925__zebra__0.9999987.jpg': 0,
 '2381648__zebra__0.9999995.jpg': 0,
 '2381879__zebra__0.99999523.jpg': 0,
 '2381932__traffic_light__0.99999964.jpg': -1,
 '2381941__zebra__0.9999914.jpg': 0,
 '2381968__ski__0.999984.jpg': 5,
 '2382183__pizza__0.9

## Clustering Analysis

In [169]:
def get_clustering_analysis(name_labels_map, name_tech_map):
    """Count how often each technique occurs inside every cluster.

    Parameters
    ----------
    name_labels_map : dict
        Image name -> cluster label. Entries labelled ``-1`` (noise) are
        skipped and never looked up in ``name_tech_map``.
    name_tech_map : dict
        Image name -> technique. A non-noise image name missing from this
        mapping raises ``KeyError``.

    Returns
    -------
    dict
        ``{cluster label: {technique: number of images}}``.
    """
    tallies = {}
    for img_name, cluster_id in name_labels_map.items():
        if cluster_id == -1:
            continue  # noise points do not belong to any cluster
        # Register the cluster the first time one of its images shows up.
        per_cluster = tallies.setdefault(cluster_id, {})
        tech = name_tech_map[img_name]
        # Start at zero for a technique not yet seen in this cluster.
        per_cluster[tech] = per_cluster.get(tech, 0) + 1
    return tallies

In [180]:
# Technique counts per cluster for parameter set 2.
clustering_no2_res = get_clustering_analysis(res[2], name_tech_map)

Best-technique frequencies per cluster in clustering #2 (m=4 and e=11.8 with latent features and the Euclidean metric)

In [181]:
# {cluster label: {technique: count}}; noise (-1) is excluded.
clustering_no2_res

{0: {'xrai': 13, 'ig': 54, 'lime': 7, 'anchor': 4},
 1: {'ig': 11},
 2: {'ig': 5},
 3: {'ig': 13, 'xrai': 3, 'lime': 2},
 4: {'ig': 6},
 5: {'xrai': 2, 'ig': 3},
 8: {'ig': 5},
 6: {'ig': 8},
 7: {'ig': 4}}

## TODO: Create soft voting metrics for intracluster evaluation

In [192]:
# Number of distinct image names that have a technique entry.
len(name_tech_map)

202

In [194]:
# Overall frequency of each technique across all mapped images.
# NOTE(review): the 'nan' entry in the saved output presumably comes from
# records without a technique - confirm, and decide whether to exclude them.
counts = np.unique(list(name_tech_map.values()), return_counts=True)
counts

(array(['anchor', 'ig', 'lime', 'nan', 'xrai'], dtype='<U6'),
 array([ 11, 147,  11,   4,  29], dtype=int64))

In [189]:
# Inspect every generated pair. The saved output has 3070 pairs, far more than
# the 202 keys of name_tech_map, so image names repeat.
foo

0       [1222__pool_table__0.9999995.jpg, lime]
1              [1328__coil__0.99999607.jpg, ig]
2             [134__zebra__0.9999949.jpg, xrai]
3           [2377471__pizza__0.9999988.jpg, ig]
4         [2377620__zebra__0.9999882.jpg, lime]
                         ...                   
3065    [1222__pool_table__0.9999995.jpg, xrai]
3066           [1328__coil__0.99999607.jpg, ig]
3067          [134__zebra__0.9999949.jpg, xrai]
3068        [2377471__pizza__0.9999988.jpg, ig]
3069      [2377620__zebra__0.9999882.jpg, xrai]
Length: 3070, dtype: object

In [190]:
# Inspect the raw techniques table to check the repeated records.
techniques

Unnamed: 0,0,1,2,3
0,1222,pool_table,0.9999995,lime
1,1328,coil,0.99999607,ig
2,134,zebra,0.9999949,xrai
3,2377471,pizza,0.9999988,ig
4,2377620,zebra,0.9999882,lime
...,...,...,...,...
3065,1222,pool_table,0.9999995,xrai
3066,1328,coil,0.99999607,ig
3067,134,zebra,0.9999949,xrai
3068,2377471,pizza,0.9999988,ig
