# Clustering using Latent Features (from InceptionV3)

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
import sklearn

In [3]:
# Loading IncV1 latent features
SIM_MX_EUCLID_FILE_PATH = os.path.join('..', 'results', 'matrices', 'color_hist_euclid_sim_matrix.csv')
SIM_MX_COSINE_FILE_PATH = os.path.join('..', 'results', 'matrices', 'color_hist_cosine_sim_matrix.csv')
VOTES_FILE_PATH = os.path.join('..', 'results', 'votes_summary.csv')

### Loading Data

#### Similarity Matrices

In [4]:
sim_mx_euclid_df = pd.read_csv(SIM_MX_EUCLID_FILE_PATH, index_col=0)
sim_mx_euclid_df.head(3)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,0.0,6285.621051,14593.537679,5772.380618,11441.225721,11880.166834,9123.23835,5983.605101,7356.368941,8848.581129,...,10007.829435,8019.789274,10733.466448,9367.552722,8243.123316,7766.734835,7264.524073,9446.944797,7846.279883,8571.525069
1328__coil__0.99999607.jpg,6285.621051,0.0,12651.202235,4392.810718,9290.19031,10232.351538,6772.767824,4410.541237,7820.760705,6133.401177,...,7638.769273,5194.897304,8131.912321,6862.227918,6088.735008,7629.691999,3859.095749,7329.847611,5987.893787,6005.23505
134__zebra__0.9999949.jpg,14593.537679,12651.202235,0.0,13183.485427,15001.600315,14800.066284,12944.626144,14127.997947,13790.634141,13337.084014,...,13134.598129,13174.04896,11249.446564,13218.859331,11597.730726,14856.417065,11823.161168,13457.201195,13181.74116,12538.407634


In [5]:
sim_mx_cosine_df = pd.read_csv(SIM_MX_COSINE_FILE_PATH, index_col=0)
sim_mx_cosine_df.head(3)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,0.0,0.263312,0.88851,0.215065,0.708969,0.767865,0.593644,0.222727,0.321178,0.553927,...,0.70074,0.451223,0.756252,0.625707,0.478438,0.32144,0.365356,0.57988,0.406442,0.522883
1328__coil__0.99999607.jpg,0.263312,0.0,0.772453,0.206157,0.569966,0.711564,0.474664,0.143471,0.456048,0.385842,...,0.576,0.27986,0.58696,0.484427,0.384185,0.352028,0.156797,0.468385,0.323888,0.377533
134__zebra__0.9999949.jpg,0.88851,0.772453,0.0,0.847642,0.896885,0.874984,0.799678,0.882276,0.803249,0.855287,...,0.810574,0.835181,0.543752,0.838339,0.611558,0.871917,0.645131,0.813053,0.791438,0.744133


#### Votes

In [6]:
votes_df = pd.read_csv(VOTES_FILE_PATH, index_col=0)
votes_df.head(3)

Unnamed: 0,ig,lime,xrai,anchor,best
1222__pool_table__0.9999995.jpg,12,13,3,1,lime
1328__coil__0.99999607.jpg,17,4,3,2,ig
134__zebra__0.9999949.jpg,14,1,8,2,ig


## DBSCAN Clustering

In [7]:
dbscan_best_params = []

In [8]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [9]:
def get_sim_mx_subset(sim_mx_values, filter_idxs):
    return sim_mx_values.take(filter_idxs, axis=0).take(filter_idxs, axis=1)

In [10]:
def fit_dbscan_sim_mx(data, min_samples, eps_values, 
               min_no_clusters=2, max_no_clusters=np.inf,
               min_clust_instances=None, min_clust_instances_pct=0.7,
               max_clust_instances=np.inf):
    # Condition precalculation
    if min_clust_instances_pct: # If % was defined
        min_clust_instances = round(data.shape[0] * min_clust_instances_pct)
    elif not min_clust_instances: # Else, if nominal amount was not specified
        min_clust_instances = 100
    # Code
    scores, clusters, instances = [], [], []
    for m in min_samples:
        row_scores, row_clusters, row_instances = [], [], []
        for e in eps_values:
            db = DBSCAN(min_samples=m, eps=e, metric='precomputed').fit(data)
            # Get only non anomalous instances and indices
            non_a = db.labels_ != -1 # [False, ..., False] if all are outliers
            non_a_idxs = np.argwhere(non_a==True)
            non_a_idxs = non_a_idxs.reshape(non_a_idxs.shape[0])
            # Calculate conditions
            n_clusters = len(np.unique(db.labels_[non_a])) # 0 if all are outliers
            n_instances = len(db.labels_[non_a]) # 0 if all are outliers
            # Apply conditions (why does it output NaN and not None?)
            valid_n_clusters = n_clusters >= min_no_clusters and n_clusters <= max_no_clusters
            valid_n_cl_instances = n_instances >= min_clust_instances and n_instances <= max_clust_instances
            if (valid_n_clusters and valid_n_cl_instances):
                non_a_data = get_sim_mx_subset(data, non_a_idxs)
                score = silhouette_score(non_a_data, db.labels_[non_a], metric='precomputed')
            else:
                score = None
            # Store results
            row_scores.append(score)
            row_clusters.append(n_clusters)
            row_instances.append(n_instances)
        # Store row results
        scores.append(row_scores)
        clusters.append(row_clusters)
        instances.append(row_instances)
    # Prepare and return values
    ms_axis = pd.Index(min_samples, name='Min_samples')
    eps_axis = pd.Index(eps_values, name='Epsilon')
    df_scores = pd.DataFrame(scores, index=ms_axis, columns=eps_axis)
    df_clusters = pd.DataFrame(clusters, index=ms_axis, columns=eps_axis)
    df_instances = pd.DataFrame(instances, index=ms_axis, columns=eps_axis)
    return df_scores, df_clusters, df_instances

In [11]:
def print_results(m, eps, scores_df, instances_df, clusters_df):
    score = round(scores_df.loc[m][eps], 4)
    instances = instances_df.loc[m][eps]
    clusters = clusters_df.loc[m][eps]
    print(f'DBSCAN using parameters m={m} and eps={eps} yields the next clustering results:')
    print()
    print(f'- Sil. score: {score}')
    print(f'- {instances} clustered instances into {clusters} clusters')
    print(f'- Avg. of {round(instances/clusters, 2)} instances per cluster')

### Color Histograms with Euclidean Distance

In [21]:
dfs, dfc, dfi = fit_dbscan_sim_mx(sim_mx_euclid_df, range(2, 10), range(4600, 7000, 200))
dfs

Epsilon,4600,4800,5000,5200,5400,5600,5800,6000,6200,6400,6600,6800
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,,0.57864,0.567089,0.564217,0.562485,0.554438,0.549932,0.544382,0.540913,0.505195,0.336549,0.329418
3,,,,,,,,,,0.505195,0.505195,0.498217
4,,,,,,,,,,,,
5,,,,,,,,,,,,
6,,,,,,,,,,,,
7,,,,,,,,,,,,
8,,,,,,,,,,,,
9,,,,,,,,,,,,


In [17]:
print_results(2, 4800, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=4800 yields the next clustering results:

- Sil. score: 0.5786
- 150 clustered instances into 2 clusters
- Avg. of 75.0 instances per cluster


In [22]:
dbscan_best_params.append([2, 4800, 'color_hist', False, 'euclid', 0.5786, 2, 150])

### Latent Features with Cosine Distance

In [25]:
dfs, dfc, dfi = fit_dbscan_sim_mx(sim_mx_cosine_df, range(2, 10), np.arange(0.05, 0.6, 0.05))
dfs

Epsilon,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,,,0.456966,0.411263,0.34772,,,,,,
3,,,0.456966,,,,,,,,
4,,,,,,,,,,,
5,,,,,,,,,,,
6,,,,,,,,,,,
7,,,,,,,,,,,
8,,,,,,,,,,,
9,,,,,,,,,,,


In [28]:
dfc

Epsilon,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,9,5,2,2,3,1,1,1,1,1,1
3,4,2,2,1,1,1,1,1,1,1,1
4,1,1,2,1,1,1,1,1,1,1,1
5,0,1,1,1,1,1,1,1,1,1,1
6,0,1,1,1,1,1,1,1,1,1,1
7,0,1,1,1,1,1,1,1,1,1,1
8,0,1,1,1,1,1,1,1,1,1,1
9,0,1,1,1,1,1,1,1,1,1,1


In [29]:
dfi

Epsilon,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,25,90,139,169,178,187,189,190,195,197,198
3,15,84,139,167,174,187,189,190,195,197,198
4,5,77,135,165,174,187,189,190,195,197,198
5,0,76,130,161,173,186,189,190,195,197,198
6,0,70,128,160,172,186,188,190,195,197,198
7,0,67,127,157,170,186,188,190,195,197,198
8,0,61,122,157,170,184,188,190,195,197,198
9,0,60,120,157,170,182,188,190,195,197,198


In [30]:
dbscan_best_params.append([3, 0.15, 'color_hist', False, 'cosine', 0.4569, 2, 139])

## Best Results

In [31]:
df = pd.DataFrame(dbscan_best_params,
                  columns=['m', 'e', 'data', 'scaled', 'similarity', 'sscore', 'clusters', 'instances'])
df

Unnamed: 0,m,e,data,scaled,similarity,sscore,clusters,instances
0,2,4800.0,color_hist,False,euclid,0.5786,2,150
1,3,0.15,color_hist,False,cosine,0.4569,2,139


In [32]:
BEST_PARAMS_FILE_PATH = os.path.join('..', 'results', 'best_color_hist_params.csv')
df.to_csv(BEST_PARAMS_FILE_PATH, index=False)