# Clustering using Latent Features (from InceptionV3)

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
import sklearn

In [3]:
# Input file locations: pairwise matrices computed from InceptionV3 (incv3)
# latent features, plus the per-image votes summary.
RESULTS_DIR = os.path.join('..', 'results')
MATRICES_DIR = os.path.join(RESULTS_DIR, 'matrices')

SIM_MX_EUCLID_FILE_PATH = os.path.join(MATRICES_DIR, 'incv3_feats_euclid_sim_matrix.csv')
SIM_MX_COSINE_FILE_PATH = os.path.join(MATRICES_DIR, 'incv3_feats_cosine_sim_matrix.csv')
VOTES_FILE_PATH = os.path.join(RESULTS_DIR, 'votes_summary.csv')

### Loading Data

#### Similarity Matrices

In [4]:
# Load the Euclidean pairwise matrix; the first CSV column (image file names)
# becomes the row index, so rows and columns share the same labels.
# NOTE(review): the file is named "sim_matrix" but the preview shows 0.0 on the
# diagonal, i.e. the values look like distances -- confirm with the producer.
sim_mx_euclid_df = pd.read_csv(SIM_MX_EUCLID_FILE_PATH, index_col=0)
sim_mx_euclid_df.head(3)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,0.0,24.89917,22.871903,25.031346,23.751015,25.26349,23.017009,23.726696,26.78418,23.866859,...,23.893972,28.554533,8.689397,23.648353,23.595179,24.177,23.629427,26.545261,27.091866,23.200351
1328__coil__0.99999607.jpg,24.89917,0.0,17.500986,18.181155,17.596692,19.09994,17.112792,17.488145,20.532969,17.789077,...,17.571292,23.570307,21.901239,16.987964,17.437562,19.139305,17.123877,19.577529,20.261407,16.637508
134__zebra__0.9999949.jpg,22.871903,17.500986,0.0,17.436307,7.257223,7.579433,6.056684,16.007572,19.907645,6.56977,...,5.660697,22.369222,19.397503,15.556074,16.074305,16.874761,16.384309,17.144394,17.840449,14.699963


In [5]:
# Load the cosine pairwise matrix with the same layout as the Euclidean one:
# the first CSV column (image file names) is used as the row index.
sim_mx_cosine_df = pd.read_csv(SIM_MX_COSINE_FILE_PATH, index_col=0)
sim_mx_cosine_df.head(3)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,0.0,0.766186,0.681436,0.771736,0.731295,0.778913,0.706014,0.751689,0.810367,0.738298,...,0.753613,0.762703,0.06842,0.76917,0.747195,0.747668,0.744787,0.828638,0.826484,0.718752
1328__coil__0.99999607.jpg,0.766186,0.0,0.655775,0.620482,0.643737,0.673256,0.650034,0.668247,0.682593,0.656279,...,0.659462,0.672215,0.75713,0.66209,0.672293,0.739528,0.640632,0.66133,0.660451,0.61215
134__zebra__0.9999949.jpg,0.681436,0.655775,0.0,0.645445,0.126298,0.108648,0.095005,0.656602,0.708973,0.102951,...,0.079798,0.637221,0.648792,0.660485,0.67203,0.660639,0.688565,0.560902,0.557332,0.563768


#### Votes

In [6]:
# Load the votes summary, indexed by image file name (first CSV column).
votes_df = pd.read_csv(VOTES_FILE_PATH, index_col=0)
votes_df.head(3)

Unnamed: 0,ig,lime,xrai,anchor,best
1222__pool_table__0.9999995.jpg,12,13,3,1,lime
1328__coil__0.99999607.jpg,17,4,3,2,ig
134__zebra__0.9999949.jpg,14,1,8,2,ig


## DBSCAN Clustering

In [7]:
# Accumulates one record per selected DBSCAN configuration, in the order
# [m, e, data, scaled, similarity, sscore, clusters, instances]
# (the column names used when the summary DataFrame is built at the end).
dbscan_best_params = []

In [8]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [9]:
def get_sim_mx_subset(sim_mx_values, filter_idxs):
    """Return the square sub-matrix restricted to the given positions.

    The same positional indices are applied first to the rows and then to the
    columns, so the result keeps only the pairwise entries between the
    selected items. Works on any object exposing ``take(indices, axis=...)``
    (e.g. a NumPy array or a pandas DataFrame).
    """
    kept_rows = sim_mx_values.take(filter_idxs, axis=0)
    kept_rows_and_cols = kept_rows.take(filter_idxs, axis=1)
    return kept_rows_and_cols

In [10]:
def fit_dbscan_sim_mx(data, min_samples, eps_values,
               min_no_clusters=2, max_no_clusters=np.inf,
               min_clust_instances=None, min_clust_instances_pct=0.7,
               max_clust_instances=np.inf):
    """Grid-search DBSCAN over a precomputed pairwise matrix.

    For every (min_samples, eps) pair DBSCAN is fitted with
    ``metric='precomputed'``; points labelled -1 (noise) are discarded and,
    when the remaining clustering satisfies the constraints below, its
    silhouette score is computed on the noise-free sub-matrix.

    Parameters
    ----------
    data : square pairwise matrix (DataFrame or array) passed to DBSCAN as-is.
    min_samples : iterable of ``min_samples`` values to try (grid rows).
    eps_values : iterable of ``eps`` values to try (grid columns).
    min_no_clusters, max_no_clusters : inclusive bounds on the cluster count.
    min_clust_instances : minimum number of non-noise instances. When given
        explicitly it takes precedence over ``min_clust_instances_pct``.
    min_clust_instances_pct : fraction of ``data.shape[0]`` used to derive the
        minimum when ``min_clust_instances`` is None. If it is also falsy the
        minimum defaults to 100.
    max_clust_instances : inclusive upper bound on non-noise instances.

    Returns
    -------
    (df_scores, df_clusters, df_instances) : three DataFrames indexed by
        ``Min_samples`` with ``Epsilon`` columns. Score cells that do not meet
        the constraints hold NaN.
    """
    # Materialise the grids so one-shot iterables survive the nested loops
    # and can also be used to build the result axes.
    min_samples = list(min_samples)
    eps_values = list(eps_values)

    # An explicit nominal threshold wins; otherwise derive it from the
    # percentage, falling back to 100 when neither is provided.
    if min_clust_instances is None:
        if min_clust_instances_pct:
            min_clust_instances = round(data.shape[0] * min_clust_instances_pct)
        else:
            min_clust_instances = 100

    scores, clusters, instances = [], [], []
    for m in min_samples:
        row_scores, row_clusters, row_instances = [], [], []
        for e in eps_values:
            labels = DBSCAN(min_samples=m, eps=e, metric='precomputed').fit(data).labels_
            # Keep only non-anomalous instances (DBSCAN marks noise as -1)
            non_a = labels != -1
            non_a_idxs = np.flatnonzero(non_a)
            n_clusters = len(np.unique(labels[non_a]))  # 0 if all are outliers
            n_instances = len(non_a_idxs)               # 0 if all are outliers

            valid_n_clusters = min_no_clusters <= n_clusters <= max_no_clusters
            valid_n_cl_instances = min_clust_instances <= n_instances <= max_clust_instances
            # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
            scorable = 2 <= n_clusters < n_instances

            if valid_n_clusters and valid_n_cl_instances and scorable:
                non_a_data = get_sim_mx_subset(data, non_a_idxs)
                score = silhouette_score(non_a_data, labels[non_a], metric='precomputed')
            else:
                # NaN (not None) keeps the score frame numeric in every column
                score = np.nan

            row_scores.append(score)
            row_clusters.append(n_clusters)
            row_instances.append(n_instances)
        scores.append(row_scores)
        clusters.append(row_clusters)
        instances.append(row_instances)

    # Prepare and return values
    ms_axis = pd.Index(min_samples, name='Min_samples')
    eps_axis = pd.Index(eps_values, name='Epsilon')
    df_scores = pd.DataFrame(scores, index=ms_axis, columns=eps_axis)
    df_clusters = pd.DataFrame(clusters, index=ms_axis, columns=eps_axis)
    df_instances = pd.DataFrame(instances, index=ms_axis, columns=eps_axis)
    return df_scores, df_clusters, df_instances

In [11]:
def print_results(m, eps, scores_df, instances_df, clusters_df):
    """Print a short summary of one DBSCAN grid cell.

    Parameters
    ----------
    m : ``Min_samples`` row label to look up.
    eps : ``Epsilon`` column label to look up.
    scores_df, instances_df, clusters_df : grid frames indexed by ``m`` with
        ``eps`` columns (note the order: scores, instances, clusters).

    A missing score (None/NaN) is reported as ``nan`` and the per-cluster
    average is ``nan`` when there are no clusters, instead of failing.
    """
    # Single-step label lookup rather than chained indexing
    raw_score = scores_df.loc[m, eps]
    score = round(raw_score, 4) if pd.notna(raw_score) else float('nan')
    instances = instances_df.loc[m, eps]
    clusters = clusters_df.loc[m, eps]
    # Avoid a 0/0 division when every point was labelled as noise
    avg_per_cluster = round(instances / clusters, 2) if clusters else float('nan')
    print(f'DBSCAN using parameters m={m} and eps={eps} yields the next clustering results:')
    print()
    print(f'- Sil. score: {score}')
    print(f'- {instances} clustered instances into {clusters} clusters')
    print(f'- Avg. of {avg_per_cluster} instances per cluster')

### Latent Features with Euclidean Distance

In [14]:
# Grid-search DBSCAN on the Euclidean matrix: min_samples in 2..9, eps in 8..17.
# The function returns (scores, clusters, instances) frames, in that order;
# the silhouette-score grid is displayed.
dfs, dfc, dfi = fit_dbscan_sim_mx(sim_mx_euclid_df, range(2, 10), range(8, 18))
dfs

Epsilon,8,9,10,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,,0.519403,0.539334,0.530887,0.523888,0.502596,0.404713,0.353062,0.340711,0.293263
3,,,0.539149,0.530298,0.522983,0.500947,0.400531,0.348019,0.337206,0.344206
4,,,0.557516,0.547405,0.539833,0.508479,0.416774,0.352309,0.416545,0.344206
5,,,,0.549667,0.541827,0.536531,0.416774,0.355074,0.416545,0.344206
6,,,,0.568107,0.561641,0.561641,0.431184,0.355074,0.416545,0.344206
7,,,,,,,0.419396,0.350095,0.410529,0.335699
8,,,,,,,0.432715,0.343375,0.413094,0.356833
9,,,,,,,0.427502,0.346429,0.413094,0.356833


In [15]:
# Summarise the chosen cell (m=6, eps=11). print_results expects the frames in
# the order scores, instances, clusters -- hence dfs, dfi, dfc.
print_results(6, 11, dfs, dfi, dfc)

DBSCAN using parameters m=6 and eps=11 yields the next clustering results:

- Sil. score: 0.5681
- 140 clustered instances into 7 clusters
- Avg. of 20.0 instances per cluster


In [16]:
# Record the selected Euclidean configuration. The score and the cluster /
# instance counts are read from the grid-search frames instead of being
# copied by hand, so the record cannot drift from the computed results.
best_m, best_eps = 6, 11
dbscan_best_params.append([
    best_m, best_eps, 'latent_v3', False, 'euclid',
    round(float(dfs.loc[best_m, best_eps]), 4),
    int(dfc.loc[best_m, best_eps]),
    int(dfi.loc[best_m, best_eps]),
])

### Latent Features with Cosine Distance

In [18]:
# Grid-search DBSCAN on the cosine matrix: min_samples in 2..9, eps in 0.05..0.55.
# np.arange with a float step accumulates rounding error (e.g. 0.15000000000000002),
# and those values become the Epsilon column labels; rounding to 2 decimals keeps
# the labels equal to the literals used for later lookups such as .loc[6, 0.2].
cosine_eps_values = np.round(np.arange(0.05, 0.6, 0.05), 2)
dfs, dfc, dfi = fit_dbscan_sim_mx(sim_mx_cosine_df, range(2, 10), cosine_eps_values)
dfs

Epsilon,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,,,0.769816,0.804339,0.802406,0.802406,0.79903,0.79903,0.782073,0.729812,0.475713
3,,,0.794725,0.803397,0.801378,0.801378,0.797861,0.797861,0.780667,0.726019,0.467599
4,,,,0.815454,0.813319,0.813319,0.809536,0.809536,0.784586,0.728371,0.472592
5,,,,0.821855,0.815525,0.815525,0.809536,0.809536,0.784586,0.728371,0.472592
6,,,,0.821855,0.81955,0.81955,0.81955,0.81955,0.792786,0.732859,0.554061
7,,,,,0.815808,0.815808,0.815808,0.815808,0.788034,0.725704,0.552187
8,,,,,,,,,0.787149,0.721729,0.552187
9,,,,,,,,,,0.721729,0.542468


In [19]:
# Summarise the chosen cell (m=6, eps=0.2). print_results expects the frames in
# the order scores, instances, clusters -- hence dfs, dfi, dfc.
print_results(6, 0.2, dfs, dfi, dfc)

DBSCAN using parameters m=6 and eps=0.2 yields the next clustering results:

- Sil. score: 0.8219
- 150 clustered instances into 8 clusters
- Avg. of 18.75 instances per cluster


In [21]:
# Record the selected cosine configuration. The score and the cluster /
# instance counts are read from the grid-search frames instead of being
# copied by hand, so the record cannot drift from the computed results.
best_m, best_eps = 6, 0.2
dbscan_best_params.append([
    best_m, best_eps, 'latent_v3', False, 'cosine',
    round(float(dfs.loc[best_m, best_eps]), 4),
    int(dfc.loc[best_m, best_eps]),
    int(dfi.loc[best_m, best_eps]),
])

## Best Results

In [22]:
# Turn the accumulated records into a table, one row per selected configuration.
# Column order must match the order used when appending to dbscan_best_params.
df = pd.DataFrame(dbscan_best_params,
                  columns=['m', 'e', 'data', 'scaled', 'similarity', 'sscore', 'clusters', 'instances'])
df

Unnamed: 0,m,e,data,scaled,similarity,sscore,clusters,instances
0,6,11.0,latent_v3,False,euclid,0.5681,7,140
1,6,0.2,latent_v3,False,cosine,0.8219,8,150


In [23]:
# Persist the best-parameter table next to the other results; the positional
# row index is not written (index=False).
BEST_PARAMS_FILE_PATH = os.path.join('..', 'results', 'best_incv3_params.csv')
df.to_csv(BEST_PARAMS_FILE_PATH, index=False)