# Clustering using Latent Features (from InceptionV1)

In [16]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [17]:
import sklearn

In [18]:
# Loading IncV1 latent features
SIM_MX_EUCLID_FILE_PATH = os.path.join('..', 'results', 'matrices', 'incv1_feats_euclid_sim_matrix.csv')
SIM_MX_COSINE_FILE_PATH = os.path.join('..', 'results', 'matrices', 'incv1_feats_cosine_sim_matrix.csv')
VOTES_FILE_PATH = os.path.join('..', 'results', 'votes_summary.csv')

### Loading Data

#### Similarity Matrices

In [19]:
sim_mx_euclid_df = pd.read_csv(SIM_MX_EUCLID_FILE_PATH, index_col=0)
sim_mx_euclid_df.head(3)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,0.0,18.077932,18.35201,19.313205,17.938015,20.897371,18.154289,19.302859,27.004215,17.51358,...,17.500137,18.677966,10.767253,18.312229,16.895729,18.79902,17.765271,19.112316,18.73503,16.739524
1328__coil__0.99999607.jpg,18.077932,0.0,16.759492,16.995887,15.510985,19.676206,16.044883,19.226313,27.071347,15.62919,...,15.498601,18.709625,19.031383,15.724877,15.621419,17.951774,15.027124,18.835509,17.194183,16.162395
134__zebra__0.9999949.jpg,18.35201,16.759492,0.0,17.076727,8.243228,8.670629,7.400814,18.219061,26.279913,5.943151,...,6.839282,17.21812,19.2221,16.306863,14.891362,17.448277,15.079868,18.65561,17.145698,13.605314


In [20]:
sim_mx_cosine_df = pd.read_csv(SIM_MX_COSINE_FILE_PATH, index_col=0)
sim_mx_cosine_df.head(3)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,0.0,0.596715,0.64871,0.660728,0.678895,0.655366,0.639423,0.564146,0.664004,0.623505,...,0.630923,0.551253,0.173044,0.609761,0.597685,0.598938,0.641557,0.532241,0.55112,0.529807
1328__coil__0.99999607.jpg,0.596715,0.0,0.676294,0.619278,0.657653,0.670955,0.62671,0.647045,0.720932,0.639172,...,0.640558,0.6466,0.625023,0.550529,0.677668,0.652152,0.587317,0.589918,0.539141,0.620833
134__zebra__0.9999949.jpg,0.64871,0.676294,0.0,0.668483,0.195514,0.10125,0.144694,0.606948,0.684568,0.09684,...,0.130526,0.573603,0.670446,0.635405,0.681183,0.654673,0.649893,0.604198,0.563096,0.476029


#### Votes

In [21]:
votes_df = pd.read_csv(VOTES_FILE_PATH, index_col=0)
votes_df.head(3)

Unnamed: 0,ig,lime,xrai,anchor,best
1222__pool_table__0.9999995.jpg,12,13,3,1,lime
1328__coil__0.99999607.jpg,17,4,3,2,ig
134__zebra__0.9999949.jpg,14,1,8,2,ig


## DBSCAN Clustering

In [22]:
dbscan_best_params = []

In [23]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [24]:
def get_sim_mx_subset(sim_mx_values, filter_idxs):
    return sim_mx_values.take(filter_idxs, axis=0).take(filter_idxs, axis=1)

In [25]:
def fit_dbscan_sim_mx(data, min_samples, eps_values, 
               min_no_clusters=2, max_no_clusters=np.inf,
               min_clust_instances=None, min_clust_instances_pct=0.7,
               max_clust_instances=np.inf):
    # Condition precalculation
    if min_clust_instances_pct: # If % was defined
        min_clust_instances = round(data.shape[0] * min_clust_instances_pct)
    elif not min_clust_instances: # Else, if nominal amount was not specified
        min_clust_instances = 100
    # Code
    scores, clusters, instances = [], [], []
    for m in min_samples:
        row_scores, row_clusters, row_instances = [], [], []
        for e in eps_values:
            db = DBSCAN(min_samples=m, eps=e, metric='precomputed').fit(data)
            # Get only non anomalous instances and indices
            non_a = db.labels_ != -1 # [False, ..., False] if all are outliers
            non_a_idxs = np.argwhere(non_a==True)
            non_a_idxs = non_a_idxs.reshape(non_a_idxs.shape[0])
            # Calculate conditions
            n_clusters = len(np.unique(db.labels_[non_a])) # 0 if all are outliers
            n_instances = len(db.labels_[non_a]) # 0 if all are outliers
            # Apply conditions (why does it output NaN and not None?)
            valid_n_clusters = n_clusters >= min_no_clusters and n_clusters <= max_no_clusters
            valid_n_cl_instances = n_instances >= min_clust_instances and n_instances <= max_clust_instances
            if (valid_n_clusters and valid_n_cl_instances):
                non_a_data = get_sim_mx_subset(data, non_a_idxs)
                score = silhouette_score(non_a_data, db.labels_[non_a], metric='precomputed')
            else:
                score = None
            # Store results
            row_scores.append(score)
            row_clusters.append(n_clusters)
            row_instances.append(n_instances)
        # Store row results
        scores.append(row_scores)
        clusters.append(row_clusters)
        instances.append(row_instances)
    # Prepare and return values
    ms_axis = pd.Index(min_samples, name='Min_samples')
    eps_axis = pd.Index(eps_values, name='Epsilon')
    df_scores = pd.DataFrame(scores, index=ms_axis, columns=eps_axis)
    df_clusters = pd.DataFrame(clusters, index=ms_axis, columns=eps_axis)
    df_instances = pd.DataFrame(instances, index=ms_axis, columns=eps_axis)
    return df_scores, df_clusters, df_instances

In [26]:
def print_results(m, eps, scores_df, instances_df, clusters_df):
    score = round(scores_df.loc[m][eps], 4)
    instances = instances_df.loc[m][eps]
    clusters = clusters_df.loc[m][eps]
    print(f'DBSCAN using parameters m={m} and eps={eps} yields the next clustering results:')
    print()
    print(f'- Sil. score: {score}')
    print(f'- {instances} clustered instances into {clusters} clusters')
    print(f'- Avg. of {round(instances/clusters, 2)} instances per cluster')

### Latent Features with Euclidean Distance

In [32]:
dfs, dfc, dfi = fit_dbscan_sim_mx(sim_mx_euclid_df, range(2, 6), range(10, 18))
dfs

Epsilon,10,11,12,13,14,15,16,17
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,,0.442674,0.402843,0.322418,0.266275,0.221652,0.169493,0.246924
3,,,0.412054,0.330544,0.260876,0.255033,,
4,,,0.440103,0.370082,0.272411,0.255033,,
5,,,,0.372159,0.272411,0.257415,,


In [36]:
print_results(2, 11, dfs, dfi, dfc)

DBSCAN using parameters m=2 and eps=11 yields the next clustering results:

- Sil. score: 0.4427
- 150 clustered instances into 17 clusters
- Avg. of 8.82 instances per cluster


In [41]:
dbscan_best_params.append([2, 11, 'latent_v1', False, 'euclid', 0.4426, 17, 150])

### Latent Features with Cosine Distance

In [38]:
dfs, dfc, dfi = fit_dbscan_sim_mx(sim_mx_cosine_df, range(2, 10), np.arange(0.15, 0.6, 0.05))
dfs

Epsilon,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,,0.626604,0.641497,0.646465,0.607523,0.557056,0.194113,0.187619,
3,,0.680734,0.641185,0.647261,0.610459,0.57958,0.18353,,
4,,,0.643609,0.653184,0.622712,0.582565,0.388715,,
5,,,0.668864,0.653184,0.622712,0.591958,0.391121,,
6,,,0.665297,0.653827,0.624456,0.592189,0.396046,,
7,,,0.668657,0.659159,0.626595,0.592189,0.396046,,
8,,,,,0.626595,0.60374,0.396046,,
9,,,,,,,0.396046,,


In [39]:
print_results(3, 0.2, dfs, dfi, dfc)

DBSCAN using parameters m=3 and eps=0.2 yields the next clustering results:

- Sil. score: 0.6807
- 144 clustered instances into 11 clusters
- Avg. of 13.09 instances per cluster


In [42]:
dbscan_best_params.append([3, 0.2, 'latent_v1', False, 'cosine', 0.6807, 11, 144])

## Best Results

In [43]:
df = pd.DataFrame(dbscan_best_params,
                  columns=['m', 'e', 'data', 'scaled', 'similarity', 'sscore', 'clusters', 'instances'])
df

Unnamed: 0,m,e,data,scaled,similarity,sscore,clusters,instances
0,2,11.0,latent_v1,False,euclid,0.4426,17,150
1,3,0.2,latent_v1,False,cosine,0.6807,11,144


In [48]:
BEST_PARAMS_FILE_PATH = os.path.join('..', 'results', 'best_incv1_params.csv')
df.to_csv(BEST_PARAMS_FILE_PATH, index=False)