# Clustering Images with SSIM

## Preclustering

### Libraries and Paths

In [87]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [88]:
import sklearn

In [89]:
# CSV file holding the SSIM matrix, located relative to this notebook's directory.
SSIMMX_FILE_PATH = os.path.join(
    '..',
    'matrices',
    'ssim_matrix.csv',
)

### Loading Data

In [90]:
# Read the SSIM matrix from disk; the first CSV column is used as the row index.
data = pd.read_csv(
    SSIMMX_FILE_PATH,
    index_col=0,
)

In [91]:
# Summarize the loaded frame: index/column ranges, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 198 entries, 1222__pool_table__0.9999995.jpg to 691__cheetah__0.99999213.jpg
Columns: 198 entries, 1222__pool_table__0.9999995.jpg to 691__cheetah__0.99999213.jpg
dtypes: float64(198)
memory usage: 307.8+ KB


In [92]:
# Preview the first five rows of the matrix.
data.head(5)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg,...,2417881__zebra__0.9999945.jpg,2417938__banana__0.9999944.jpg,4099__pool_table__0.9999945.jpg,4339__manhole_cover__0.99999416.jpg,4534__viaduct__0.9999877.jpg,4573__barrel__0.9999974.jpg,4673__triumphal_arch__0.9999893.jpg,576__gondola__0.9999993.jpg,577__gondola__0.9999962.jpg,691__cheetah__0.99999213.jpg
1222__pool_table__0.9999995.jpg,1.0,0.087428,0.08782,0.081145,0.145815,0.096092,0.066702,0.10135,0.080253,0.059137,...,0.049728,0.119382,0.104921,0.048117,0.146993,0.096447,0.118272,0.090937,0.084306,0.09883
1328__coil__0.99999607.jpg,0.087428,1.0,0.058606,0.050433,0.082048,0.059261,0.04018,0.06515,0.036844,0.039916,...,0.030346,0.053888,0.065849,0.03114,0.063674,0.052264,0.051138,0.057475,0.047832,0.059281
134__zebra__0.9999949.jpg,0.08782,0.058606,1.0,0.088,0.171845,0.117098,0.104727,0.057158,0.049934,0.08046,...,0.067406,0.101161,0.209282,0.10047,0.1895,0.072776,0.138135,0.130018,0.116317,0.150079
2377471__pizza__0.9999988.jpg,0.081145,0.050433,0.088,1.0,0.147901,0.068128,0.084978,0.051686,0.054412,0.060246,...,0.059747,0.076995,0.098613,0.047133,0.126652,0.066246,0.08794,0.087958,0.104035,0.074231
2377620__zebra__0.9999882.jpg,0.145815,0.082048,0.171845,0.147901,1.0,0.147673,0.145933,0.06956,0.074057,0.081151,...,0.088558,0.131337,0.205603,0.077223,0.241972,0.0914,0.144071,0.15638,0.134694,0.155998


### Preprocessing the Data

In [93]:
# Work on an independent, C-ordered NumPy copy so `data` itself is never modified.
X = np.copy(data.values, order='C')

In [94]:
# Dimensions of the working array.
X.shape

(198, 198)

In [95]:
# Optionally turn the matrix into its complement (1 - value).
# X is always rebuilt from `data` rather than from its own previous value, so
# re-running this cell gives the same result instead of flipping X back.
invert_ssim = True
X = 1 - data.values if invert_ssim else data.values.copy()

## DBSCAN Clustering

In [96]:
# Container for selected DBSCAN parameter sets.
# NOTE(review): nothing is appended to it in the cells visible here — presumably
# it is filled further down; confirm or remove.
dbscan_best_params = list()

In [97]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [98]:
def fit_dbscan(data, min_samples=None, eps_values=None, min_no_clusters=2, max_no_clusters=np.inf, min_clust_instances=20, max_clust_instances=np.inf, metric='euclidean'):
    """Fit DBSCAN for every (min_samples, eps) pair and score each clustering.

    For each pair, DBSCAN is fitted on ``data``, the samples labelled ``-1``
    (noise) are dropped, and the silhouette score of the remaining samples is
    computed when the clustering satisfies the given bounds.

    Parameters
    ----------
    data : array-like
        Feature matrix of shape (n_samples, n_features), or a square
        (n_samples, n_samples) distance matrix when ``metric='precomputed'``.
    min_samples : iterable of int
        Values tried for DBSCAN's ``min_samples``. Required.
    eps_values : iterable of float
        Values tried for DBSCAN's ``eps``. Required.
    min_no_clusters, max_no_clusters : number
        Inclusive bounds on the number of clusters (noise excluded) for a
        clustering to be scored.
    min_clust_instances, max_clust_instances : number
        Inclusive bounds on the number of non-noise samples for a clustering
        to be scored.
    metric : str or callable
        Metric forwarded to both ``DBSCAN`` and ``silhouette_score``.

    Returns
    -------
    (df_scores, df_clusters, df_instances) : tuple of pandas.DataFrame
        Three frames with index ``Min_samples`` and columns ``Epsilon``:
        the silhouette score (NaN where the clustering was not scored), the
        number of clusters, and the number of non-noise samples.

    Raises
    ------
    TypeError
        If ``min_samples`` or ``eps_values`` is not provided.
    """
    if min_samples is None or eps_values is None:
        raise TypeError("fit_dbscan() requires both 'min_samples' and 'eps_values'")

    # One-shot iterables (generators) would be exhausted by the loops below and
    # leave nothing to build the result axes from, so materialize them first.
    if not hasattr(min_samples, '__len__'):
        min_samples = list(min_samples)
    if not hasattr(eps_values, '__len__'):
        eps_values = list(eps_values)

    points = np.asarray(data)
    precomputed = isinstance(metric, str) and metric == 'precomputed'

    grid_shape = (len(min_samples), len(eps_values))
    # NaN is the explicit "not scored" marker: pandas would coerce None to NaN
    # in float columns anyway, and this keeps every column float64.
    scores = np.full(grid_shape, np.nan)
    clusters = np.zeros(grid_shape, dtype=np.int64)
    instances = np.zeros(grid_shape, dtype=np.int64)

    for row, m in enumerate(min_samples):
        for col, e in enumerate(eps_values):
            labels = DBSCAN(min_samples=m, eps=e, metric=metric).fit(points).labels_
            # Keep only non-anomalous instances (all False if everything is noise)
            kept = labels != -1
            kept_labels = labels[kept]
            n_clusters = len(np.unique(kept_labels))  # 0 if all are outliers
            n_instances = len(kept_labels)  # 0 if all are outliers
            clusters[row, col] = n_clusters
            instances[row, col] = n_instances

            within_bounds = (
                min_no_clusters <= n_clusters <= max_no_clusters
                and min_clust_instances <= n_instances <= max_clust_instances
            )
            # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1;
            # outside that range it raises, so such cells stay NaN.
            if not (within_bounds and 2 <= n_clusters < n_instances):
                continue

            if precomputed:
                # A distance matrix must be filtered on both axes.
                subset = points[np.ix_(kept, kept)]
            else:
                # A feature matrix only loses the noise rows; columns are features.
                subset = points[kept]
            scores[row, col] = silhouette_score(subset, kept_labels, metric=metric)

    # Prepare and return values
    ms_axis = pd.Index(min_samples, name='Min_samples')
    eps_axis = pd.Index(eps_values, name='Epsilon')
    df_scores = pd.DataFrame(scores, index=ms_axis, columns=eps_axis)
    df_clusters = pd.DataFrame(clusters, index=ms_axis, columns=eps_axis)
    df_instances = pd.DataFrame(instances, index=ms_axis, columns=eps_axis)
    return df_scores, df_clusters, df_instances

### Unscaled Data (SSIM)

In [99]:
# Grid search on the precomputed matrix X: min_samples 2..20, eps 0.80..0.99 in
# steps of 0.01; a clustering is only scored if it keeps at least 140 samples.
dfs, dfc, dfi = fit_dbscan(
    X,
    min_samples=range(2, 21),
    eps_values=np.arange(0.8, 1.0, 0.01),
    min_clust_instances=140,
    metric='precomputed',
)
# Silhouette scores per (min_samples, eps) pair
dfs

Epsilon,0.80,0.81,0.82,0.83,0.84,0.85,0.86,0.87,0.88,0.89,0.90,0.91,0.92,0.93,0.94,0.95,0.96,0.97,0.98,0.99
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,,,,,,,
10,,,,,,,,,,,,,,,,,,,,
11,,,,,,,,,,,,,,,,,,,,


In [100]:
# Number of clusters (noise excluded) found for each (min_samples, eps) pair.
dfc

Epsilon,0.80,0.81,0.82,0.83,0.84,0.85,0.86,0.87,0.88,0.89,0.90,0.91,0.92,0.93,0.94,0.95,0.96,0.97,0.98,0.99
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
10,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
11,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [101]:
# Number of non-noise instances for each (min_samples, eps) pair.
dfi

Epsilon,0.80,0.81,0.82,0.83,0.84,0.85,0.86,0.87,0.88,0.89,0.90,0.91,0.92,0.93,0.94,0.95,0.96,0.97,0.98,0.99
Min_samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
3,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
4,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
5,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
6,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
7,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
8,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
9,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
10,70,78,84,92,102,112,122,133,143,150,160,169,178,183,187,192,197,198,198,198
11,70,78,84,92,102,112,121,133,143,150,160,169,178,183,187,192,197,198,198,198


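No clear cluster structure emerges when clustering on SSIM: in the results above, DBSCAN places all non-noise images into a single cluster for every `min_samples`/`eps` combination shown, so no silhouette score is computed. Although this metric is useful for querying similar images, it may not be a good choice for discovering shared criteria among images in an unsupervised way.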