In [1]:
# This is the parameters cell
# projection_paths = './Output/quickdraw-pca_s4.csv'
projection_paths = './Output/gaussians-pca_s4.csv ./Output/gaussians-AE_10f_2f_20ep.csv ./Output/gaussians-dtsne_70p_0-1l.csv'

In [2]:
projection_paths = projection_paths.split(' ')
dataset_id = projection_paths[0].split('/')[-1].split('-')[0]
print(projection_paths, dataset_id)

['./Output/gaussians-pca_s4.csv', './Output/gaussians-AE_10f_2f_20ep.csv', './Output/gaussians-dtsne_70p_0-1l.csv'] gaussians


In [3]:
import pandas as pd
import numpy as np
import scipy.stats
import math
import cv2
import re
import glob
from natsort import natsorted
from tqdm import tqdm
import os
os.chdir('..')

IMAGE_DATASETS = ['quickdraw', 'fashion']

# Stability metrics

In [4]:
def get_md_dists(dataset_path):
    df = pd.read_csv(dataset_path, index_col=0)
    dists = []
    for poly in tqdm(df.values.reshape(len(df), -1, 2)):
        dists_i = []
        for i in range(len(poly)-1):
            dists_i.append(math.sqrt(np.sum(np.square(poly[i] - poly[i+1]))))
        dists.append(np.array(dists_i))
    return np.array(dists), df.index, len(poly)

get_md_dists('./Output/quickdraw-pca_s4.csv')[0].shape

100%|██████████| 600/600 [00:00<00:00, 1231.06it/s]


(600, 88)

In [5]:
def image_dataset_to_array(dataset_path):
    # Convert image to np array
    # Preload images to memory (trying to speed things up)
    all_files = glob.glob('{}*'.format(dataset_path))
    # Gather ids and timestep info    
    max_t = {}
    for f in all_files:
        regex = r".*/{}/(.*-.*)-(.*).png".format(dataset_id)
        match = re.match(regex, f)
        img_id, t = match.groups()
        t = int(t)
        max_t[img_id] = max_t[img_id] if img_id in max_t and max_t[img_id] > t else t   
    
    img_size = 28 * 28  # Pixel count
    n_revisions = max(max_t.values()) + 1
    n_items = len(max_t.values())
    vs = np.empty((n_revisions, n_items, img_size))
    
    # Populate vs
    for i, img_id in enumerate(natsorted(max_t)):
        # Copy existing bitmaps to np.array
        for t in range(0, max_t[img_id]):
            img_file = dataset_path + img_id + '-' + str(t) + '.png'
            vs[t][i] = (cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) / 255.).flatten()
        # Replicate last image
        for t in range(max_t[img_id], n_revisions):
            img_file = dataset_path + img_id + '-' + str(max_t[img_id]-1) + '.png'
            vs[t][i] = (cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) / 255.).flatten()
    
    return vs, list(natsorted(max_t)), n_revisions


def tabular_dataset_to_array(dataset_path):
    # Get files with coords and save in an array vs
    all_files = natsorted(glob.glob('{}*'.format(dataset_path)))
    vs = [pd.read_csv(f, index_col=0).values for f in all_files] 
    # Get dataset info 
    df_temp = pd.read_csv(all_files[0], index_col=0)
    n_timesteps = len(all_files)
    return np.array(vs), list(df_temp.index), n_timesteps


def get_nd_dists(dataset_id):
    dists = []
    dataset_path = './Datasets/' + dataset_id + '/'
    # Get the nd data into arrays
    if dataset_id in IMAGE_DATASETS:
        vs, indexes, n_timesteps = image_dataset_to_array(dataset_path)
    else:
        vs, indexes, n_timesteps = tabular_dataset_to_array(dataset_path)
    # Compute dists between 2 nd arrays
    for t in tqdm(range(n_timesteps - 1)):
        v_t = vs[t]
        v_tp1 = vs[t+1]
        dists_t = []
        for a, b in zip(v_t, v_tp1):
            dists_t.append(math.sqrt(np.sum(np.square(a - b))))
        dists.append(np.array(dists_t)) 
    return np.array(dists).T, indexes, n_timesteps

# dists, indexes, n_timesteps = get_nd_dists('quickdraw')

In [6]:
# Compute distances
dists_nd, indexes, n_timesteps = get_nd_dists(dataset_id)
dists_md_dict = {}
for p in projection_paths:
    dists, _, _ = get_md_dists(p)
    dists_md_dict[p] = dists

100%|██████████| 9/9 [00:00<00:00, 63.15it/s]
100%|██████████| 2000/2000 [00:00<00:00, 10440.51it/s]
100%|██████████| 2000/2000 [00:00<00:00, 10012.41it/s]
100%|██████████| 2000/2000 [00:00<00:00, 10585.38it/s]


In [7]:
print(dists_nd.shape)
print(dists.shape)

(2000, 9)
(2000, 9)


In [11]:
metric_ids = ['stab_pearson', 'stab_spearman', 'stab_kendall', 'stab_kl', 'stab_stress_n', 'stab_stress_s',
              'spat_knn_5', 'spat_knn_10', 'spat_knn_15', 'spat_knn_20']
metric_results = pd.DataFrame(np.zeros((len(projection_paths), len(metric_ids))),
                              index=projection_paths, columns=metric_ids)
metric_results = metric_results.reindex(sorted(metric_results.columns), axis=1)
# metric_results

In [12]:
%%time

# Flatten the data
dists_nd = dists_nd.flatten()
for p in projection_paths:
    dists_md = dists_md_dict[p].flatten()

    # Correlation and divergence metrics
    metric_results.loc[p]['stab_pearson']  = scipy.stats.pearsonr(dists_nd, dists_md)[0]
    metric_results.loc[p]['stab_spearman'] = scipy.stats.spearmanr(dists_nd, dists_md)[0]
    metric_results.loc[p]['stab_kendall']  = scipy.stats.kendalltau(dists_nd, dists_md)[0]
    metric_results.loc[p]['stab_kl']       = scipy.stats.entropy(dists_nd, dists_md)

    # Stress metrics
    nd = dists_nd / max(dists_nd)
    md = dists_md / max(dists_md)
    metric_results.loc[p]['stab_stress_n'] = np.sum(np.square(nd - md)) / np.sum(np.square(nd))

    nd = (dists_nd - np.mean(dists_nd)) / np.std(dists_nd)
    md = (dists_md - np.mean(dists_md)) / np.std(dists_md)
    metric_results.loc[p]['stab_stress_s'] = np.sum(np.square(nd - md)) / np.sum(np.square(nd))

display(metric_results)

Unnamed: 0,spat_knn_10,spat_knn_15,spat_knn_20,spat_knn_5,stab_kendall,stab_kl,stab_pearson,stab_spearman,stab_stress_n,stab_stress_s
./Output/gaussians-pca_s4.csv,0.0,0.0,0.0,0.0,0.790809,0.165721,0.841275,0.945271,0.398367,0.317449
./Output/gaussians-AE_10f_2f_20ep.csv,0.0,0.0,0.0,0.0,0.716211,0.347662,0.724679,0.898326,0.631422,0.550641
./Output/gaussians-dtsne_70p_0-1l.csv,0.0,0.0,0.0,0.0,0.694352,0.23773,0.811766,0.868227,0.478215,0.376469


CPU times: user 80 ms, sys: 0 ns, total: 80 ms
Wall time: 81.8 ms


# Spatial metrics

In [13]:
from sklearn.neighbors import NearestNeighbors
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
_, indices = nbrs.kneighbors(X)
indices

array([[0, 1],
       [1, 0],
       [2, 1],
       [3, 4],
       [4, 3],
       [5, 4]])