In [1]:
# This is the parameters cell
# `projection_paths` is one space-separated string of projection CSV paths; a later
# cell splits it on spaces and takes the dataset id from the FIRST path only
# (the file-name part before the first '-').
# projection_paths = './Output/quickdraw-pca_s4.csv'
projection_paths = './Output/gaussians-pca_s4.csv ./Output/gaussians-AE_10f_2f_20ep.csv ./Output/gaussians-dtsne_70p_0-1l.csv'

In [2]:
# The parameters cell provides one whitespace-separated string. Only split when it
# is still a string, so that re-running this cell on the already-split list does
# not fail with "'list' object has no attribute 'split'".
if isinstance(projection_paths, str):
    projection_paths = projection_paths.split()
# Projection files are named '<dataset_id>-<projection details>.csv'; the dataset
# id is taken from the first path only.
dataset_id = projection_paths[0].split('/')[-1].split('-')[0]
print(projection_paths, dataset_id)

['./Output/gaussians-pca_s4.csv', './Output/gaussians-AE_10f_2f_20ep.csv', './Output/gaussians-dtsne_70p_0-1l.csv'] gaussians


In [3]:
import pandas as pd
import numpy as np
import scipy.stats
import math
import cv2
import re
import glob
from natsort import natsorted
from tqdm import tqdm
import os
# Move the working directory one level up; the relative './Output/...' and
# './Datasets/...' paths used in this notebook are resolved from there.
# NOTE(review): not idempotent - re-running this cell moves up one more level.
os.chdir('..')

# Dataset ids that dataset_as_array loads through image_dataset_to_array (PNG
# files); every other dataset id is loaded from CSV files.
IMAGE_DATASETS = ['quickdraw', 'fashion']
# Neighbourhood sizes for the spatial metrics, as fractions of the number of items.
K_VALUES = [.05, .1, .15, .2]

# Stability metrics

In [4]:
def get_projection_as_array(dataset_path):
    """Read a projection CSV into a ``(n_items, n_timesteps, 2)`` coordinate array.

    The first CSV column is used as the item index. The remaining values of each
    row are regrouped into consecutive pairs, one 2-D position per timestep.

    Returns a tuple ``(coords, item_ids, n_timesteps)``.
    """
    frame = pd.read_csv(dataset_path, index_col=0)
    n_items = len(frame)
    # -1 lets numpy infer the number of timesteps from the column count.
    coords = frame.values.reshape(n_items, -1, 2)
    n_timesteps = coords.shape[1]
    item_ids = list(frame.index)
    return coords, item_ids, n_timesteps


def get_md_mov(dataset_path):
    """Compute how far each projected point moves between consecutive timesteps.

    Parameters
    ----------
    dataset_path : str
        Path of a projection CSV, loaded through ``get_projection_as_array``.

    Returns
    -------
    mov : np.ndarray
        Array of shape ``(n_items, n_timesteps - 1)``; ``mov[i, t]`` is the
        Euclidean distance between item ``i`` at timestep ``t`` and ``t + 1``.
    indexes : list
        Item ids, in the row order of ``mov``.
    n_timesteps : int
        Number of timesteps found in the projection file.
    """
    vs, indexes, n_timesteps = get_projection_as_array(dataset_path)
    # Difference between every position and its successor along the time axis,
    # computed for all items at once instead of in a nested Python loop.
    steps = vs[:, :-1] - vs[:, 1:]
    mov = np.sqrt(np.sum(np.square(steps), axis=-1))
    return mov, indexes, n_timesteps


# get_md_mov('./Output/quickdraw-pca_s4.csv')

In [5]:
def _read_flat_grayscale(img_file):
    """Read one image as grayscale, divide by 255 and flatten it to 1-D."""
    img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('Could not read image file: {}'.format(img_file))
    return (img / 255.).flatten()


def image_dataset_to_array(dataset_path):
    """Load an image dataset into a ``(n_revisions, n_items, n_pixels)`` array.

    Files are expected to be named ``<dataset_path><img_id>-<t>.png`` where
    ``img_id`` itself contains a '-' and ``t`` is the integer revision index.
    Items with fewer revisions than the longest item are padded by repeating
    their last available image. Files that do not match the naming pattern
    are ignored.

    Note: the file-name pattern is built from the module-level ``dataset_id``,
    not from ``dataset_path``.

    Parameters
    ----------
    dataset_path : str
        Directory prefix (ending with '/') that contains the PNG files.

    Returns
    -------
    vs : np.ndarray
        Pixel data of shape ``(n_revisions, n_items, 784)``.
    img_ids : list
        Item ids in natural sort order; position ``i`` matches ``vs[:, i]``.
    n_revisions : int
        Highest revision index over all items, plus one.

    Raises
    ------
    ValueError
        If no file matches the naming pattern.
    IOError
        If an expected image file cannot be read.
    """
    all_files = glob.glob('{}*'.format(dataset_path))
    # Compiled once; the dataset id and the '.png' suffix are matched literally.
    regex = re.compile(r".*/{}/(.*-.*)-(.*)\.png".format(re.escape(dataset_id)))

    # Highest revision index seen for every image id.
    max_t = {}
    for f in all_files:
        match = regex.match(f)
        if match is None:
            continue
        img_id, t = match.groups()
        max_t[img_id] = max(int(t), max_t.get(img_id, -1))

    if not max_t:
        raise ValueError('No image files found matching {}*'.format(dataset_path))

    img_size = 28 * 28  # Pixel count; images are assumed to be 28x28
    n_revisions = max(max_t.values()) + 1
    n_items = len(max_t)
    vs = np.empty((n_revisions, n_items, img_size))

    ordered_ids = natsorted(max_t)
    for i, img_id in enumerate(ordered_ids):
        last_t = max_t[img_id]
        # Revision indices run from 0 to last_t INCLUSIVE: last_t is an index,
        # not a count, so the newest image must be loaded as well.
        for t in range(last_t + 1):
            img_file = dataset_path + img_id + '-' + str(t) + '.png'
            vs[t][i] = _read_flat_grayscale(img_file)
        # Pad the remaining timesteps with the last image, without re-reading it.
        vs[last_t + 1:, i] = vs[last_t, i]
    return vs, list(ordered_ids), n_revisions


def tabular_dataset_to_array(dataset_path):
    """Load a tabular dataset (one CSV file per timestep) into a single array.

    Every file whose path starts with ``dataset_path`` is read, in natural sort
    order, with its first column as the index.

    Parameters
    ----------
    dataset_path : str
        Path prefix shared by the CSV files of the dataset.

    Returns
    -------
    vs : np.ndarray
        The stacked values of all files; the first axis is the file (timestep).
    indexes : list
        Row index of the first file.
    n_timesteps : int
        Number of files found.

    Raises
    ------
    ValueError
        If no file matches ``dataset_path``.
    """
    all_files = natsorted(glob.glob('{}*'.format(dataset_path)))
    if not all_files:
        raise ValueError('No dataset files found matching {}*'.format(dataset_path))
    # Read each file exactly once; the first frame also supplies the item ids.
    frames = [pd.read_csv(f, index_col=0) for f in all_files]
    vs = np.array([frame.values for frame in frames])
    return vs, list(frames[0].index), len(all_files)


def dataset_as_array(dataset_path):
    """Load the dataset at ``dataset_path`` with the loader matching its type.

    The choice is made from the module-level ``dataset_id``: ids listed in
    ``IMAGE_DATASETS`` use the image loader, all others the tabular loader.
    Returns whatever the selected loader returns.
    """
    loader = image_dataset_to_array if dataset_id in IMAGE_DATASETS else tabular_dataset_to_array
    return loader(dataset_path)


def get_nd_mov(dataset_id):
    """Compute how far each item moves in the original space between timesteps.

    The dataset is loaded from ``./Datasets/<dataset_id>/`` through
    ``dataset_as_array``.

    NOTE(review): ``dataset_as_array`` picks its loader from the module-level
    ``dataset_id``, not from this parameter - confirm both always agree.

    Returns a tuple ``(mov, indexes, n_timesteps)`` where ``mov`` has shape
    ``(n_items, n_timesteps - 1)`` and ``mov[i, t]`` is the Euclidean distance
    between item ``i`` at timestep ``t`` and at timestep ``t + 1``.
    """
    vs, indexes, n_timesteps = dataset_as_array('./Datasets/' + dataset_id + '/')
    per_step = []
    for t in tqdm(range(n_timesteps - 1)):
        # Pair every item at timestep t with the same item at timestep t + 1.
        pairs = zip(vs[t], vs[t + 1])
        step_dists = [math.sqrt(np.sum(np.square(cur - nxt))) for cur, nxt in pairs]
        per_step.append(np.array(step_dists))
    # per_step is (timestep, item); transpose to (item, timestep).
    return np.array(per_step).T, indexes, n_timesteps

# Example usage: mov_nd, indexes, n_timesteps = get_nd_mov('quickdraw')

In [6]:
# Compute distances
# Per-item movement between consecutive timesteps in the original (nD) data.
mov_nd, indexes, n_timesteps = get_nd_mov(dataset_id)
# The same quantity for every projection, keyed by the projection's CSV path.
mov_md_dict = {}
for p in projection_paths:
    mov, _, _ = get_md_mov(p)
    mov_md_dict[p] = mov
# NOTE(review): `mov` stays bound to the last projection's array after the loop;
# other cells may read it, so keep that in mind when refactoring.

100%|██████████| 9/9 [00:00<00:00, 56.41it/s]
100%|██████████| 2000/2000 [00:00<00:00, 10942.37it/s]
100%|██████████| 2000/2000 [00:00<00:00, 10978.46it/s]
100%|██████████| 2000/2000 [00:00<00:00, 11573.89it/s]


In [7]:
# Both arrays hold one row per item and one column per pair of consecutive timesteps.
print(mov_nd.shape)
# Look the projection up explicitly instead of relying on a loop variable that
# happens to be left over from another cell.
print(mov_md_dict[projection_paths[-1]].shape)

(2000, 9)
(2000, 9)


In [8]:
metric_ids = ['stab_pearson', 'stab_spearman', 'stab_kendall', 'stab_kl', 'stab_stress_n', 'stab_stress_s',
              'spat_knn_5', 'spat_knn_10', 'spat_knn_15', 'spat_knn_20']
# One row per projection and one zero-initialised float column per metric,
# with the columns in alphabetical order.
metric_results = pd.DataFrame(0.0, index=projection_paths, columns=sorted(metric_ids))
# metric_results

In [9]:
%%time

# Flatten the data: every (item, timestep pair) movement becomes one observation.
mov_nd = mov_nd.flatten()
for p in projection_paths:
    mov_md = mov_md_dict[p].flatten()

    # Correlation and divergence metrics.
    # Results are written with a single .loc[row, column] call: the chained form
    # .loc[p]['col'] = value assigns into an intermediate object and is not
    # guaranteed to update metric_results.
    metric_results.loc[p, 'stab_pearson']  = scipy.stats.pearsonr(mov_nd, mov_md)[0]
    metric_results.loc[p, 'stab_spearman'] = scipy.stats.spearmanr(mov_nd, mov_md)[0]
    metric_results.loc[p, 'stab_kendall']  = scipy.stats.kendalltau(mov_nd, mov_md)[0]
    # entropy(pk, qk) is the Kullback-Leibler divergence of the normalised inputs.
    metric_results.loc[p, 'stab_kl']       = scipy.stats.entropy(mov_nd, mov_md)

    # Stress metrics
    # Normalised stress: both movement vectors are scaled by their maximum.
    nd = mov_nd / np.max(mov_nd)
    md = mov_md / np.max(mov_md)
    metric_results.loc[p, 'stab_stress_n'] = np.sum(np.square(nd - md)) / np.sum(np.square(nd))

    # Standardised stress: both movement vectors are converted to z-scores.
    nd = (mov_nd - np.mean(mov_nd)) / np.std(mov_nd)
    md = (mov_md - np.mean(mov_md)) / np.std(mov_md)
    metric_results.loc[p, 'stab_stress_s'] = np.sum(np.square(nd - md)) / np.sum(np.square(nd))

display(metric_results)

Unnamed: 0,spat_knn_10,spat_knn_15,spat_knn_20,spat_knn_5,stab_kendall,stab_kl,stab_pearson,stab_spearman,stab_stress_n,stab_stress_s
./Output/gaussians-pca_s4.csv,0.0,0.0,0.0,0.0,0.790809,0.165721,0.841275,0.945271,0.398367,0.317449
./Output/gaussians-AE_10f_2f_20ep.csv,0.0,0.0,0.0,0.0,0.716211,0.347662,0.724679,0.898326,0.631422,0.550641
./Output/gaussians-dtsne_70p_0-1l.csv,0.0,0.0,0.0,0.0,0.694352,0.23773,0.811766,0.868227,0.478215,0.376469


CPU times: user 92 ms, sys: 0 ns, total: 92 ms
Wall time: 88.7 ms


# Spatial metrics

In [67]:
from sklearn.neighbors import NearestNeighbors

# Original (nD) data, indexed as (timestep, item, feature).
dataset_path = './Datasets/' + dataset_id + '/'
vs_n, indexes, n_revisions = dataset_as_array(dataset_path)
# Projected data; only the second projection path is evaluated here.
# It is loaded as (item, timestep, 2) and reordered to (timestep, item, 2)
# so that it is indexed like vs_n.
vs_m = get_projection_as_array(projection_paths[1])[0]
vs_m = vs_m.transpose(1, 0, 2)

# X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
# nbrs = NearestNeighbors(n_neighbors=, algorithm='ball_tree').fit(X)
# _, indices = nbrs.kneighbors(X)
# indices

In [68]:
# Both arrays are indexed as (timestep, item, feature).
print(vs_n.shape, vs_m.shape, sep='\n')

(10, 2000, 100)
(10, 2000, 2)


In [73]:
# Neighbourhood sizes (in items) for every fraction in K_VALUES. They depend only
# on the number of items, so they are computed once instead of per item/timestep.
n_items = len(indexes)
k_sizes = [int(k_percentage * n_items) for k_percentage in K_VALUES]
max_k = max(k_sizes)

# The timestep count comes from n_revisions, which was loaded together with vs_n,
# rather than from a variable defined by an unrelated earlier cell.
ngbr_preservation = np.zeros((n_revisions, n_items, len(K_VALUES)))
for t in tqdm(range(n_revisions)):
    # Generate list of nearest neighbors for each item in timestep t.
    # Only the largest neighbourhood is queried; smaller ones are prefixes of it.
    # NOTE(review): the query points are the fitted points themselves, so each
    # item is presumably returned as its own nearest neighbour - confirm whether
    # the item itself should be excluded (e.g. kneighbors() without arguments).
    _, nbrs_nd = NearestNeighbors(n_neighbors=max_k, metric='euclidean',
                                  algorithm='ball_tree').fit(vs_n[t]).kneighbors(vs_n[t])
    _, nbrs_md = NearestNeighbors(n_neighbors=max_k, metric='euclidean',
                                  algorithm='kd_tree').fit(vs_m[t]).kneighbors(vs_m[t])

    # Compute neighbor preservation for different values of k for each item:
    # the fraction of the k nD neighbours that are also among the k 2-D neighbours.
    for i in range(n_items):
        for k_index, k in enumerate(k_sizes):
            intersection = np.intersect1d(nbrs_nd[i, :k], nbrs_md[i, :k], assume_unique=True)
            ngbr_preservation[t][i][k_index] = len(intersection) / float(k)

100%|██████████| 10/10 [00:12<00:00,  1.26s/it]


In [74]:
# Sanity check: one preservation score per (timestep, item, k value).
ngbr_preservation.shape

(10, 2000, 4)

In [75]:
# Collapse the (timestep, item, k) scores into one mean score per k: the inner
# call averages over TIME (axis 0), the outer call averages over the items,
# which are on axis 0 once the time axis is gone.
# NOTE(review): this overwrites `ngbr_preservation` with the reduced array, so
# the cell cannot be re-run without recomputing the per-item scores first.
ngbr_preservation = np.average(np.average(ngbr_preservation, axis=0), axis=0)
# We get one value per k
ngbr_preservation

array([0.4549365 , 0.88130275, 0.62597917, 0.52008937])

215