In [1]:
# This is the parameters cell
output_paths = './Output/quickdraw-pca_s4.csv'

In [2]:
import os
os.chdir('..')
os.getcwd()

'/home/eduardo/PhD/Projections/dynamic-projections'

In [3]:
output_paths = output_paths.split(' ')
dataset_id = output_paths[0].split('/')[-1].split('-')[0]
print(output_paths, dataset_id)

['./Output/quickdraw-pca_s4.csv'] quickdraw


In [4]:
import pandas as pd
import numpy as np
import math
import cv2
import re
import glob
from natsort import natsorted
from tqdm import tqdm

In [26]:
def get_md_dists(dataset_path):
    df = pd.read_csv(dataset_path, index_col=0)
    dists = []
    for poly in tqdm(df.values.reshape(len(df), -1, 2)):
        dists_i = []
        for i in range(len(poly)-2):
            dists_i.append(math.sqrt(np.sum(np.square(poly[i] - poly[i+1]))))
        dists.append(np.array(dists_i))
    return np.array(dists), df.index, len(poly)

# get_md_dists(output_ids[0])

In [27]:
image_datasets = ['quickdraw', 'fashion']

def image_dataset_to_array(dataset_path):
    # Convert image to np array
    # Preload images to memory (trying to speed things up)
    all_files = glob.glob('{}*'.format(dataset_path))
    # Gather ids and timestep info    
    max_t = {}
    for f in all_files:
        regex = r".*/{}/(.*-.*)-(.*).png".format(dataset_id)
        match = re.match(regex, f)
        img_id, t = match.groups()
        t = int(t)
        max_t[img_id] = max_t[img_id] if img_id in max_t and max_t[img_id] > t else t   
    
    img_size = 28 * 28  # Pixel count
    n_revisions = max(max_t.values())
    n_items = len(max_t.values())
    vs = np.empty((n_revisions, n_items, img_size))
    
    # Populate vs
    for i, img_id in enumerate(natsorted(max_t)):
        # Copy existing bitmaps to np.array
        for t in range(0, max_t[img_id]):
            img_file = dataset_path + img_id + '-' + str(t) + '.png'
            vs[t][i] = (cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) / 255.).flatten()
        # Replicate last image
        for t in range(max_t[img_id], n_revisions):
            img_file = dataset_path + img_id + '-' + str(max_t[img_id]-1) + '.png'
            vs[t][i] = (cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) / 255.).flatten()
    
    return vs, list(natsorted(max_t)), n_revisions


def tabular_dataset_to_array(dataset_path):
    # Get files with coords and save in an array vs
    all_files = natsorted(glob.glob('{}*'.format(dataset_path)))
    vs = [pd.read_csv(f, index_col=0).values for f in all_files] 
    # Get dataset info 
    df_temp = pd.read_csv(all_files[0], index_col=0)
    n_timesteps = len(all_files)
    return np.array(vs), list(df_temp.index), n_timesteps


def get_nd_dists(dataset_id):
    dists = []
    dataset_path = './Datasets/' + dataset_id + '/'
    # Get the nd data into arrays
    if dataset_id in image_datasets:
        vs, indexes, n_timesteps = image_dataset_to_array(dataset_path)
    else:
        vs, indexes, n_timesteps = tabular_dataset_to_array(dataset_path)
    # Compute dists between 2 nd arrays
    for t in tqdm(range(n_timesteps - 1)):
        v_t = vs[t]
        v_tp1 = vs[t+1]
        dists_t = []
        for a, b in zip(v_t, v_tp1):
            dists_t.append(math.sqrt(np.sum(np.square(a - b))))
        dists.append(np.array(dists_t)) 
    return np.array(dists).T, indexes, n_timesteps

# dists, indexes, n_timesteps = get_nd_dists('quickdraw')

In [28]:
# Compute distances
for output in output_paths:
    dists_nd, indexes, n_timesteps = get_nd_dists(dataset_id)
    dists_md, _, _ = get_md_dists(output)

100%|██████████| 87/87 [00:00<00:00, 146.73it/s]
100%|██████████| 600/600 [00:00<00:00, 1098.24it/s]


In [29]:
print(dists_nd.shape)
print(dists_md.shape)

(600, 87)
(600, 87)


In [34]:
metric_ids = ['stab_pearson', 'stab_spearman', 'stab_kendall', 'stab_kl', 'stab_stress_n', 'stab_stress_s']
metric_results = pd.DataFrame(np.zeros((len(output_paths), len(metric_ids))),
                              index=output_paths, columns=metric_ids)
metric_results = metric_results.reindex(sorted(metric_results.columns), axis=1)
metric_results


Unnamed: 0,stab_kendall,stab_kl,stab_pearson,stab_spearman,stab_stress_n,stab_stress_s
./Output/quickdraw-pca_s4.csv,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# metric_results.loc[output_paths[0], metric_ids[0]] = 10
# metric_results

Unnamed: 0,stab_kendall,stab_kl,stab_pearson,stab_spearman,stab_stress_n,stab_stress_s
./Output/quickdraw-pca_s4.csv,0.0,0.0,10.0,0.0,0.0,0.0
