In [44]:
import pandas as pd
import numpy as np
import glob
from natsort import natsorted
import cv2
import re
import numpy as np
from sklearn.decomposition import PCA

In [50]:
IMAGE_DATASETS = ['quickdraw', 'fashion']

def image_dataset_to_array(dataset_path):
    # Convert image to np array
    # Preload images to memory (trying to speed things up)
    all_files = glob.glob('{}*'.format(dataset_path))
    # Gather ids and timestep info    
    max_t = {}
    for f in all_files:
        regex = r".*/{}/(.*-.*)-(.*).png".format(dataset_id)
        match = re.match(regex, f)
        img_id, t = match.groups()
        t = int(t)
        max_t[img_id] = max_t[img_id] if img_id in max_t and max_t[img_id] > t else t   
    
    img_size = 28 * 28  # Pixel count
    n_revisions = max(max_t.values()) + 1
    n_items = len(max_t.values())
    vs = np.empty((n_revisions, n_items, img_size))
    
    # Populate vs
    for i, img_id in enumerate(natsorted(max_t)):
        # Copy existing bitmaps to np.array
        for t in range(0, max_t[img_id]):
            img_file = dataset_path + img_id + '-' + str(t) + '.png'
            vs[t][i] = (cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) / 255.).flatten()
        # Replicate last image
        for t in range(max_t[img_id], n_revisions):
            img_file = dataset_path + img_id + '-' + str(max_t[img_id]-1) + '.png'
            vs[t][i] = (cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) / 255.).flatten()    
    return vs, list(natsorted(max_t)), n_revisions


def tabular_dataset_to_array(dataset_path):
    # Get files with coords and save in an array vs
    all_files = natsorted(glob.glob('{}*'.format(dataset_path)))
    vs = [pd.read_csv(f, index_col=0).values for f in all_files] 
    # Get dataset info 
    df_temp = pd.read_csv(all_files[0], index_col=0)
    n_timesteps = len(all_files)
    return np.array(vs), list(df_temp.index), n_timesteps


def dataset_as_array(dataset_path):
    if dataset_id in IMAGE_DATASETS:
        return image_dataset_to_array(dataset_path)
    else:
        return tabular_dataset_to_array(dataset_path)

In [111]:
datasets = !cat datasets.txt

print('dataset_id, avg_intrinsic_dim, avg_sparsity')
for dataset_id in datasets:
    vs, indexes, _ = dataset_as_array('./' + dataset_id + '/')
    n_timesteps, n_observations, n_dimensions = vs.shape
    avg_intrinsic_dim = 0  # averaged over all timesteps
    avg_sparsity = 0  # averaged over all timesteps

    for X in vs:
        pca = PCA()
        pca.fit(X) 
        cumsum = np.cumsum(pca.explained_variance_ratio_)
        avg_intrinsic_dim += (sum(cumsum < 0.95) / n_dimensions) / n_timesteps
        avg_sparsity += np.sum(X == 0) / (X.shape[0] * X.shape[1]) / n_timesteps

    print(dataset_id, avg_intrinsic_dim, avg_sparsity)

    

dataset_id, avg_intrinsic_dim, avg_sparsity
cartolastd 0.6470588235294117 0.0
cifar10cnn 0.6599999999999997 0.0
esc50 0.03457754629629635 0.0
fashion 0.47627551020408165 0.29714502551020405
gaussians 0.36800000000000005 0.0
nnset 0.005753820735233375 0.00012391573729863693
qtables 0.007750000000000007 0.0007428240740740739
quickdraw 0.4309504700756699 0.9013996455323707
sorts 0.35051020408163236 0.010068877551020409
walk 0.47839999999999994 0.00019999999999999998
