This code ingests:

- A `.npy` file of images or features to embed.
- Optionally, a `.npy` file of corresponding labels/classifications for the images or features.

And outputs:

- A set of point clouds in `.npy` files, along with `colors.npy` based on the standardization approach [here](https://github.com/kylemcdonald/Coloring-t-SNE), and plots of the point clouds in a `plots/` folder.

One way to improve this is to first run UMAP with the standard parameters, and then use that embedding as the initialization for the remaining runs. This would encourage the other embeddings to stay close to one another in terms of interpolation distance, and might even speed up the optimization process.

In [23]:
import umap
import numpy as np
import matplotlib.pyplot as plt
from utils.progress import *
from utils.rainbow import *
from time import time
import os

def format_time(seconds):
    """Render a duration given in seconds as a short human-readable string.

    Durations of a minute or more are shown as '<M>min <S>s' with the
    seconds truncated to an integer. Shorter durations above 0.01s are
    shown with two decimals; anything else is printed unformatted.
    """
    whole_minutes = int(seconds / 60)
    remainder = seconds - 60 * whole_minutes
    if whole_minutes > 0:
        return f'{whole_minutes}min {int(remainder)}s'
    if remainder > 0.01:
        return f'{remainder:2.2f}s'
    # too small for two decimals to be meaningful: print the raw value
    return f'{remainder}s'
    
def plot_tsne(xy, colors=None, alpha=0.25, figsize=(6,6), s=0.5, cmap='hsv', filename=None):
    """Scatter-plot a 2D point cloud with no axes, then show it or save it.

    Args:
        xy: array indexed as xy[:, 0] (x coordinates) and xy[:, 1] (y coordinates).
        colors: forwarded to scatter as `c`; None uses matplotlib's default color.
        alpha: marker opacity.
        figsize: figure size forwarded to plt.figure.
        s: marker size.
        cmap: colormap used to map `colors` to marker colors.
        filename: if None the figure is shown interactively; otherwise it is
            saved to this path (parent directories are created as needed)
            and the figure is closed afterwards.
    """
    figure = plt.figure(figsize=figsize, facecolor='white')
    plt.margins(0)
    plt.axis('off')
    points = plt.scatter(xy[:,0], xy[:,1],
                c=colors, # set colors of markers
                cmap=cmap, # set color map of markers
                alpha=alpha, # set alpha of markers
                marker=',', # use smallest available marker (square)
                s=s, # set marker size. single pixel is 0.5 on retina, 1.0 otherwise
                lw=0, # don't use edges
                edgecolor='none') # 'none' is the valid "no edge" color; '' is rejected
    # remove all axes and whitespace / borders
    points.axes.get_xaxis().set_visible(False)
    points.axes.get_yaxis().set_visible(False)
    if filename is None:
        plt.show()
        return
    try:
        out_dir = os.path.dirname(filename)
        # a bare filename has no directory part, and os.makedirs('') raises
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(filename)
    finally:
        # release the figure so repeated calls do not accumulate open figures
        plt.close(figure)
    
def standardize(data):
    """Return a float32 copy of `data` with each column centered and scaled.

    The mean and standard deviation are taken along axis 0. Columns whose
    standard deviation is exactly zero are divided by 1 instead, so they
    come out as zeros rather than NaN. The input is not modified.
    """
    scaled = np.copy(data).astype(np.float32)
    np.subtract(scaled, scaled.mean(axis=0), out=scaled)
    spread = scaled.std(axis=0)
    # constant columns: avoid dividing by zero
    spread[spread == 0] = 1
    np.divide(scaled, spread, out=scaled)
    return scaled

def job(task):
    """Run one UMAP configuration, save the embedding and its plot.

    `task` is a 7-tuple:
    (data, argmax, output_dir, n_neighbors, min_dist, y, target_metric).
    When `y` is None the embedding is unsupervised; otherwise `y` is passed
    to fit_transform and `target_metric` to the UMAP constructor.

    The embedding is written to '<output_dir>/<min_dist>_<n_neighbors>.npy'
    and plotted, colored by `argmax`, to '<output_dir>/plots/<same name>.png'.

    Returns a '<config>: <duration>' string, where the duration covers only
    constructing and fitting UMAP.
    """
    data, argmax, output_dir, n_neighbors, min_dist, y, target_metric = task
    config = f'{min_dist:.3f}_{n_neighbors:02d}'
    npy_path = os.path.join(output_dir, config + '.npy')
    png_dir = os.path.join(output_dir, 'plots')
    png_path = os.path.join(png_dir, config + '.png')

    supervised = y is not None
    umap_kwargs = {'min_dist': min_dist, 'n_neighbors': n_neighbors}
    if supervised:
        umap_kwargs['target_metric'] = target_metric

    started = time()
    embedder = umap.UMAP(**umap_kwargs)
    if supervised:
        embedding = embedder.fit_transform(data, y)
    else:
        embedding = embedder.fit_transform(data)
    elapsed = time() - started

    np.save(npy_path, embedding)
    os.makedirs(png_dir, exist_ok=True)
    plot_tsne(embedding, argmax, filename=png_path)
    return f'{config}: {format_time(elapsed)}'

def create_embedding(input_fn, output_dir,
                     n_neighbors_opt=(2, 3, 5),
                     min_dist_opt=(0.001, 0.01, 0.1),
                     supervision_fn=None, target_metric='categorical'):
    """Embed one dataset with UMAP over a grid of hyperparameters.

    Args:
        input_fn: path to a .npy file; each item along the first axis is
            flattened into one feature vector.
        output_dir: directory that receives 'colors.npy' and the per-run
            outputs written by `job`. Created if missing.
        n_neighbors_opt: iterable of `n_neighbors` values to try.
        min_dist_opt: iterable of `min_dist` values to try.
        supervision_fn: optional path to a .npy file of labels, one per
            item in `input_fn`; enables supervised UMAP.
        target_metric: forwarded to UMAP when labels are given.

    Raises:
        ValueError: if the labels and the data have different lengths.
    """
    data = np.load(input_fn)
    data = data.reshape(len(data), -1)

    y = None
    if supervision_fn is not None:
        y = np.load(supervision_fn)
        # fail here with a clear message instead of inside every parallel job
        if len(y) != len(data):
            raise ValueError(
                f'{supervision_fn} has {len(y)} labels but '
                f'{input_fn} has {len(data)} samples')

    # color each sample by the index of its largest standardized feature
    argmax = np.argmax(standardize(data), axis=1)
    colors = to_rainbow(argmax)
    os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, 'colors.npy'), colors)

    # one task per (n_neighbors, min_dist) pair, n_neighbors varying slowest
    tasks = [
        (data, argmax, output_dir, n_neighbors, min_dist, y, target_metric)
        for n_neighbors in n_neighbors_opt
        for min_dist in min_dist_opt
    ]

    # progress_parallel comes from utils.progress; presumably it maps `job`
    # over the tasks in parallel and returns the results. TODO confirm
    timing = progress_parallel(job, tasks)
    print(input_fn, output_dir)
    print(timing)

In [None]:
# Labels shared by every supervised run. Defined once so that all runs use
# the same '../data' root as the inputs; the detectron and vgg runs
# previously pointed at 'data/analysis/...' without the '../' prefix.
categories_fn = '../data/analysis/face_counts/categories.npy'

# depth images
create_embedding(
    '../data/depth/npy32/images.npy',
    '../data/embeddings/depth')

# saliency images
create_embedding(
    '../data/saliency/npy32/images.npy',
    '../data/embeddings/saliency')

# openface
create_embedding(
    '../data/openface/npy32/descriptors.npy',
    '../data/embeddings/openface')
create_embedding(
    '../data/openface/npy32-ellipses/images.npy',
    '../data/embeddings/openface-ellipses')

# detectron
create_embedding(
    '../data/detectron/npy32/images.npy',
    '../data/embeddings/detectron')
create_embedding(
    '../data/detectron/npy32/images.npy',
    '../data/embeddings/detectron-supervised',
    supervision_fn=categories_fn)

# vgg features
create_embedding(
    '../data/dcnn/vgg/features_canonical.npy',
    '../data/embeddings/vgg-features')
create_embedding(
    '../data/dcnn/vgg/features_canonical.npy',
    '../data/embeddings/vgg-features-supervised',
    supervision_fn=categories_fn)

# inception features
create_embedding(
    '../data/dcnn/inceptionv3/features_canonical.npy',
    '../data/embeddings/inceptionv3-features')
create_embedding(
    '../data/dcnn/inceptionv3/features_canonical.npy',
    '../data/embeddings/inceptionv3-features-supervised',
    supervision_fn=categories_fn)

# inception predictions
create_embedding(
    '../data/dcnn/inceptionv3/predictions_canonical.npy',
    '../data/embeddings/inceptionv3-predictions')
create_embedding(
    '../data/dcnn/inceptionv3/predictions_canonical.npy',
    '../data/embeddings/inceptionv3-predictions-supervised',
    supervision_fn=categories_fn)