#### Outline
- for each dataset: 
    - load dataset; 
    - for each network: 
        - load network
        - project the 1000 test-set samples (timed over 10 repeats)
        - append each timing to the `projection_speeds` dataframe

In [1]:
# Load IPython's autoreload extension and set mode 2, which re-imports all
# modules before each cell runs so edits to local packages take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Choose GPU (this may not be needed on your computer)

In [2]:
# Order CUDA devices by PCI bus id, then restrict which devices are visible.
# NOTE(review): %env does not strip quotes, so CUDA_VISIBLE_DEVICES is set to
# the literal two-character string '' (see the echoed value in the output).
# That presumably hides every GPU, in which case the "network" timings below
# would also be CPU timings, the same as "network-cpu" -- confirm intended.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=''

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=''


In [3]:
import numpy as np
import pickle
import pandas as pd
import time
from umap import UMAP

In [4]:
from tfumap.umap import tfUMAP
import tensorflow as tf
from sklearn.decomposition import PCA
from openTSNE import TSNE



In [5]:
from tqdm.autonotebook import tqdm

In [6]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

In [7]:
# Root folder under MODEL_DIR from which the saved encoders are loaded below.
output_dir = MODEL_DIR.joinpath('projections')

In [8]:
# Empty results table; every timing loop below appends one row per repeat.
speed_columns = ['method_', 'dimensions', 'dataset', 'speed']
projection_speeds = pd.DataFrame(columns=speed_columns)

### cassins

In [9]:
# `dataset` names the model sub-folder and the output pickle; `dims` is handed
# to tfUMAP as its `dims` argument (presumably the per-sample input shape --
# confirm against tfumap).
dataset, dims = 'cassins_dtw', (32, 31, 1)

##### load dataset

In [10]:
from sklearn.preprocessing import OrdinalEncoder
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

# Read the pickled syllable table; the columns used below are `labels` and
# `spectrogram`.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
syllable_df = pd.read_pickle(DATA_DIR / 'cassins' / 'cassins.pickle')

# One-row frame whose columns are the 20 most frequent labels (count per
# label, sorted descending, truncated to 20).
top_labels = (
    pd.DataFrame(
        {i: [np.sum(syllable_df.labels.values == i)] for i in syllable_df.labels.unique()}
    )
    .T.sort_values(by=0, ascending=False)[:20]
    .T
)

# Keep only rows carrying one of those labels and renumber them 0..n-1 so the
# label-based `.loc` slices below address rows by position.
# (`sylllable_df`, with three l's, is the filtered frame; the spelling is kept
# so any cell that refers to it keeps working.)
sylllable_df = syllable_df[syllable_df.labels.isin(top_labels.columns)].reset_index()

specs = np.array(list(sylllable_df.spectrogram.values))

# Split: rows 0-999 -> 'valid', rows 1000-1999 -> 'test', the rest -> 'train'.
# `.loc` slices include BOTH endpoints, so the bounds are written without
# overlap; the previous `:1000` slice tagged row 1000 as 'valid' only for the
# next statement to overwrite it with 'test'.
sylllable_df['subset'] = 'train'
sylllable_df.loc[:999, 'subset'] = 'valid'
sylllable_df.loc[1000:1999, 'subset'] = 'test'


def _subset_values(column, subset):
    """Return the entries of `column` for rows tagged `subset`, stacked into one array."""
    in_subset = (sylllable_df.subset == subset).values
    return np.array(list(sylllable_df[column].values[in_subset]))


Y_train = _subset_values('labels', 'train')
Y_valid = _subset_values('labels', 'valid')
Y_test = _subset_values('labels', 'test')

X_train = _subset_values('spectrogram', 'train')
X_valid = _subset_values('spectrogram', 'valid')
X_test = _subset_values('spectrogram', 'test')

# Flatten every training sample to a single feature vector.
# `np.prod` replaces `np.product`, which was removed in NumPy 2.0.
X_train_flat = X_train.reshape((len(X_train), np.prod(np.shape(X_train)[1:])))

# Map the training labels to consecutive integer codes. Only Y_train is
# encoded; Y_valid / Y_test keep their raw label values.
enc = OrdinalEncoder()
Y_train = enc.fit_transform(Y_train.reshape(-1, 1)).astype('int').flatten()

In [11]:
# Flatten every test sample to a single feature vector; this is the array all
# timing loops below project.
# `np.prod` replaces `np.product`, which was removed in NumPy 2.0.
X_test_flat = X_test.reshape((len(X_test), np.prod(np.shape(X_test)[1:])))


#### Network 

##### 2 dims

In [12]:
# Folder holding the saved model for this dataset (2-d section).
load_loc = output_dir.joinpath(dataset, 'network')

In [13]:
# Fresh tfUMAP wrapper; its `encoder` attribute is replaced with the saved
# model in the following cell, so no training happens here.
embedder = tfUMAP(
    dims=dims,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    verbose=True,
    direct_embedding=False,
)

In [14]:
# Load the saved Keras encoder from <load_loc>/encoder and attach it to the
# tfUMAP wrapper.
encoder_path = load_loc.joinpath('encoder').as_posix()
encoder = tf.keras.models.load_model(encoder_path)
embedder.encoder = encoder

In [15]:
# Time 10 consecutive projections of the flattened test set. Each duration is
# printed, kept in `times`, and appended to `projection_speeds` as a
# ('network', 2) row.
n_repeats = 10
times = []
for _ in tqdm(range(n_repeats)):
    tic = time.monotonic()
    embedder.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['network', 2, dataset, elapsed]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.4083692089188844
seconds:  0.12363753491081297
seconds:  0.12229928700253367
seconds:  0.12011320306919515
seconds:  0.120315927779302
seconds:  0.12198338704183698
seconds:  0.12279084115289152
seconds:  0.11824482004158199
seconds:  0.1201864848844707
seconds:  0.12314805085770786



In [16]:
##### Network CPU

# Same benchmark as the previous cell, but executed inside a
# tf.device('/CPU:0') scope and recorded as ('network-cpu', 2).
with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for _ in tqdm(range(n_repeats)):
        tic = time.monotonic()
        embedder.transform(X_test_flat)
        elapsed = time.monotonic() - tic
        print('seconds: ', elapsed)
        times.append(elapsed)
        row = ['network-cpu', 2, dataset, elapsed]
        projection_speeds.loc[len(projection_speeds)] = row

Unnamed: 0,method_,dimensions,dataset,speed
0,network,2,cassins_dtw,0.408369
1,network,2,cassins_dtw,0.123638
2,network,2,cassins_dtw,0.122299
3,network,2,cassins_dtw,0.120113
4,network,2,cassins_dtw,0.120316
5,network,2,cassins_dtw,0.121983
6,network,2,cassins_dtw,0.122791
7,network,2,cassins_dtw,0.118245
8,network,2,cassins_dtw,0.120186
9,network,2,cassins_dtw,0.123148


##### 64 dims

In [17]:
# Folder holding the saved model for this dataset (64-d section).
load_loc = output_dir.joinpath(dataset, "64", 'network')

In [18]:
# Fresh tfUMAP wrapper for the 64-d section; its `encoder` attribute is
# replaced with the saved model in the following cell.
embedder = tfUMAP(
    dims=dims,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    verbose=True,
    direct_embedding=False,
)

In [19]:
# Load the saved Keras encoder from <load_loc>/encoder and attach it to the
# tfUMAP wrapper.
encoder_path = load_loc.joinpath('encoder').as_posix()
encoder = tf.keras.models.load_model(encoder_path)
embedder.encoder = encoder

In [20]:
# Time 10 consecutive projections of the flattened test set. Each duration is
# printed, kept in `times`, and appended to `projection_speeds` as a
# ('network', 64) row.
n_repeats = 10
times = []
for _ in tqdm(range(n_repeats)):
    tic = time.monotonic()
    embedder.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['network', 64, dataset, elapsed]
    projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.28468904201872647
seconds:  0.13192583504132926
seconds:  0.13137854798696935
seconds:  0.127000343054533
seconds:  0.12589784990996122
seconds:  0.12462793290615082
seconds:  0.12494157301262021
seconds:  0.13032097788527608
seconds:  0.12635245407000184
seconds:  0.12519181985408068



In [None]:
# NOTE(review): this cell held an unfinished assignment (`z =`), which is a
# SyntaxError and aborts "Restart Kernel -> Run All". No cell shown in this
# notebook reads `z` before the PCA section assigns it, so the dangling
# statement has been dropped. Delete this cell once confirmed.

In [21]:
##### Network CPU

# Same benchmark as the 64-d network cell above, but executed inside a
# tf.device('/CPU:0') scope and recorded as ('network-cpu', 64).
with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for _ in tqdm(range(n_repeats)):
        tic = time.monotonic()
        embedder.transform(X_test_flat)
        elapsed = time.monotonic() - tic
        print('seconds: ', elapsed)
        times.append(elapsed)
        row = ['network-cpu', 64, dataset, elapsed]
        projection_speeds.loc[len(projection_speeds)] = row

Unnamed: 0,method_,dimensions,dataset,speed
0,network,2,cassins_dtw,0.408369
1,network,2,cassins_dtw,0.123638
2,network,2,cassins_dtw,0.122299
3,network,2,cassins_dtw,0.120113
4,network,2,cassins_dtw,0.120316
5,network,2,cassins_dtw,0.121983
6,network,2,cassins_dtw,0.122791
7,network,2,cassins_dtw,0.118245
8,network,2,cassins_dtw,0.120186
9,network,2,cassins_dtw,0.123148


#### UMAP-learn

##### 2 dims

In [22]:
# Fit umap-learn with 2 output components on the flattened training set.
# This rebinds `embedder` (previously the tfUMAP wrapper); the timing cell
# below calls `.transform` on this fitted model.
# `z_umap` holds the training embedding and is not read by any cell shown here.
embedder = UMAP(n_components = 2, verbose=True)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, verbose=True)
Construct fuzzy simplicial set
Wed Jul 15 15:43:51 2020 Finding Nearest Neighbors
Wed Jul 15 15:43:51 2020 Building RP forest with 13 trees
Wed Jul 15 15:43:52 2020 parallel NN descent for 15 iterations
	 0  /  15
	 1  /  15
	 2  /  15
	 3  /  15
Wed Jul 15 15:44:01 2020 Finished Nearest Neighbor Search
Wed Jul 15 15:44:04 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Jul 15 15:44:28 2020 Finished embedding


In [23]:
# Time 10 consecutive `transform` calls of the fitted umap-learn model on the
# flattened test set. Each duration is printed, kept in `times`, and appended
# to `projection_speeds` as a ('umap-learn', 2) row.
n_repeats = 10
times = []
for _ in tqdm(range(n_repeats)):
    tic = time.monotonic()
    embedder.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['umap-learn', 2, dataset, elapsed]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  15.119369002990425
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.0272782898973674
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.2802738298196346
	completed  0  /

In [24]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed
0,network,2,cassins_dtw,0.408369
1,network,2,cassins_dtw,0.123638
2,network,2,cassins_dtw,0.122299
3,network,2,cassins_dtw,0.120113
4,network,2,cassins_dtw,0.120316
5,network,2,cassins_dtw,0.121983
6,network,2,cassins_dtw,0.122791
7,network,2,cassins_dtw,0.118245
8,network,2,cassins_dtw,0.120186
9,network,2,cassins_dtw,0.123148


##### 64 dims

In [25]:
# Fit umap-learn with 64 output components on the flattened training set.
# This rebinds `embedder`; the timing cell below calls `.transform` on this
# fitted model.
# `z_umap` holds the training embedding and is not read by any cell shown here.
embedder = UMAP(n_components = 64, verbose=True)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, n_components=64, verbose=True)
Construct fuzzy simplicial set
Wed Jul 15 15:45:13 2020 Finding Nearest Neighbors
Wed Jul 15 15:45:13 2020 Building RP forest with 13 trees
Wed Jul 15 15:45:14 2020 parallel NN descent for 15 iterations
	 0  /  15
	 1  /  15
	 2  /  15
	 3  /  15
Wed Jul 15 15:45:15 2020 Finished Nearest Neighbor Search
Wed Jul 15 15:45:15 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Jul 15 15:45:46 2020 Finished embedding


In [26]:
# Time 10 consecutive `transform` calls of the fitted umap-learn model on the
# flattened test set. Each duration is printed, kept in `times`, and appended
# to `projection_speeds` as a ('umap-learn', 64) row.
n_repeats = 10
times = []
for _ in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    elapsed = time.monotonic() - start_time
    print('seconds: ', elapsed)
    # This loop was the only one that never filled `times`; record the run so
    # the list matches every other benchmark cell.
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 64, dataset, elapsed]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.5276016800198704
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.345431795110926
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.1918126849923283
	completed  0  / 

In [27]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed
0,network,2,cassins_dtw,0.408369
1,network,2,cassins_dtw,0.123638
2,network,2,cassins_dtw,0.122299
3,network,2,cassins_dtw,0.120113
4,network,2,cassins_dtw,0.120316
5,network,2,cassins_dtw,0.121983
6,network,2,cassins_dtw,0.122791
7,network,2,cassins_dtw,0.118245
8,network,2,cassins_dtw,0.120186
9,network,2,cassins_dtw,0.123148


#### PCA

##### 2 dims

In [28]:
# Fit a 2-component PCA on the flattened training set; the timing cell below
# calls `pca.transform` on the test set.
# `z` holds the training projection and is not read by any cell shown here.
pca = PCA(n_components=2)
z = pca.fit_transform(X_train_flat)

In [29]:
# Time 10 consecutive `pca.transform` calls on the flattened test set. Each
# duration is printed, kept in `times`, and appended to `projection_speeds`
# as a ('pca', 2) row.
n_repeats = 10
times = []
for _ in tqdm(range(n_repeats)):
    tic = time.monotonic()
    pca.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['pca', 2, dataset, elapsed]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.0023729591630399227
seconds:  0.0023731491528451443
seconds:  0.0023201669100672007
seconds:  0.002356609096750617
seconds:  0.0036371760070323944
seconds:  0.00390038313344121
seconds:  0.0038879928179085255
seconds:  0.003882613033056259
seconds:  0.003911643056198955
seconds:  0.00393049418926239



In [30]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed
0,network,2,cassins_dtw,0.408369
1,network,2,cassins_dtw,0.123638
2,network,2,cassins_dtw,0.122299
3,network,2,cassins_dtw,0.120113
4,network,2,cassins_dtw,0.120316
5,network,2,cassins_dtw,0.121983
6,network,2,cassins_dtw,0.122791
7,network,2,cassins_dtw,0.118245
8,network,2,cassins_dtw,0.120186
9,network,2,cassins_dtw,0.123148


##### 64 dims

In [31]:
# Fit a 64-component PCA on the flattened training set; the timing cell below
# calls `pca.transform` on the test set.
# `z` holds the training projection and is not read by any cell shown here.
pca = PCA(n_components=64)
z = pca.fit_transform(X_train_flat)

In [32]:
# Time 10 consecutive `pca.transform` calls on the flattened test set. Each
# duration is printed, kept in `times`, and appended to `projection_speeds`
# as a ('pca', 64) row.
n_repeats = 10
times = []
for _ in tqdm(range(n_repeats)):
    tic = time.monotonic()
    pca.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['pca', 64, dataset, elapsed]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.0026153959333896637
seconds:  0.002987637184560299
seconds:  0.0027679698541760445
seconds:  0.0027762099634855986
seconds:  0.0027665398083627224
seconds:  0.002775039989501238
seconds:  0.002803341019898653
seconds:  0.002764059929177165
seconds:  0.0028021507896482944
seconds:  0.002757859183475375



In [33]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed
0,network,2,cassins_dtw,0.408369
1,network,2,cassins_dtw,0.123638
2,network,2,cassins_dtw,0.122299
3,network,2,cassins_dtw,0.120113
4,network,2,cassins_dtw,0.120316
5,network,2,cassins_dtw,0.121983
6,network,2,cassins_dtw,0.122791
7,network,2,cassins_dtw,0.118245
8,network,2,cassins_dtw,0.120186
9,network,2,cassins_dtw,0.123148


#### TSNE

##### 2 dims

In [34]:
# openTSNE estimator for a 2-component embedding.
# NOTE(review): n_jobs=32 is hard-coded, presumably to match the benchmark
# machine's core count -- confirm before running elsewhere.
tsne = TSNE(verbose=True, n_jobs=32, n_components=2)

In [35]:
# Fit openTSNE on the flattened training set. The object returned by `fit`
# (not `tsne` itself) is what the timing cell below calls `.transform` on.
embedding_train = tsne.fit(X_train_flat)

--------------------------------------------------------------------------------
TSNE(n_jobs=32, neighbors=None, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...




   --> Time elapsed: 18.82 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.25 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.29 seconds
===> Running optimization with exaggeration=12.00, lr=2082.00 for 250 iterations...
Iteration   50, KL divergence 5.1131, 50 iterations in 1.2649 sec
Iteration  100, KL divergence 4.3891, 50 iterations in 1.2087 sec
Iteration  150, KL divergence 4.1591, 50 iterations in 1.2184 sec
Iteration  200, KL divergence 4.0415, 50 iterations in 1.1851 sec
Iteration  250, KL divergence 3.9681, 50 iterations in 1.1964 sec
   --> Time elapsed: 6.07 seconds
===> Running optimization with exaggeration=1.00, lr=2082.00 for 500 iterations...
Iteration   50, KL divergence 3.0996, 50 iterations in 1.1794 sec
Iteration  100, KL divergence 2.6236, 50 iterations in 1.1659 sec
Iteration  150, KL divergence 2.3695, 50 iterations in 1.4955 sec
Iteration  200, KL divergence 2.2127, 50 iterations in 2.1753 sec
Iteration  250, KL di

In [36]:
# Time 10 consecutive `embedding_train.transform` calls on the flattened test
# set. Each duration is printed, kept in `times`, and appended to
# `projection_speeds` as a ('TSNE', 2) row.
n_repeats = 10
times = []
for _ in tqdm(range(n_repeats)):
    tic = time.monotonic()
    embedding_train.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['TSNE', 2, dataset, elapsed]
    projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.53 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 17007.8672, 50 iterations in 0.0819 sec
Iteration  100, KL divergence 17032.7141, 50 iterations in 0.0657 sec
Iteration  150, KL divergence 17051.8500, 50 iterations in 0.0620 sec
Iteration  200, KL divergence 17064.0905, 50 iterations in 0.0807 sec
Iteration  250, KL divergence 17077.0027, 50 iterations in 0.0615 sec
   --> Time elapsed: 0.35 seconds
seconds:  1.1731005650945008
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.59 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Ru

In [37]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed
0,network,2,cassins_dtw,0.408369
1,network,2,cassins_dtw,0.123638
2,network,2,cassins_dtw,0.122299
3,network,2,cassins_dtw,0.120113
4,network,2,cassins_dtw,0.120316
...,...,...,...,...
65,TSNE,2,cassins_dtw,0.900246
66,TSNE,2,cassins_dtw,0.983221
67,TSNE,2,cassins_dtw,0.894863
68,TSNE,2,cassins_dtw,1.001269


### Save

In [38]:
# Write the collected timings to <DATA_DIR>/projection_speeds/<dataset>.pickle.
save_loc = DATA_DIR.joinpath('projection_speeds', f'{dataset}.pickle')
# presumably creates the parent directory if missing -- confirm in tfumap.paths
ensure_dir(save_loc)
projection_speeds.to_pickle(save_loc)