In [1]:
import tensorflow as tf
from IPython.display import clear_output
tf.keras.utils.set_random_seed(42)
import skdim
import time
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import f1_score

### Import data and obtain the number of labels:

In [2]:
# Path of the pickled dataset, relative to the notebook's working directory.
filename = 'data/dataset3.pkl'

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files that come from a trusted source.
with open(filename, 'rb') as f:
    # The `with` block closes the handle on exit, so no explicit close() is needed.
    data_tuples = pickle.load(f)

In [3]:
# Quick summary of the loaded data. Each entry of data_tuples is indexed
# below as (features, label): position 0 is measured with len(), position 1
# is collected as the class label.
print('Number of Data Points:', len(data_tuples))
class_labels = [sample[1] for sample in data_tuples]

print('Class Labels:', len(set(class_labels)))
print('Feature Count:', len(data_tuples[0][0]))

Number of Data Points: 2500
Class Labels: 5
Feature Count: 175


### Estimate Intrinsic Dimension (skdim MOM estimator)

In [4]:
# Separate the (features, label) tuples into a feature frame and a label frame.
features_raw = pd.DataFrame([sample[0] for sample in data_tuples])
labels = pd.DataFrame([sample[1] for sample in data_tuples])

# Min-max rescale the features.
# NOTE(review): np.min / np.max on a DataFrame hand off to pandas' own
# reduction, and whether that reduces per column or over the whole frame has
# differed between pandas releases — confirm which scaling is intended and
# consider making the axis explicit.
lo = np.min(features_raw)
hi = np.max(features_raw)
data = (features_raw - lo) / (hi - lo)

In [5]:
# Wall-clock timing of the intrinsic-dimension fit on the scaled features.
start_time = time.time()

# skdim's MOM estimator, fitted with 125 neighbours per point.
# n_jobs=-1 conventionally means "use every available core" — confirm for skdim.
mom = skdim.id.MOM()
est = mom.fit(data, n_neighbors=125, n_jobs=-1)

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Report the mean of the per-point estimates stored in dimension_pw_.
print(np.mean(est.dimension_pw_))

--- 0.20076894760131836 seconds ---
31.394195805715704


### Perform Dimensionality Reduction (Autoencoder Testing) ###

In [13]:
# Hold out 20% of the rows, stratified on the label frame, with a fixed seed
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [14]:
# Wrap the train and test feature splits as tf.data.Dataset objects.
# NOTE(review): neither dataset is referenced again in the visible cells
# (model.fit is fed the DataFrames directly) — confirm they are still needed.
ds_tr, ds_te = (
    tf.data.Dataset.from_tensor_slices(split) for split in (X_train, X_test)
)

In [15]:
def coefficient(low, id, high):
    """Return where `id` sits between `high` and `low`, as a fraction of their span.

    Computes ``(high - id) / (high - low)``: the result is 0 when ``id == high``
    and 1 when ``id == low``. Values outside that range are possible when `id`
    does not lie between the two bounds.

    Raises:
        ZeroDivisionError: if ``high == low`` (for plain Python numbers).
    """
    span = high - low
    offset = high - id
    return offset / span

In [16]:
# Bottleneck width to probe with the autoencoders in the next cell.
# NOTE(review): presumably rounded from the mean MOM estimate printed above
# (~31.4) — confirm how this value was chosen.
ID = 30

In [17]:
tf.keras.utils.set_random_seed(42) # extra code – ensures reproducibility on CPU

# Half-width of the window of bottleneck sizes compared around ID.
radius = 5

# Number of input features; also reused as the reconstruction width.
shape = len(data_tuples[0][0])

# Called `inputs` (not `input`) so the builtin input() is not shadowed for
# the rest of the kernel session.
inputs = tf.keras.layers.Input(shape=(shape,))

# Three single-hidden-layer autoencoders share the same input. Their
# bottleneck widths are ID - radius (floored at 1), ID, and ID + radius.
latent_minus = tf.keras.layers.Dense(max(ID-radius, 1), activation="relu")(inputs)
latent_id = tf.keras.layers.Dense(ID, activation="relu")(inputs)
latent_plus = tf.keras.layers.Dense(ID+radius, activation="relu")(inputs)

# The output layers are named explicitly so their history entries can be
# looked up by key ("<name>_loss" / "val_<name>_loss") instead of by position.
output_minus = tf.keras.layers.Dense(shape, name="recon_minus")(latent_minus)
output_id = tf.keras.layers.Dense(shape, name="recon_id")(latent_id)
output_plus = tf.keras.layers.Dense(shape, name="recon_plus")(latent_plus)

model = tf.keras.Model(inputs=[inputs],
                       outputs=[output_minus, output_id, output_plus])

model.compile(loss="mse", optimizer="nadam",
              metrics=["RootMeanSquaredError"])
# Each head reconstructs its own input; the held-out split doubles as
# validation data here.
history = model.fit(X_train, X_train, epochs=10,
                         validation_data=(X_test, X_test))

# Last-epoch value of every logged quantity, keyed by its history name.
final = {key: values[-1] for key, values in history.history.items()}
# Same (name, value) pairs as before, kept in case other cells read `results`.
results = list(final.items())

# Keyed lookups raise KeyError if an expected entry is missing, rather than
# silently reading a different metric when the history key order changes.
trn_minus = final["recon_minus_loss"]
trn_id = final["recon_id_loss"]
trn_plus = final["recon_plus_loss"]
val_minus = final["val_recon_minus_loss"]
val_id = final["val_recon_id_loss"]
val_plus = final["val_recon_plus_loss"]

# Drop the per-epoch training log so only the summary below stays visible.
clear_output(wait=True)

print("Result of {}: \n\t(trn) {} \t(val) {}\n".format(max(ID-radius, 1), trn_minus, val_minus))
print("Result of {}: \n\t(trn) {} \t(val) {}\n".format(ID, trn_id, val_id))
print("Result of {}: \n\t(trn) {} \t(val) {}\n".format(ID+radius, trn_plus, val_plus))

# Position of the ID-width validation loss between the losses of the
# narrower and wider bottlenecks.
c = coefficient(val_minus, val_id, val_plus)

print("Coefficient: {}\n".format(c))
print("Elbow Calculation: {}\n".format(c/val_id))
print("Trn-Val Difference of ID: {}".format(abs(val_id-trn_id)))

Result of 25: 
	(trn) 0.005273125134408474 	(val) 0.005240283906459808

Result of 30: 
	(trn) 0.00500495545566082 	(val) 0.004959446378052235

Result of 35: 
	(trn) 0.004839451517909765 	(val) 0.004759068135172129

Coefficient: 0.4163999910973744

Elbow Calculation: 83.96098260889165

Trn-Val Difference of ID: 4.550907760858536e-05


### Classifier(s) ###

In [18]:
# Hidden-layer width for the classifier built in the next cell; re-declared
# here with the same value used in the autoencoder section.
ID = 30

In [21]:
# One output unit per distinct label, derived from the label frame instead of
# a hard-coded 5 so the cell follows the dataset.
# NOTE(review): SparseCategoricalCrossentropy expects integer labels in
# [0, n_classes) — confirm the labels are encoded that way.
n_classes = int(labels.iloc[:, 0].nunique())

# Single hidden layer of width ID, followed by a logits layer.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(shape,)),
    tf.keras.layers.Dense(ID, activation='relu'),
    tf.keras.layers.Dense(n_classes)
])

# from_logits=True matches the final Dense layer, which has no activation.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Time the training run (the measurement also includes clearing the output).
start_time = time.time()

model.fit(X_train, y_train, epochs=10)

clear_output(wait=True)

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.5460762977600098 seconds ---


In [23]:
# Loss and accuracy on the held-out split.
test_loss, test_acc = model.evaluate(X_test, y_test)

# Predicted class = index of the largest logit in each row.
logits = model.predict(X_test)
predicted = np.argmax(logits, axis=1)
f1 = f1_score(y_test, predicted, average='weighted')

print('\nTest accuracy:', test_acc)
print('\nTest F1 (weighted):', f1)


Test accuracy: 1.0

Test F1 (weighted): 1.0
