In [1]:
import tensorflow as tf
from IPython.display import clear_output
tf.keras.utils.set_random_seed(42)
import skdim
import time
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import f1_score

### Import data and obtain the number of labels:

In [2]:
# Path to the pickled dataset, relative to the notebook's working directory.
filename = 'data/dataset2.pkl'

# NOTE(review): pickle.load can execute arbitrary code while deserialising, so this
# file must come from a trusted source.
# The `with` block closes the file on exit; no explicit f.close() is required.
with open(filename, 'rb') as f:
    data_tuples = pickle.load(f)

In [3]:
# Summarise the loaded dataset: sample count, distinct labels, features per sample.
print('Number of Data Points:', len(data_tuples))

# Element 1 of every tuple is collected as that sample's class label.
class_labels = [sample[1] for sample in data_tuples]

print('Class Labels:', len(set(class_labels)))
# Element 0 of a tuple is the feature vector; the first sample gives its length.
print('Feature Count:', len(data_tuples[0][0]))

Number of Data Points: 1000
Class Labels: 2
Feature Count: 150


### Estimate Intrinsic Dimension

In [4]:
# One row per sample, built from element 0 (the feature vector) of each tuple.
data = pd.DataFrame([sample[0] for sample in data_tuples])

# Min-max scale every feature column to [0, 1].
# The reduction axis is explicit on purpose: np.min(df) / np.max(df) forward axis=None
# to pandas, which reduces per column on pandas < 2.0 but over the whole frame on
# pandas >= 2.0, so the bare NumPy calls scale differently depending on the version.
# NOTE(review): per-feature scaling is assumed to be the intent — confirm.
feature_min = data.min(axis=0)
feature_range = data.max(axis=0) - feature_min
# A constant column has zero range; dividing by 1 maps it to 0 instead of NaN.
data = (data - feature_min) / feature_range.replace(0, 1)

# Single-column frame holding element 1 (the label) of each tuple.
labels = pd.DataFrame([sample[1] for sample in data_tuples])

In [5]:
# Wall-clock reference taken immediately before the fit.
start_time = time.time()

# Estimate local intrinsic dimension (dimension in k-nearest-neighborhoods around
# each point) with skdim's MOM estimator, using 50 neighbours per point.
mom_estimator = skdim.id.MOM()
est = mom_estimator.fit(data, n_neighbors=50, n_jobs=-1)

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Report the mean of the pointwise estimates as the estimated intrinsic dimension.
pointwise_dims = est.dimension_pw_
print(np.mean(pointwise_dims))

--- 0.18147540092468262 seconds ---
26.26175147302017


### Perform Dimensionality Reduction (Autoencoder Testing) ###

In [6]:
# Hold out 20% of the samples, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [7]:
# tf.data datasets sliced from the train and test feature frames, in that order.
ds_tr, ds_te = (
    tf.data.Dataset.from_tensor_slices(split) for split in (X_train, X_test)
)

In [8]:
def coefficient(low, id, high):
    """Return how far `id` sits below `high`, as a fraction of the `low`-to-`high` span.

    The result is 0 when ``id == high`` and 1 when ``id == low``; it is negative
    when ``id`` exceeds ``high`` and greater than 1 when ``id`` is below ``low``
    (for ``high > low``).

    Args:
        low: Value at the lower end of the span.
        id: Value being located within the span.
        high: Value at the upper end of the span.

    Returns:
        ``(high - id) / (high - low)``.

    Raises:
        ZeroDivisionError: If ``high == low`` for plain Python numbers.
    """
    distance_from_high = high - id
    span = high - low
    return distance_from_high / span

In [10]:
# Centre of the latent-width sweep in the autoencoder cell below, which trains
# bottlenecks of width ID - radius, ID and ID + radius.
# NOTE(review): presumably rounded from the MOM estimate printed above (~26.26) — confirm.
ID = 25

In [11]:
tf.keras.utils.set_random_seed(42)  # extra code – ensures reproducibility on CPU

radius = 5  # half-width of the latent-size window probed around ID

shape = len(data_tuples[0][0])  # number of input features per sample

# Latent widths under test. The smallest is clamped to 1 so that branch always
# keeps at least one unit, and the clamped value is what gets reported below.
width_minus = max(ID - radius, 1)
width_id = ID
width_plus = ID + radius

# Named `inputs` (not `input`) so the builtin input() is not shadowed.
inputs = tf.keras.layers.Input(shape=(shape,))

# Three one-hidden-layer autoencoder branches reading the same input tensor.
latent_minus = tf.keras.layers.Dense(width_minus, activation="relu")(inputs)
latent_id = tf.keras.layers.Dense(width_id, activation="relu")(inputs)
latent_plus = tf.keras.layers.Dense(width_plus, activation="relu")(inputs)

output_minus = tf.keras.layers.Dense(shape)(latent_minus)
output_id = tf.keras.layers.Dense(shape)(latent_id)
output_plus = tf.keras.layers.Dense(shape)(latent_plus)

model = tf.keras.Model(inputs=[inputs],
                       outputs=[output_minus, output_id, output_plus])

model.compile(loss="mse", optimizer="nadam",
              metrics=["RootMeanSquaredError"])
# Every branch reconstructs its input, so the features are also the target.
history = model.fit(X_train, X_train, epochs=10,
                    validation_data=(X_test, X_test))

# (log key, last-epoch value) pairs in the order Keras recorded them.
results = [(key, values[-1]) for key, values in history.history.items()]

# Read each branch's MSE by its log key rather than by position in `results`:
# positions shift as soon as a metric or an output is added and would then silently
# point at a different quantity, whereas a wrong key fails loudly with a KeyError.
final = dict(results)
name_minus, name_id, name_plus = model.output_names
trn_minus, val_minus = final[name_minus + "_loss"], final["val_" + name_minus + "_loss"]
trn_id, val_id = final[name_id + "_loss"], final["val_" + name_id + "_loss"]
trn_plus, val_plus = final[name_plus + "_loss"], final["val_" + name_plus + "_loss"]

# Drop the per-epoch training log so only the summary remains in the cell output.
clear_output(wait=True)

print("Result of {}: \n\t(trn) {} \t(val) {}\n".format(width_minus, trn_minus, val_minus))
print("Result of {}: \n\t(trn) {} \t(val) {}\n".format(width_id, trn_id, val_id))
print("Result of {}: \n\t(trn) {} \t(val) {}\n".format(width_plus, trn_plus, val_plus))

# Position of the ID-width validation loss between the two neighbouring widths' losses.
c = coefficient(val_minus, val_id, val_plus)

print("Coefficient: {}\n".format(c))
print("Elbow Calculation: {}\n".format(c/val_id))
print("Trn-Val Difference of ID: {}".format(abs(val_id-trn_id)))

Result of 20: 
	(trn) 0.007567799650132656 	(val) 0.007547427900135517

Result of 25: 
	(trn) 0.008026986382901669 	(val) 0.008076051250100136

Result of 30: 
	(trn) 0.007932948879897594 	(val) 0.007963971234858036

Coefficient: -0.2690716809014891

Elbow Calculation: -33.31723296061956

Trn-Val Difference of ID: 4.9064867198467255e-05


### Classifier(s) ###

In [31]:
# Hidden-layer width of the classifier built in the next cell.
# NOTE(review): repeats the value assigned earlier for the autoencoder sweep —
# presumably so this section can be run on its own; confirm.
ID = 25

In [32]:
# Binary classifier: one ReLU hidden layer of width ID followed by a single logit.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(shape,)),
    tf.keras.layers.Dense(ID, activation='relu'),
    tf.keras.layers.Dense(1)
])


def accuracy(y_true, y_pred):
    """Binary accuracy for raw logits: a prediction counts as class 1 when logit > 0."""
    return tf.keras.metrics.binary_accuracy(y_true, y_pred, threshold=0.0)


# The network emits logits (no sigmoid), so accuracy must threshold at 0.0. The
# string 'accuracy' resolves to binary accuracy with its default 0.5 cutoff, which
# scores logits in (0, 0.5] as class 0 and disagrees with the logit-sign rule used
# for the F1 scores in the evaluation cell. The metric function is named `accuracy`
# so the logged metric keeps that name.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[accuracy])

start_time = time.time()

model.fit(X_train, y_train, epochs=10)

# Drop the per-epoch training log so only the timing line remains in the output.
clear_output(wait=True)

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.36318373680114746 seconds ---


In [34]:
test_loss, test_acc = model.evaluate(X_test,  y_test)

# Run inference once and reuse the logits for both F1 variants instead of calling
# predict() a second time on the same inputs.
test_logits = model.predict(X_test)
# Same decision rule as before: logit < 0 -> class 0, anything else -> class 1.
y_pred = np.where(np.ravel(test_logits) < 0, 0, 1)

f1_w = f1_score(y_test, y_pred, average='weighted')
f1_m = f1_score(y_test, y_pred, average='macro')

print('\nTest accuracy:', test_acc)
print('\nTest F1 (macro):', f1_m)
print('\nTest F1 (weighted):', f1_w)


Test accuracy: 1.0

Test F1 (macro): 1.0

Test F1 (weighted): 1.0
