In [1]:
import tensorflow as tf
from IPython.display import clear_output
tf.keras.utils.set_random_seed(42)
import skdim
import time
import numpy as np
import pickle
import pandas as pd

### Load the data and report the number of points, class labels, and features

In [2]:
# Path of the pickled dataset, relative to the notebook's working directory.
filename = 'data/dataset1.pkl'

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(filename, 'rb') as f:
    data_tuples = pickle.load(f)

In [3]:
# Summarise the loaded dataset. Each entry of data_tuples is indexed as
# entry[0] (the feature sequence) and entry[1] (the class label).
n_points = len(data_tuples)
print('Number of Data Points:', n_points)

# Collect the label of every sample, in dataset order.
class_labels = [sample[1] for sample in data_tuples]

# Distinct-label count, then the feature length taken from the first sample.
n_classes = len(set(class_labels))
n_features = len(data_tuples[0][0])
print('Class Labels:', n_classes)
print('Feature Count:', n_features)

Number of Data Points: 1000
Class Labels: 2
Feature Count: 200


In [4]:
# One DataFrame row per sample, built from the feature part (index 0) of each tuple.
data = pd.DataFrame([sample[0] for sample in data_tuples])

# Min-max rescaling: subtract the minimum and divide by the (max - min) range.
# NOTE(review): np.min / np.max on a DataFrame defer to pandas, and whether they
# reduce per column or over the whole frame depends on the installed pandas
# version -- confirm which is intended and make the axis explicit.
# NOTE(review): a zero range (max == min) yields a 0/0 division -- confirm that
# this cannot occur for this dataset.
lower = np.min(data)
upper = np.max(data)
data = (data - lower) / (upper - lower)

### Estimate the intrinsic dimension

In [5]:
# fit() values
#
# Fit every estimator once on `data` and print its `dimension_` attribute.

# lPCA is fitted once for each value of its `ver` argument.
for s in ('FO', 'Fan', 'maxgap', 'ratio', 'participation_ratio', 'Kaiser', 'broken_stick'):
    est = skdim.id.lPCA(ver=s).fit(data)
    print("PCA {}".format(s), est.dimension_)

# The remaining estimators are constructed with their default arguments.
# Each name is both the class looked up on skdim.id and the printed label.
# NOTE(review): the recorded output of this cell shows `nan` for FisherS --
# investigate before relying on that estimate.
for name in ('CorrInt', 'DANCo', 'ESS', 'FisherS', 'KNN', 'MADA',
             'MiND_ML', 'MLE', 'MOM', 'TLE', 'TwoNN'):
    est = getattr(skdim.id, name)().fit(data)
    print(name, est.dimension_)

PCA FO 132
PCA Fan 79
PCA maxgap 132
PCA ratio 2
PCA participation_ratio 97.55364888782775
PCA Kaiser 99
PCA broken_stick 2
CorrInt 32.823726962564436
DANCo 167.99564473175613
ESS 123.11232864794208
FisherS nan
KNN 21
MADA 43.10570474305857
MiND_ML 10.0
MLE 42.53064371067954
MOM 34.445209671826916
TLE 38.545575927556676
TwoNN 55.00943825384689


In [7]:
# fit_pw(data, n_neighbors=100, n_jobs=-1)
#
# Run fit_pw on `data` for each estimator below and print the mean of its
# `dimension_pw_` values.

# Keyword arguments shared by every fit_pw call in this cell.
pw_kwargs = dict(n_neighbors=100, n_jobs=-1)

# The lPCA variants, CorrInt and DANCo are currently disabled for the pointwise
# run (presumably because of runtime -- TODO confirm). CorrInt and DANCo can be
# re-enabled by adding their names to the tuple below; lPCA additionally needs
# its `ver` argument.
# NOTE(review): the recorded output of this cell shows `nan` for FisherS --
# investigate before relying on that estimate.
for name in ('FisherS', 'KNN', 'MiND_ML', 'TwoNN'):
    est = getattr(skdim.id, name)().fit_pw(data, **pw_kwargs)
    print(name, np.mean(est.dimension_pw_))

FisherS nan
KNN 55.388
MiND_ML 10.0
TwoNN 44.55634198989536
