In [8]:
import tensorflow as tf
from IPython.display import clear_output
tf.keras.utils.set_random_seed(42)
import skdim
import time
import numpy as np
import pickle
import pandas as pd


### Import data and obtain the number of labels:

In [57]:
# Path of the pickled dataset; the cells below index each entry as
# entry[0] (feature vector) and entry[1] (class label).
filename = 'data/dataset6.pkl'

# SECURITY: pickle.load can execute arbitrary code while deserializing.
# Only load pickle files that come from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(filename, 'rb') as f:
    data_tuples = pickle.load(f)

In [58]:
# Quick sanity summary of the loaded dataset: sample count, number of
# distinct class labels, and length of the first feature vector.
print('Number of Data Points:', len(data_tuples))

# Second element of every entry is its class label.
class_labels = [sample[1] for sample in data_tuples]
distinct_labels = set(class_labels)

print('Class Labels:', len(distinct_labels))
print('Feature Count:', len(data_tuples[0][0]))

Number of Data Points: 3000
Class Labels: 2
Feature Count: 28


In [59]:
# Feature matrix: one row per sample, one column per feature.
data = pd.DataFrame([sample[0] for sample in data_tuples])

# Min-max scale every feature column into [0, 1].
# The reductions are written with an explicit axis=0 because np.min/np.max on
# a DataFrame forward axis=None to pandas, and what axis=None means there
# differs between pandas major versions (per-column result vs. one global
# scalar). Spelling out the axis keeps the scaling per-feature everywhere.
col_min = data.min(axis=0)
col_range = data.max(axis=0) - col_min
# A constant column has zero range; divide by 1 instead so it maps to 0.0
# rather than producing NaN from 0/0.
col_range = col_range.replace(0, 1)
data = (data - col_min) / col_range

# Class labels as a single-column frame, row-aligned with `data`.
labels = pd.DataFrame([sample[1] for sample in data_tuples])

In [60]:
# Neighborhood size passed as `n_neighbors` to the skdim estimator calls
# in the estimation cells below.
neighbor_count = 150

### Perform Estimation(s)

In [61]:
# Global intrinsic-dimension estimates: fit each skdim estimator on the whole
# dataset and print its `dimension_` attribute.

# lPCA is evaluated once per `ver` option.
pca_label = "PCA {}"
for s in ('FO', 'Fan', 'maxgap', 'ratio', 'participation_ratio', 'Kaiser', 'broken_stick'):
    est = skdim.id.lPCA(ver=s).fit(data)
    print(pca_label.format(s), est.dimension_)

# Remaining estimators, in print order. Entries that carry `with_neighbors`
# are fitted with n_neighbors=neighbor_count; the others use fit() defaults.
with_neighbors = {'n_neighbors': neighbor_count}
default_fit = {}
global_runs = [
    ("CorrInt", skdim.id.CorrInt, default_fit),
    ("DANCo", skdim.id.DANCo, default_fit),
    ("ESS", skdim.id.ESS, with_neighbors),
    ("FisherS", skdim.id.FisherS, default_fit),
    ("KNN", skdim.id.KNN, default_fit),
    ("MADA", skdim.id.MADA, with_neighbors),
    ("MiND_ML", skdim.id.MiND_ML, default_fit),
    ("MLE", skdim.id.MLE, with_neighbors),
    ("MOM", skdim.id.MOM, with_neighbors),
    ("TLE", skdim.id.TLE, with_neighbors),
    ("TwoNN", skdim.id.TwoNN, default_fit),
]

for label, estimator_cls, fit_kwargs in global_runs:
    est = estimator_cls().fit(data, **fit_kwargs)
    print(label, est.dimension_)

PCA FO 20
PCA Fan 10
PCA maxgap 20
PCA ratio 1
PCA participation_ratio 14.522679330961648
PCA Kaiser 11
PCA broken_stick 1
CorrInt 5.404631104343507
DANCo 10.943091748823164
ESS 14.280929505227324
FisherS 13.337338446223308
KNN 2
MADA 11.250680860571737
MiND_ML 10.0
MLE 10.976296037705103
MOM 11.588721113204675
TLE 8.220299023534663
TwoNN 12.097252644210192


In [None]:
# Pointwise intrinsic-dimension estimates: run fit_pw() for each estimator and
# print the mean of its per-point `dimension_pw_` values.

# Keyword arguments shared by every fit_pw() call in this cell.
pw_kwargs = {'n_neighbors': neighbor_count, 'n_jobs': -1}

# lPCA is evaluated once per `ver` option.
pca_label = "PCA {}"
for s in ('FO', 'Fan', 'maxgap', 'ratio', 'participation_ratio', 'Kaiser', 'broken_stick'):
    est = skdim.id.lPCA(ver=s).fit_pw(data, **pw_kwargs)
    print(pca_label.format(s), np.mean(est.dimension_pw_))

# Remaining estimators, in print order.
pointwise_runs = [
    ("CorrInt", skdim.id.CorrInt),
    ("FisherS", skdim.id.FisherS),
    ("KNN", skdim.id.KNN),
    ("MiND_ML", skdim.id.MiND_ML),
    ("TwoNN", skdim.id.TwoNN),
]

for label, estimator_cls in pointwise_runs:
    est = estimator_cls().fit_pw(data, **pw_kwargs)
    print(label, np.mean(est.dimension_pw_))

PCA FO 19.297
PCA Fan 9.113333333333333
PCA maxgap 20.0
PCA ratio 1.0
PCA participation_ratio 12.624904241935393
PCA Kaiser 10.538
PCA broken_stick 6.629333333333333


### Reduction Test ###

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding

# Hold out 20% of the samples, stratified on the labels, with a fixed seed so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [57]:
# PCA: how many components are needed to reach a target explained variance?

# Target fraction of total variance to be explained.
var = 0.95

# Passing the fraction as n_components lets PCA choose the component count;
# the chosen count is then read back from `n_components_`.
pca = PCA(n_components=var)
X_reduced = pca.fit_transform(data)

report = "Dimension required for {}% variance explained: {}"
print(report.format(var*100, pca.n_components_))

Dimension required for 95.0% variance explained: 70


In [69]:
# Non-linear dimensionality reduction: embed the data into a single dimension
# with Locally Linear Embedding and display the reconstruction error.
# (The unused `var = 0.95` copied from the PCA cell has been dropped; this
# cell never read it.)
lle = LocallyLinearEmbedding(
    n_components=1,
    n_neighbors=75,
    random_state=42,
    n_jobs=-1,
)
X_unrolled = lle.fit_transform(data)

# Last expression of the cell, so the notebook renders its value.
lle.reconstruction_error_

1.1862753443504776e-06