In [1]:
from AdaptiveKLLE import *
import pandas as pd 
import numpy as np
from scipy import optimize
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, homogeneity_completeness_v_measure
from sklearn.manifold import LocallyLinearEmbedding
from scipy.spatial import distance

import seaborn as sns
sns.set(style = "darkgrid")
from AdaptiveKLLE import *

plt.style.use('ggplot')

In [2]:
def return_ids_kstar_binomial(data, embeddings, initial_id=None, Dthr=6.67, r='opt', n_iter = 10):
    if initial_id is None:
        data.compute_id_2NN(algorithm='base')
    else:
        data.compute_distances()
        data.set_id(initial_id)

    ids = np.zeros(n_iter)
    ids_err = np.zeros(n_iter)
    kstars = np.zeros((n_iter, data.N), dtype=int)
    log_likelihoods = np.zeros(n_iter)
    ks_stats = np.zeros(n_iter)
    p_values = np.zeros(n_iter)

    for i in range(n_iter):
      # compute kstar
      data.compute_kstar(Dthr)
      # print("iteration ", i)
      # print("id ", data.intrinsic_dim)

      # set new ratio
      r_eff = min(0.95,0.2032**(1./data.intrinsic_dim)) if r == 'opt' else r
      # compute neighbourhoods shells from k_star
      rk = np.array([dd[data.kstar[j]] for j, dd in enumerate(data.distances)])
      rn = rk * r_eff
      n = np.sum([dd < rn[j] for j, dd in enumerate(data.distances)], axis=1)
      # compute id
      id = np.log((n.mean() - 1) / (data.kstar.mean() - 1)) / np.log(r_eff)
      # compute id error
      id_err = ut._compute_binomial_cramerrao(id, data.kstar-1, r_eff, data.N)
      # compute likelihood
      log_lik = ut.binomial_loglik(id, data.kstar - 1, n - 1, r_eff)
      # model validation through KS test
      n_model = rng.binomial(data.kstar-1, r_eff**id, size=len(n))
      ks, pv = ks_2samp(n-1, n_model)
      # set new id
      data.set_id(id)

      ids[i] = id
      ids_err[i] = id_err
      kstars[i] = data.kstar
      log_likelihoods[i] = log_lik
      ks_stats[i] = ks
      p_values[i] = pv

    data.intrinsic_dim = id
    data.intrinsic_dim_err = id_err
    data.intrinsic_dim_scale = 0.5 * (rn.mean() + rk.mean())

    return ids, kstars[(n_iter - 1), :]#, ids_err, log_likelihoods, ks_stats, p_values

def find_single_k_neighs(embeddings, index, k):
    target_embedding = embeddings[index]
    all_distances = np.array([distance.minkowski(target_embedding, emb) for emb in embeddings])

    nearest_indices = np.argsort(all_distances)[1:k+1]  

    return nearest_indices.tolist()

In [3]:
data = pd.read_csv('mnist_test.csv')
X = data.iloc[:, 1:].values.astype(np.float32)
y = data.iloc[:, 0].values

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

folds = 3
n_samples = X.shape[0]
cv_n = n_samples // folds
n_iter = 15
r = 'opt'
#id_ = 3

accuracy_llestar = []
f1_llestar = []
accuracy_lle_no_hyper = []
f1_lle_no_hyper = []
accuracy_lle_comp = []
f1_lle_comp = []
accuracy_lle_same = []
f1_lle_same = []
for i in range(folds):
    start_test = cv_n * i
    end_test = start_test + cv_n

    X_test = X[start_test:end_test, :]
    y_test = y[start_test:end_test]

    X_train = np.vstack((X[:start_test, :], X[end_test:, :]))
    y_train = np.concatenate((y[:start_test], y[end_test:]))

    k_star_lle = K_starLLE(X = X_train, initial_id = None, n_iter = n_iter)
    Y_kstar, W, kstars = k_star_lle.calculate_embedding(initial_id=None, Dthr=6.67, r='opt')
    id_ = k_star_lle.return_ids_kstar_binomial(verbose = False)[0][n_iter-1]
    
    knn = KNeighborsClassifier(n_jobs=-1)
    knn.fit(Y_kstar, y_train)
    
    W = np.zeros((X_test.shape[0], X_train.shape[0]))
    for i in range(X_test.shape[0]):
    
        new_data = np.concatenate((X_test[i, :].reshape(1, -1), X_train))
        #print(embs.shape)
        data = Data(new_data)
        data.set_id(id_)
        data.compute_id_2NN(algorithm='base')
        data.compute_kstar(Dthr=6.67)
        k_s = data.kstar
        nns = find_single_k_neighs(new_data, 0, k_s[0])
        nns = np.array(nns) - 1
        Z = X_train[nns] - X_test[i]  
        C = np.dot(Z, Z.T)  
        trace = np.trace(C)
        if trace > 0:
            R = 1e-3 * trace
        else:
            R = 1e-3
        C.flat[:: len(nns) + 1] += R
        w = solve(C, np.ones(len(nns)), assume_a="pos")  
        W[i, nns] = w / np.sum(w)

        Y_kstar_test = np.dot(W, Y_kstar)
    
    preds_knn = knn.predict(Y_kstar_test)
    
    accuracy_llestar.append(accuracy_score(y_test, preds_knn))
    f1_llestar.append(f1_score(y_test, preds_knn, average='weighted'))

iteration  0
id  [12.84]
iteration  1
id  [10.93]
iteration  2
id  [10.9]
iteration  3
id  [10.91]
iteration  4
id  [10.91]
iteration  5
id  [10.91]
iteration  6
id  [10.91]
iteration  7
id  [10.91]
iteration  8
id  [10.91]
iteration  9
id  [10.91]
iteration  10
id  [10.91]
iteration  11
id  [10.91]
iteration  12
id  [10.91]
iteration  13
id  [10.91]
iteration  14
id  [10.91]
iteration  0
id  [13.28]
iteration  1
id  [11.36]
iteration  2
id  [11.26]
iteration  3
id  [11.25]
iteration  4
id  [11.24]
iteration  5
id  [11.25]
iteration  6
id  [11.25]
iteration  7
id  [11.25]
iteration  8
id  [11.25]
iteration  9
id  [11.25]
iteration  10
id  [11.25]
iteration  11
id  [11.25]
iteration  12
id  [11.25]
iteration  13
id  [11.25]
iteration  14
id  [11.25]
iteration  0
id  [13.48]
iteration  1
id  [11.71]
iteration  2
id  [11.59]
iteration  3
id  [11.55]
iteration  4
id  [11.54]
iteration  5
id  [11.54]
iteration  6
id  [11.53]
iteration  7
id  [11.53]
iteration  8
id  [11.53]
iteration  9
id 

In [5]:
print(np.mean(accuracy_llestar))
print(np.mean(f1_llestar))

0.9207920792079207
0.9206165004421877


In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import warnings

warnings.filterwarnings("ignore")

folds = 3
n_samples = X.shape[0]
cv_n = n_samples // folds
n_iter = 15
r = 'opt'

accuracy_llestar = []
f1_llestar = []
accuracy_lle_no_hyper = []
f1_lle_no_hyper = []
accuracy_lle_comp = []
f1_lle_comp = []
accuracy_lle_same = []
f1_lle_same = []
for i in range(folds):
    start_test = cv_n * i
    end_test = start_test + cv_n

    X_test = X[start_test:end_test, :]
    y_test = y[start_test:end_test]

    X_train = np.vstack((X[:start_test, :], X[end_test:, :]))
    y_train = np.concatenate((y[:start_test], y[end_test:]))

    
    ###no hyper
    lle = LocallyLinearEmbedding(random_state=0)
    y_lle = lle.fit_transform(X_train)
    
    knn_lle = KNeighborsClassifier(n_jobs=-1)
    knn_lle.fit(y_lle, y_train)
    
    y_lle_test = lle.transform(X_test)
    
    preds_knn_lle = knn_lle.predict(y_lle_test)
    
    accuracy_lle_no_hyper.append(accuracy_score(y_test, preds_knn_lle))
    f1_lle_no_hyper.append(f1_score(y_test, preds_knn_lle, average='weighted'))
    
    ### same
        
    lle = LocallyLinearEmbedding(n_components=int(np.round(id_)), n_neighbors=int(np.median(kstars)), random_state=0)
    y_lle = lle.fit_transform(X_train)
    
    knn_lle = KNeighborsClassifier(n_jobs=-1)
    knn_lle.fit(y_lle, y_train)
    
    y_lle_test = lle.transform(X_test)
    
    preds_knn_lle = knn_lle.predict(y_lle_test)
    
    accuracy_lle_same.append(accuracy_score(y_test, preds_knn_lle))
    f1_lle_same.append(f1_score(y_test, preds_knn_lle, average='weighted'))

In [10]:
print(np.mean(accuracy_lle_no_hyper), np.mean(accuracy_lle_same))
print(np.mean(f1_lle_no_hyper), np.mean(f1_lle_same))

0.7898789878987899 0.9212921292129214
0.789381729015381 0.9211029388283762
