In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import warnings

from sklearn.datasets import fetch_olivetti_faces
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import adjusted_rand_score
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import homogeneity_completeness_v_measure
from scipy.spatial import distance

from AdaptiveKLLE import *

from sklearn.datasets import load_iris

In [2]:
import random
# Build a synthetic 3-class dataset: one class per shape (torus curve, 3-D spiral,
# sphere), each embedded in a higher-dimensional space by appending pure-noise columns.
# The result depends on the exact order of the np.random draws below.
np.random.seed(0)
random.seed(0)


n_osservazioni = 1700  # samples per class
n_variabili = 17  # NOTE(review): the final X has 20 columns, not 17 (see the hstack calls below)
R, r = 2, 1  # torus major / minor radius
a, b = 0.1, 0.1  # spiral: radial growth rate (a) and vertical growth rate (b)
r_sfera = 1  # sphere radius


# --- Class 0: torus -------------------------------------------------------------------
# theta and phi are the same grid, so the samples trace a single closed curve on the
# torus rather than covering its surface.
theta = np.linspace(0, 2*np.pi, n_osservazioni)
phi = np.linspace(0, 2*np.pi, n_osservazioni)
x_toro = (R + r * np.cos(theta)) * np.cos(phi)
y_toro = (R + r * np.cos(theta)) * np.sin(phi)
z_toro = r * np.sin(theta)
toro = np.column_stack((x_toro, y_toro, z_toro))
# The noise is appended as extra columns, not added to x/y/z: the three coordinates
# stay exact and 3 + (n_variabili - 3) = 17 columns of N(0, 0.05) noise follow them,
# giving 20 columns in total.
toro = np.hstack([toro]+[np.random.normal(0, 0.05, (n_osservazioni, 3))] + [np.random.normal(0, 0.05, (n_osservazioni, n_variabili - 3))])

# --- Class 1: conical spiral (10 turns; radius and height both grow linearly in t) -----
t = np.linspace(0, 20*np.pi, n_osservazioni)
x_spirale = a * t * np.cos(t)
y_spirale = a * t * np.sin(t)
z_spirale = b * t
spirale_3d = np.column_stack((x_spirale, y_spirale, z_spirale))
# Same layout as the torus: exact coordinates followed by 17 noise columns.
spirale_3d = np.hstack([spirale_3d]+[np.random.normal(0, 0.05, (n_osservazioni, 3))] + [np.random.normal(0, 0.05, (n_osservazioni, n_variabili - 3))])

# --- Class 2: sphere ------------------------------------------------------------------
# The NumPy generator is re-seeded here, so the sphere's angles and noise columns are
# drawn from a fresh seed-0 stream, independent of how many draws happened above.
np.random.seed(0)
# phi and theta are rebound: they now hold random polar / azimuthal angles.
phi = np.random.uniform(0, np.pi, n_osservazioni)
theta = np.random.uniform(0, 2*np.pi, n_osservazioni)
x_sfera = r_sfera * np.sin(phi) * np.cos(theta)
y_sfera = r_sfera * np.sin(phi) * np.sin(theta)
z_sfera = r_sfera * np.cos(phi)
sfera = np.column_stack((x_sfera, y_sfera, z_sfera))
sfera = np.hstack([sfera]+[np.random.normal(0, 0.05, (n_osservazioni, 3))] + [np.random.normal(0, 0.05, (n_osservazioni, n_variabili - 3))])

# Stack the three classes row-wise: rows 0..1699 torus, then spiral, then sphere.
dataframes_complessi = [pd.DataFrame(toro), pd.DataFrame(spirale_3d), pd.DataFrame(sfera)]
X = pd.concat(dataframes_complessi, ignore_index=True).values.astype(float)
# Bare expression in the middle of a cell: evaluated but not displayed.
X.shape

# Labels 0, 1, 2 in the same block order as the rows of X.
y = np.repeat(np.arange(0, 3), n_osservazioni)

# Shuffle features and labels together by carrying the label as column 0.
# column_stack promotes the integer labels to float, so y comes back as 0.0 / 1.0 / 2.0.
df_ = pd.DataFrame(np.column_stack((y, X))).sample(frac = 1, random_state = 0)
X = df_.iloc[:, 1:].values
y = df_.iloc[:, 0].values

In [3]:
def return_ids_kstar_binomial(data, embeddings, initial_id=None, Dthr=6.67, r='opt', n_iter = 10):
    """Alternate k* selection and binomial ID estimation for ``n_iter`` rounds.

    Each round recomputes ``data.kstar`` for the current intrinsic dimension,
    counts the neighbours that fall inside a shrunken radius, derives a new
    intrinsic-dimension (ID) estimate from the ratio of the two mean counts and
    stores it back on ``data`` with ``set_id``.

    Parameters
    ----------
    data : object
        Must expose ``compute_id_2NN``, ``compute_distances``, ``set_id``,
        ``compute_kstar``, ``kstar``, ``distances``, ``intrinsic_dim`` and ``N``.
    embeddings : any
        Accepted for interface compatibility; never read.
    initial_id : float, optional
        Starting ID. When ``None`` the starting ID comes from
        ``data.compute_id_2NN(algorithm='base')``.
    Dthr : float
        Threshold forwarded to ``data.compute_kstar``.
    r : float or ``'opt'``
        Ratio between inner and outer radius. ``'opt'`` uses
        ``min(0.95, 0.2032 ** (1 / intrinsic_dim))``.
    n_iter : int
        Number of rounds.

    Returns
    -------
    tuple
        ``(ids, kstar)``: the ID estimate of every round and the ``kstar``
        array of the last round.

    Side effects
    ------------
    Sets ``data.intrinsic_dim``, ``data.intrinsic_dim_err`` and
    ``data.intrinsic_dim_scale`` from the last round. Relies on the names
    ``ut``, ``rng`` and ``ks_2samp`` being available in the enclosing namespace.
    """
    # Starting point for the intrinsic dimension.
    if initial_id is not None:
        data.compute_distances()
        data.set_id(initial_id)
    else:
        data.compute_id_2NN(algorithm='base')

    # Per-round records; only `ids` and the last row of `kstars` are returned.
    ids = np.zeros(n_iter)
    ids_err = np.zeros(n_iter)
    kstars = np.zeros((n_iter, data.N), dtype=int)
    log_likelihoods = np.zeros(n_iter)
    ks_stats = np.zeros(n_iter)
    p_values = np.zeros(n_iter)

    for step in range(n_iter):
        data.compute_kstar(Dthr)

        # Inner/outer radius ratio for this round.
        if r == 'opt':
            shrink = min(0.95,0.2032**(1./data.intrinsic_dim))
        else:
            shrink = r

        # Outer radius: distance to each point's k*-th neighbour.
        outer_radii = np.array([row[data.kstar[pos]] for pos, row in enumerate(data.distances)])
        inner_radii = outer_radii * shrink
        # Neighbours strictly inside the inner radius.
        inner_counts = np.sum([row < inner_radii[pos] for pos, row in enumerate(data.distances)], axis=1)

        id_est = np.log((inner_counts.mean() - 1) / (data.kstar.mean() - 1)) / np.log(shrink)
        id_est_err = ut._compute_binomial_cramerrao(id_est, data.kstar-1, shrink, data.N)
        loglik = ut.binomial_loglik(id_est, data.kstar - 1, inner_counts - 1, shrink)

        # Compare observed inner counts with a draw from the fitted binomial model.
        sampled_counts = rng.binomial(data.kstar-1, shrink**id_est, size=len(inner_counts))
        ks_stat, p_val = ks_2samp(inner_counts-1, sampled_counts)

        # The next round's k* is computed with the refreshed ID.
        data.set_id(id_est)

        ids[step] = id_est
        ids_err[step] = id_est_err
        kstars[step] = data.kstar
        log_likelihoods[step] = loglik
        ks_stats[step] = ks_stat
        p_values[step] = p_val

    data.intrinsic_dim = id_est
    data.intrinsic_dim_err = id_est_err
    data.intrinsic_dim_scale = 0.5 * (inner_radii.mean() + outer_radii.mean())

    return ids, kstars[n_iter - 1]


def find_single_k_neighs(embeddings, index, k):
    """Return the indices of the ``k`` rows of ``embeddings`` nearest to row ``index``.

    Distances are Euclidean. The query row itself is never part of the result,
    even when ``embeddings`` contains exact duplicates of it: the query is
    removed by index rather than by assuming it sorts first.

    Parameters
    ----------
    embeddings : array-like of shape (n_points, n_features)
        Point coordinates, one point per row.
    index : int
        Row of the query point (negative values index from the end).
    k : int
        Number of neighbours requested.

    Returns
    -------
    list of int
        Neighbour indices ordered by increasing distance; ties are broken by
        row order. Shorter than ``k`` when fewer than ``k`` other points exist.

    Raises
    ------
    IndexError
        If ``index`` is outside ``embeddings``.
    """
    points = np.asarray(embeddings, dtype=float)
    points = points.reshape(len(points), -1)
    target = points[index]  # out-of-range index fails here

    # One vectorised pass instead of a Python-level loop over the rows.
    all_distances = np.linalg.norm(points - target, axis=1)

    # Stable sort keeps the ordering deterministic when distances tie.
    order = np.argsort(all_distances, kind='stable')

    # Drop the query explicitly; a duplicate at distance 0 may sort ahead of it.
    self_position = index % len(points)
    neighbours = order[order != self_position]

    return neighbours[:k].tolist()

def find_adaptive_test(id_, X_test, r='opt', Dthr=6.67):
    """Compute the per-point adaptive neighbourhood size (k*) for ``X_test``.

    Parameters
    ----------
    id_ : float
        Intrinsic dimension used to derive the radius ratio when ``r == 'opt'``.
    X_test : array-like
        Points handed to ``Data``.
    r : float or ``'opt'``, default ``'opt'``
        Ratio between inner and outer radius for the diagnostics below.
        ``'opt'`` uses ``min(0.95, 0.2032 ** (1 / id_))``. This used to be read
        from a notebook-level variable named ``r``, which is also used for the
        torus radius; it is now an explicit argument.
    Dthr : float, default 6.67
        Threshold forwarded to ``data.compute_kstar``.

    Returns
    -------
    array
        ``data.kstar`` as produced by ``data.compute_kstar(Dthr)``.

    Notes
    -----
    NOTE(review): the returned k* is fixed by ``compute_id_2NN`` followed by
    ``compute_kstar``; ``id_`` and ``r`` only enter the binomial diagnostics,
    whose results are discarded. If k* is meant to be computed at ``id_``,
    the ID has to be set on ``data`` before ``compute_kstar`` - confirm intent.
    Relies on ``Data``, ``ut``, ``rng`` and ``ks_2samp`` being available in the
    enclosing namespace.
    """
    data = Data(X_test)
    data.compute_id_2NN(algorithm='base')
    data.compute_kstar(Dthr)

    r_eff = min(0.95, 0.2032**(1./id_)) if r == 'opt' else r

    # Outer radius: distance to each point's k*-th neighbour; inner radius: r_eff times that.
    rk = np.array([dd[data.kstar[j]] for j, dd in enumerate(data.distances)])
    rn = rk * r_eff
    n = np.sum([dd < rn[j] for j, dd in enumerate(data.distances)], axis=1)

    # Binomial ID estimate and goodness-of-fit diagnostics. They are evaluated
    # exactly as before (including the draw from the external `rng`) but are
    # not part of the return value.
    id_est = np.log((n.mean() - 1) / (data.kstar.mean() - 1)) / np.log(r_eff)
    id_err = ut._compute_binomial_cramerrao(id_est, data.kstar-1, r_eff, data.N)
    log_lik = ut.binomial_loglik(id_est, data.kstar - 1, n - 1, r_eff)
    n_model = rng.binomial(data.kstar-1, r_eff**id_est, size=len(n))
    ks, pv = ks_2samp(n-1, n_model)

    return data.kstar

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import warnings
from tqdm import tqdm

# 3-fold cross-validation of the adaptive (k*) LLE pipeline:
# embed the training fold, map each test point into the embedding through
# barycentric weights over its k* nearest training points, classify with k-NN.
warnings.filterwarnings("ignore")

folds = 3
n_samples = X.shape[0]
cv_n = n_samples // folds  # contiguous folds; X was shuffled when it was built
n_iter = 15
r = 'opt'
DTHR = 6.67  # threshold passed to every k* computation in this cell

accuracy_llestar = []
f1_llestar = []
accuracy_lle_no_hyper = []
f1_lle_no_hyper = []
accuracy_lle_comp = []
f1_lle_comp = []
accuracy_lle_same = []
f1_lle_same = []
for i in range(folds):
    start_test = cv_n * i
    end_test = start_test + cv_n

    X_test = X[start_test:end_test, :]
    y_test = y[start_test:end_test]

    X_train = np.vstack((X[:start_test, :], X[end_test:, :]))
    y_train = np.concatenate((y[:start_test], y[end_test:]))

    # Adaptive embedding of the training fold. `id_` and `kstars` stay bound after
    # the loop (values of the last fold) and are read by the baseline cell below.
    k_star_lle = K_starLLE(X = X_train, initial_id = None, n_iter = n_iter)
    Y_kstar, W, kstars = k_star_lle.calculate_embedding(initial_id=None, Dthr=DTHR, r='opt')
    id_ = k_star_lle.return_ids_kstar_binomial(verbose = False)[0][n_iter-1]

    knn = KNeighborsClassifier(n_jobs=-1)
    knn.fit(Y_kstar, y_train)

    # From here on W holds the test-to-train reconstruction weights
    # (the training weights returned above are not used).
    W = np.zeros((X_test.shape[0], X_train.shape[0]))
    # Separate index for the test points: the fold index `i` is left alone.
    for j in tqdm(range(X_test.shape[0])):
        # Row 0 is the test point, rows 1.. are the training points.
        new_data = np.concatenate((X_test[j, :].reshape(1, -1), X_train))
        data = Data(new_data)
        # NOTE(review): compute_id_2NN runs right after set_id and presumably
        # replaces the ID set here, so `id_` may have no effect on k* - confirm
        # the intended order before changing it (it affects the reported scores).
        data.set_id(id_)
        data.compute_id_2NN(algorithm='base')
        data.compute_kstar(Dthr=DTHR)
        k_s = data.kstar
        nns = find_single_k_neighs(new_data, 0, k_s[0])
        nns = np.array(nns) - 1  # shift back to X_train row numbers

        # Barycentric weights: solve the regularised local Gram system C w = 1.
        Z = X_train[nns] - X_test[j]
        C = np.dot(Z, Z.T)
        trace = np.trace(C)
        reg = 1e-3 * trace if trace > 0 else 1e-3
        C.flat[:: len(nns) + 1] += reg  # add reg to the diagonal
        w = solve(C, np.ones(len(nns)), assume_a="pos")
        W[j, nns] = w / np.sum(w)

    # Project all test points at once, after every row of W has been filled
    # (previously this product was recomputed on every pass of the loop above).
    Y_kstar_test = np.dot(W, Y_kstar)

    preds_knn = knn.predict(Y_kstar_test)

    accuracy_llestar.append(accuracy_score(y_test, preds_knn))
    f1_llestar.append(f1_score(y_test, preds_knn, average='weighted'))

iteration  0
id  [9.21]
iteration  1
id  [5.66]
iteration  2
id  [4.45]
iteration  3
id  [3.73]
iteration  4
id  [3.19]
iteration  5
id  [2.84]
iteration  6
id  [2.7]
iteration  7
id  [2.67]
iteration  8
id  [2.67]
iteration  9
id  [2.67]
iteration  10
id  [2.67]
iteration  11
id  [2.67]
iteration  12
id  [2.67]
iteration  13
id  [2.67]
iteration  14
id  [2.67]


100%|███████████████████████████████████████| 1700/1700 [08:35<00:00,  3.30it/s]


iteration  0
id  [8.78]
iteration  1
id  [5.6]
iteration  2
id  [4.54]
iteration  3
id  [3.87]
iteration  4
id  [3.37]
iteration  5
id  [2.98]
iteration  6
id  [2.77]
iteration  7
id  [2.68]
iteration  8
id  [2.67]
iteration  9
id  [2.66]
iteration  10
id  [2.66]
iteration  11
id  [2.66]
iteration  12
id  [2.66]
iteration  13
id  [2.66]
iteration  14
id  [2.66]


100%|███████████████████████████████████████| 1700/1700 [08:30<00:00,  3.33it/s]


iteration  0
id  [9.03]
iteration  1
id  [5.89]
iteration  2
id  [4.73]
iteration  3
id  [4.04]
iteration  4
id  [3.53]
iteration  5
id  [3.15]
iteration  6
id  [2.93]
iteration  7
id  [2.83]
iteration  8
id  [2.8]
iteration  9
id  [2.79]
iteration  10
id  [2.79]
iteration  11
id  [2.79]
iteration  12
id  [2.79]
iteration  13
id  [2.79]
iteration  14
id  [2.79]


100%|███████████████████████████████████████| 1700/1700 [08:22<00:00,  3.38it/s]


In [5]:
# Fold-averaged accuracy, then fold-averaged weighted F1, of the adaptive pipeline.
for fold_scores in (accuracy_llestar, f1_llestar):
    print(np.mean(fold_scores))

0.9519607843137253
0.9518052264235131


In [6]:
# Baseline: standard (non-adaptive) LLE evaluated on the same contiguous folds.
# Two configurations are scored per fold:
#   * "no_hyper": LocallyLinearEmbedding with its default hyper-parameters;
#   * "same":     n_components / n_neighbors matched to the adaptive run.
# The adaptive results (accuracy_llestar, f1_llestar) are deliberately left
# untouched so the cell that prints them can still be re-run after this one.


def _lle_knn_predict(lle, X_train, y_train, X_test):
    """Fit ``lle`` and a default k-NN on the training fold; return test-fold predictions."""
    train_embedding = lle.fit_transform(X_train)
    knn_lle = KNeighborsClassifier(n_jobs=-1)
    knn_lle.fit(train_embedding, y_train)
    return knn_lle.predict(lle.transform(X_test))


warnings.filterwarnings("ignore")

folds = 3
n_samples = X.shape[0]
cv_n = n_samples // folds

# Hyper-parameters of the "same" configuration. `id_` and `kstars` are whatever
# the adaptive cross-validation cell left bound, i.e. the values of its last
# fold; they do not change inside this cell, so they are derived once.
same_n_components = int(np.round(id_))
same_n_neighbors = int(np.median(kstars))

accuracy_lle_no_hyper = []
f1_lle_no_hyper = []
accuracy_lle_comp = []
f1_lle_comp = []
accuracy_lle_same = []
f1_lle_same = []
# Per fold: the "no_hyper" predictions first, then the "same" predictions.
predictions = []
for i in range(folds):
    start_test = cv_n * i
    end_test = start_test + cv_n

    X_test = X[start_test:end_test, :]
    y_test = y[start_test:end_test]

    X_train = np.vstack((X[:start_test, :], X[end_test:, :]))
    y_train = np.concatenate((y[:start_test], y[end_test:]))

    ### no hyper
    preds_no_hyper = _lle_knn_predict(
        LocallyLinearEmbedding(random_state=0), X_train, y_train, X_test
    )
    predictions.append(preds_no_hyper)
    accuracy_lle_no_hyper.append(accuracy_score(y_test, preds_no_hyper))
    f1_lle_no_hyper.append(f1_score(y_test, preds_no_hyper, average='weighted'))

    ### same
    lle_same = LocallyLinearEmbedding(
        n_components=same_n_components,
        n_neighbors=same_n_neighbors,
        random_state=0,
    )
    preds_same = _lle_knn_predict(lle_same, X_train, y_train, X_test)
    predictions.append(preds_same)
    accuracy_lle_same.append(accuracy_score(y_test, preds_same))
    f1_lle_same.append(f1_score(y_test, preds_same, average='weighted'))

In [7]:
# Fold-averaged scores of the baselines: default LLE first, matched-hyper-parameter LLE second.
accuracy_means = [np.mean(scores) for scores in (accuracy_lle_no_hyper, accuracy_lle_same)]
f1_means = [np.mean(scores) for scores in (f1_lle_no_hyper, f1_lle_same)]
print(*accuracy_means)
print(*f1_means)

0.8713725490196077 0.9052941176470588
0.8735548406058928 0.9048643896207628
