In [1]:
from AdaptiveKLLE import *
import pandas as pd 
import numpy as np
from scipy import optimize
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, homogeneity_completeness_v_measure
from sklearn.manifold import LocallyLinearEmbedding
from scipy.spatial import distance

import seaborn as sns
sns.set(style = "darkgrid")
from AdaptiveKLLE import *

plt.style.use('ggplot')

In [2]:
def return_ids_kstar_binomial(data, embeddings, initial_id=None, Dthr=6.67, r='opt', n_iter = 10):
    if initial_id is None:
        data.compute_id_2NN(algorithm='base')
    else:
        data.compute_distances()
        data.set_id(initial_id)

    ids = np.zeros(n_iter)
    ids_err = np.zeros(n_iter)
    kstars = np.zeros((n_iter, data.N), dtype=int)
    log_likelihoods = np.zeros(n_iter)
    ks_stats = np.zeros(n_iter)
    p_values = np.zeros(n_iter)

    for i in range(n_iter):
      # compute kstar
      data.compute_kstar(Dthr)
      # print("iteration ", i)
      # print("id ", data.intrinsic_dim)

      # set new ratio
      r_eff = min(0.95,0.2032**(1./data.intrinsic_dim)) if r == 'opt' else r
      # compute neighbourhoods shells from k_star
      rk = np.array([dd[data.kstar[j]] for j, dd in enumerate(data.distances)])
      rn = rk * r_eff
      n = np.sum([dd < rn[j] for j, dd in enumerate(data.distances)], axis=1)
      # compute id
      id = np.log((n.mean() - 1) / (data.kstar.mean() - 1)) / np.log(r_eff)
      # compute id error
      id_err = ut._compute_binomial_cramerrao(id, data.kstar-1, r_eff, data.N)
      # compute likelihood
      log_lik = ut.binomial_loglik(id, data.kstar - 1, n - 1, r_eff)
      # model validation through KS test
      n_model = rng.binomial(data.kstar-1, r_eff**id, size=len(n))
      ks, pv = ks_2samp(n-1, n_model)
      # set new id
      data.set_id(id)

      ids[i] = id
      ids_err[i] = id_err
      kstars[i] = data.kstar
      log_likelihoods[i] = log_lik
      ks_stats[i] = ks
      p_values[i] = pv

    data.intrinsic_dim = id
    data.intrinsic_dim_err = id_err
    data.intrinsic_dim_scale = 0.5 * (rn.mean() + rk.mean())

    return ids, kstars[(n_iter - 1), :]#, ids_err, log_likelihoods, ks_stats, p_values

def find_single_k_neighs(embeddings, index, k):
    target_embedding = embeddings[index]
    all_distances = np.array([distance.minkowski(target_embedding, emb) for emb in embeddings])

    nearest_indices = np.argsort(all_distances)[1:k+1]  

    return nearest_indices.tolist()

In [3]:
data = pd.read_csv('mnist_test.csv')
X = data.iloc[:, 1:].values.astype(np.float32)
y = data.iloc[:, 0].values

In [5]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

folds = 3
n_iter = 15
r = 'opt'
random_state = 0


accuracy_llestar = []
accuracy_lle_no_hyper = []
accuracy_lle_comp = []
accuracy_lle_same = []
ids_ = []
num_kstars = []


kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)


for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold_idx + 1}/{folds}")
    

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    print(f"  Train size: {len(train_idx)}, Test size: {len(test_idx)}")
    

    print("  Calculate K-star LLE...")
    k_star_lle = K_starLLE(X=X_train, initial_id=None, n_iter=n_iter)
    Y_kstar, W_train, kstars = k_star_lle.calculate_embedding(initial_id=None, Dthr=6.67, r='opt')
    

    id_ = k_star_lle.return_ids_kstar_binomial(verbose=False)[0][n_iter-1]
    ids_.append(id_)
    num_kstars.append(int(np.round(np.median(kstars))))
    
    print(f"  ID optimal: {id_:.2f}, K-star median: {int(np.round(np.median(kstars)))}")
    

    clf = LogisticRegression(n_jobs=-1, random_state=0, penalty = None)
    clf.fit(Y_kstar, y_train)
    

    W = np.zeros((X_test.shape[0], X_train.shape[0]))
    
    for i in tqdm(range(X_test.shape[0]), desc="    Campioni"):

        new_data = np.concatenate((X_test[i, :].reshape(1, -1), X_train))
        

        data = Data(new_data)
        data.set_id(id_)
        data.compute_id_2NN(algorithm='base')
        data.compute_kstar(Dthr=6.67)
        k_s = data.kstar
        
        
        nns = find_single_k_neighs(new_data, 0, k_s[0])
        nns = np.array(nns) - 1 
        
        Z = X_train[nns] - X_test[i]  
        C = np.dot(Z, Z.T)  
        
        trace = np.trace(C)
        if trace > 0:
            R = 1e-3 * trace
        else:
            R = 1e-3
        C.flat[:: len(nns) + 1] += R    
    
        w = solve(C, np.ones(len(nns)), assume_a="pos")  
        W[i, nns] = w / np.sum(w)
    
    Y_kstar_test = np.dot(W, Y_kstar)
    
    preds_clf = clf.predict(Y_kstar_test)
    
    acc = accuracy_score(y_test, preds_clf)
    
    accuracy_llestar.append(acc)

    
    print(f"  Accuracy: {acc:.4f}")




Fold 1/3
  Train size: 6666, Test size: 3334
  Calculate K-star LLE...
iteration  0
id  [13.06]
iteration  1
id  [11.45]
iteration  2
id  [11.35]
iteration  3
id  [11.35]
iteration  4
id  [11.36]
iteration  5
id  [11.36]
iteration  6
id  [11.36]
iteration  7
id  [11.36]
iteration  8
id  [11.36]
iteration  9
id  [11.36]
iteration  10
id  [11.36]
iteration  11
id  [11.36]
iteration  12
id  [11.36]
iteration  13
id  [11.36]
iteration  14
id  [11.36]
  ID optimal: 11.36, K-star median: 7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
    Campioni: 100%|█████████████████████████| 3334/3334 [24:16<00:00,  2.29it/s]


  Accuracy: 0.8743

Fold 2/3
  Train size: 6667, Test size: 3333
  Calculate K-star LLE...
iteration  0
id  [13.38]
iteration  1
id  [11.45]
iteration  2
id  [11.37]
iteration  3
id  [11.36]
iteration  4
id  [11.36]
iteration  5
id  [11.36]
iteration  6
id  [11.36]
iteration  7
id  [11.36]
iteration  8
id  [11.36]
iteration  9
id  [11.36]
iteration  10
id  [11.36]
iteration  11
id  [11.36]
iteration  12
id  [11.36]
iteration  13
id  [11.36]
iteration  14
id  [11.36]
  ID optimal: 11.36, K-star median: 6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
    Campioni: 100%|█████████████████████████| 3333/3333 [24:52<00:00,  2.23it/s]


  Accuracy: 0.9058

Fold 3/3
  Train size: 6667, Test size: 3333
  Calculate K-star LLE...
iteration  0
id  [13.57]
iteration  1
id  [11.6]
iteration  2
id  [11.44]
iteration  3
id  [11.47]
iteration  4
id  [11.47]
iteration  5
id  [11.48]
iteration  6
id  [11.47]
iteration  7
id  [11.48]
iteration  8
id  [11.47]
iteration  9
id  [11.47]
iteration  10
id  [11.48]
iteration  11
id  [11.47]
iteration  12
id  [11.48]
iteration  13
id  [11.47]
iteration  14
id  [11.48]
  ID optimal: 11.47, K-star median: 6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
    Campioni: 100%|█████████████████████████| 3333/3333 [24:40<00:00,  2.25it/s]

  Accuracy: 0.9037





In [6]:
print(np.mean(accuracy_llestar))

0.8946020276892717


In [7]:
import warnings


warnings.filterwarnings("ignore")


folds = 3
r = 'opt'
random_state = 0


accuracy_lle_same = []
predictions_no_hyper = []
predictions_same = []


kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)


for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
        
    ###no hyper
    lle = LocallyLinearEmbedding(random_state=0)
    X_train_lle = lle.fit_transform(X_train)
    
    clf_lle = LogisticRegression(n_jobs=-1, random_state=0, penalty = None)
    clf_lle.fit(X_train_lle, y_train)
    
    X_test_lle = lle.transform(X_test)
    
    preds_clf_lle = clf_lle.predict(X_test_lle)
    
    predictions_no_hyper.append(preds_clf_lle)
    accuracy_lle_no_hyper.append(accuracy_score(y_test, preds_clf_lle))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [8]:
print(np.mean(accuracy_lle_no_hyper))

0.6046043116609201
