In [1]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.metrics import mean_squared_error
from scipy.spatial import distance
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pandas as pd
import matplotlib.gridspec as gridspec
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_score
import coranking
from coranking.metrics import trustworthiness, continuity

In [2]:
import struct
import numpy as np
from sklearn.utils import shuffle

def load_mnist(n_samples=1000):
    ''' Gets the MNIST dataset. Returns a tuple (data, target) containing the dataset and the labels.

        data:   The 1000 x 784 data matrix containing the images. (n_samples = 1000, n_features =784)
        target: The 1000 x 1 label vector containing the labels for the images

        http://yann.lecun.com/exdb/mnist/
        
        NOTE: The dataset consists of 60.000 training images and 10.000 test images.
    '''
    # 1) Download at least the two training .gz from http://yann.lecun.com/exdb/mnist/
    # 2) Don't rename them
    # 3) Unpack them to the path 'Datasets/MNIST/'
    X_train_all = read_idx('data/train-images.idx3-ubyte') # load training images
    X_train_all = np.reshape(X_train_all, (60000, 784))
    Y_train_all = read_idx('data/train-labels.idx1-ubyte') # load training labels
    X_train, Y_train = shuffle(X_train_all, Y_train_all, n_samples=n_samples, random_state=1)
    return X_train, Y_train

def read_idx(filename):
    """ A function that can read MNIST's idx file format into numpy arrays.
        Credits to https://gist.github.com/tylerneylon/ce60e8a06e7506ac45788443f7269e40
    """
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(shape)

In [13]:
X, y = load_mnist(n_samples=5000)

In [14]:
def NN_generalization_error(X, labels):
    '''Train 1-NN classifier on the training set of the low-dim data,
    and evaluate on the test set to get the error (1-accuracy).
    '''
    clf = KNeighborsClassifier(n_neighbors=1)
    # 5-fold cross-validation averaged accuracy
    avg_accuracy = cross_val_score(clf, X[:, :2], labels, cv=5).mean()
    error = 1 - avg_accuracy
    return error

In [15]:
def discretize_labels(sort_values, bins=5):
    bin_edges = np.linspace(sort_values.min(), sort_values.max(), bins)
    bin_edges[-1] += 1e-5  # Ajouter un petit écart à la limite supérieure pour éviter d'isoler un point
    return np.digitize(sort_values, bin_edges)

In [16]:
def apply_and_evaluate(X, _):
    # Initial setup
    results = []

    # Trier les données selon la troisième colonne
    sorted_indices = np.argsort(_)
    X = X = X[sorted_indices]

    # Discretiser les étiquettes après le tri
    y = discretize_labels(_[sorted_indices])
    
    # Apply PCA
    pca = PCA(n_components=20)
    X_pca = pca.fit_transform(X)
    
    # Evaluate k-NN with PCA
    pca_error = NN_generalization_error(X_pca, y)

    # Apply Isomap with GridSearchCV
    isomap_knn_pipeline = Pipeline([
        ('isomap', Isomap(n_components=20)),
        ('knn', KNeighborsClassifier(n_neighbors=1))
    ])
    param_grid = {'isomap__n_neighbors': [10, 15, 20, 25, 30, 35, 40, 45, 50]}
    grid_search = GridSearchCV(isomap_knn_pipeline, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X, y)
    best_n_neighbors = grid_search.best_params_['isomap__n_neighbors']
    best_isomap = Isomap(n_components=2, n_neighbors=best_n_neighbors)
    X_isomap = best_isomap.fit_transform(X)
    
    # Evaluate k-NN with best Isomap
    isomap_error = NN_generalization_error(X_isomap, y)
    
    # Trustworthiness and Continuity for PCA
    Q_pca = coranking.coranking_matrix(X, X_pca)
    trustworthiness_pca = trustworthiness(Q_pca, min_k=12, max_k=13)[0]
    continuity_pca = continuity(Q_pca,min_k=12, max_k=13)[0]
    results.append(['PCA', pca_error, trustworthiness_pca, continuity_pca])

    # Trustworthiness and Continuity for Isomap
    Q_isomap = coranking.coranking_matrix(X, X_isomap)
    trustworthiness_isomap = trustworthiness(Q_isomap, min_k=12, max_k=13)[0]
    continuity_isomap = continuity(Q_isomap, min_k=12, max_k=13)[0]
    results.append(['Isomap', isomap_error, trustworthiness_isomap, continuity_isomap])

    # Create DataFrame
    df = pd.DataFrame(results, columns=["Model", "Classification Error", "Trustworthiness", "Continuity"])

    return df

In [18]:
apply_and_evaluate(X,y)

Unnamed: 0,Model,Classification Error,Trustworthiness,Continuity
0,PCA,0.5392,0.99758,0.998778
1,Isomap,0.5046,0.759299,0.939698
