In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import MDS, Isomap
from sklearn.manifold import LocallyLinearEmbedding as LLE
import glob
from PIL import Image
import cv2
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import warnings

# Methods definitions

In [2]:
def perform_pca(data, number_of_components=None, scale=False):
    
    if scale:
        scaler = StandardScaler()
        data = scaler.fit_transform(data)

    pca = PCA(n_components=number_of_components)
    components = pca.fit_transform(data)
    components_dataframe = pd.DataFrame(data=components)
    
    return components_dataframe, pca

def perform_kernel_pca(data, number_of_components=None, scale=False):
    
    if scale:
        scaler = StandardScaler()
        data = scaler.fit_transform(data)

    kpca = KernelPCA(kernel="rbf", n_components=number_of_components)
    components = kpca.fit_transform(data)
    components_dataframe = pd.DataFrame(data=components)
    
    return components_dataframe, kpca

def perform_mds(data, number_of_components, scale=False):
    # performing mds
    #MDS(n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, 
    #eps=0.001, n_jobs=None, random_state=None, dissimilarity=’euclidean’)[source]
    
    if scale:
        scaler = StandardScaler()
        data = scaler.fit_transform(data)
    
    mds = MDS(n_components=number_of_components)
    components = mds.fit_transform(data)
    components_dataframe = pd.DataFrame(data=components)
    return components_dataframe, mds

def perform_isomap(data, number_of_neighbors, number_of_components, scale=False):
    
    if scale:
        scaler = StandardScaler()
        data = scaler.fit_transform(data)
    
    iso = Isomap(n_neighbors=number_of_neighbors,n_components=number_of_components)
    components = iso.fit_transform(data)
    components_dataframe = pd.DataFrame(data=components)
    return components_dataframe, iso

def perform_lle(data, number_of_neighbors, number_of_components, scale=False):
    
    if scale:
        scaler = StandardScaler()
        data = scaler.fit_transform(data)
    
    lle = LLE(n_neighbors=number_of_neighbors,n_components=number_of_components)
    components = lle.fit_transform(data)
    components_dataframe = pd.DataFrame(data=components)
    return components_dataframe, lle

# Display functions

In [3]:
def display_data_with_colors(data, method_num=1, doScale=False,num_of_neighbors=5, dims=2, plot_kwargs={}):
    if method_num == 1:
        components_dataframe, methodUsed = perform_pca(data, scale=doScale)
        plt.title('PCA')
    elif method_num == 2:
        components_dataframe, methodUsed = perform_kernel_pca(data, scale=doScale)
        plt.title('Kernel PCA')
    elif method_num == 3:
        components_dataframe, methodUsed = perform_mds(data,number_of_components=dims, scale=doScale)
        plt.title('MDS')
    elif method_num == 4:
        components_dataframe, methodUsed = perform_isomap(data,num_of_neighbors, number_of_components=dims, scale=doScale)
        plt.title('Isomap')
    else:
        components_dataframe, methodUsed = perform_lle(data, num_of_neighbors, number_of_components=dims,scale=doScale)
        plt.title('LLE')
        
    if dims == 2:       
        if method_num != 1:
            plt.close()
        plt.scatter(components_dataframe[0], components_dataframe[1], **plot_kwargs)         
        plt.show()
    
    elif dims == 3:
        plt.close()
        ax = plt.axes(projection='3d')           
        ax.scatter3D(components_dataframe[0], components_dataframe[1], components_dataframe[2], **plot_kwargs);  
        plt.show()
    return components_dataframe, methodUsed

# Data preparation

In [4]:
data = pd.read_csv('datasets/Reuters_56k_PCA.csv')
data = data.sample(1500)
class_list = list(data.iloc[:,-1])
data.head()

Unnamed: 0,0.227411,0.0025741,-0.0825941,0.269994,-0.0793484,-0.40518,-0.71788,0.895774,0.0975026,0.156407,...,-0.162193,-0.43066,-0.24929,0.0299884,0.0990274,-0.444548,-0.13694,0.00377834,0.156475,C12
31162,0.072131,-0.584601,-0.025257,0.631383,-0.10406,-0.529088,-0.656113,0.79285,0.292663,0.411498,...,-0.331064,-0.344676,0.355008,-0.043083,0.395579,0.164662,-0.378728,-0.294894,0.147981,ECAT
49878,0.89367,-0.955136,-0.200345,-3.64907,0.445539,-0.278775,-0.364309,-0.463821,0.372447,-0.19403,...,0.083519,0.116592,-0.687323,0.14284,0.877762,0.227302,-0.21061,0.0813,-0.422372,M143
19078,-1.02974,-0.510967,0.166284,0.310524,-0.779703,-0.222953,-0.283479,0.634677,-0.106303,-0.039775,...,0.087637,-0.426125,0.988435,0.009068,0.136909,0.319947,-0.169158,0.026702,0.473382,E212
744,0.139193,-0.082092,-0.003165,0.267693,-0.217799,-0.222434,-0.657631,0.664895,0.265299,0.529308,...,-0.014347,-0.178547,-0.291711,0.036729,0.116982,-0.284864,0.001263,0.024503,0.092117,C12
19694,-2.04953,-0.41439,0.025681,0.322857,-4.69258,-1.92645,-1.54744,-1.12737,0.407975,-0.300699,...,0.036127,-0.045125,-0.340703,-0.055835,0.064967,0.191375,0.067467,-0.652097,-0.308477,E212


In [5]:
categoriesOrigin = data.T.tail(1)
categoriesOrigin = categoriesOrigin.iloc[0].tolist()
categories = list(set(categoriesOrigin))
data = data.iloc[:, :-1]

In [6]:
colors_dict = {
    '  E212': 'red',
    '  ECAT': 'blue',
    '  C12': 'green',
     '  GDIS': 'black',
    '  M143': 'orange',
    '  G154': 'purple',
    '  M131': 'grey',
    '  C151': 'brown'
    
}

colors = [colors_dict[c] for c in categoriesOrigin]
plot_setting = {
    's': 3,
    'c': colors
}

# Display without scaling (change arguments for scaled)

In [7]:
%matplotlib notebook
components, pca = display_data_with_colors(data,method_num=1, dims=2, plot_kwargs=plot_setting) #pca
#display_data_with_colors(data,method_num=1, dims=2, doScale=True, plot_kwargs=plot_setting) #scaled

<IPython.core.display.Javascript object>

In [8]:
components, pca = display_data_with_colors(data,method_num=1, dims=3, plot_kwargs=plot_setting) #pca
#display_data_with_colors(data,method_num=1, dims=3, doScale=True, plot_kwargs=plot_setting) #pca

<IPython.core.display.Javascript object>

In [9]:
components, kpca = display_data_with_colors(data,method_num=2, dims=2,plot_kwargs=plot_setting) #kernel pca
#display_data_with_colors(data,method_num=2, dims=2, doScale=True, plot_kwargs=plot_setting) #scaled

<IPython.core.display.Javascript object>

In [10]:
components, kpc = display_data_with_colors(data,method_num=2, dims=3, plot_kwargs=plot_setting) #kernel pca
#display_data_with_colors(data,method_num=2, dims=3, doScale=True, plot_kwargs=plot_setting) #kernel pca

<IPython.core.display.Javascript object>

In [11]:
components, mds = display_data_with_colors(data,method_num=3, dims=2,plot_kwargs=plot_setting) #mds
#display_data_with_colors(data,method_num=3, dims=2,doScale=True,plot_kwargs=plot_setting) #scaled

<IPython.core.display.Javascript object>

In [12]:
components, mds = display_data_with_colors(data,method_num=3, dims=3, plot_kwargs=plot_setting) #mds
#display_data_with_colors(data,method_num=3, dims=3, doScale=True, plot_kwargs=plot_setting) #scaled

<IPython.core.display.Javascript object>

In [13]:
components, isomap = display_data_with_colors(data,method_num=4, dims=2,plot_kwargs=plot_setting) #isomap
#display_data_with_colors(data,method_num=4, dims=2,doScale=True, plot_kwargs=plot_setting) #scaled

<IPython.core.display.Javascript object>

In [14]:
components, isomap = display_data_with_colors(data,method_num=4, dims=3, plot_kwargs=plot_setting) #isomap
#display_data_with_colors(data,method_num=4, dims=3, doScale=True, plot_kwargs=plot_setting) #scaled

<IPython.core.display.Javascript object>

In [15]:
components, lle = display_data_with_colors(data,method_num=5, dims=2,plot_kwargs=plot_setting) #lle

<IPython.core.display.Javascript object>

In [16]:
components, lle = display_data_with_colors(data,method_num=5, dims=3, plot_kwargs=plot_setting) #lle

<IPython.core.display.Javascript object>

# K nearest neighbours

In [26]:
def knn_stats(samples, labels, n_neighbours):
    nbrs = NearestNeighbors(n_neighbours, algorithm='brute').fit(samples)
    distances, indices = nbrs.kneighbors(samples)
    
    neighbour_stats = []
    
    for index, neighbours in enumerate(indices):
        same_cluster_neighbours = filter(
            lambda neighbour: labels[neighbour] == labels[index],
            neighbours
        )
        neighbour_stats.append(len(list(same_cluster_neighbours))/n_neighbours)
        
    return neighbour_stats

In [18]:
components_pca, pca = perform_pca(data, 2)
components_kpca, kpca = perform_kernel_pca(data, 2)
components_mds, mds = perform_mds(data, 2)
components_iso, iso = perform_isomap(data, 2, 5)
components_lle, lle = perform_lle(data, 5, 2)

In [27]:
nbors = 15
stats = knn_stats(components_pca, class_list, nbors)
print('pca: ', "%.0f%%" % (100*np.mean(stats)))

stats = knn_stats(components_kpca, class_list, nbors)
print('kpca: ', "%.0f%%" % (100*np.mean(stats)))

stats = knn_stats(components_mds, class_list, nbors)
print('mds: ', "%.0f%%" % (100*np.mean(stats)))

stats = knn_stats(components_iso, class_list, nbors)
print('isomap: ', "%.0f%%" % (100*np.mean(stats)))

stats = knn_stats(components_lle, class_list, nbors)
print('lle: ', "%.0f%%" % (100*np.mean(stats)))

pca:  67%
kpca:  59%
mds:  67%
isomap:  74%
lle:  62%


In [20]:
def k_nearest_neighbors(entry_dataset, computed_dataset, n_neighbors):
    
    nbrs_before = NearestNeighbors(n_neighbors, algorithm='brute').fit(entry_dataset)
    distances_before, indices_before = nbrs_before.kneighbors(entry_dataset)
    
    nbrs_after = NearestNeighbors(n_neighbors, algorithm='brute').fit(computed_dataset)
    distances_after, indices_after = nbrs_after.kneighbors(computed_dataset)
    
    neighbors_hit_list = []
    
    n = len(indices_before)
    
    for i in range(n):
        set_before= set(indices_before[i])
        set_after = set(indices_after[i])
        neighbors_hit_list.append(len(set_before.intersection(set_after)))
        
    return sum(neighbors_hit_list) / len(neighbors_hit_list) 

In [21]:
nbors = 15
stats = k_nearest_neighbors(data,components_pca, nbors)
print('pca: ', stats, ' out of ', nbors, ' neighbors')

stats = k_nearest_neighbors(data,components_kpca, nbors)
print('kpca: ', stats, ' out of ', nbors, ' neighbors')

stats = k_nearest_neighbors(data,components_mds, nbors)
print('mds: ', stats, ' out of ', nbors, ' neighbors')

stats = k_nearest_neighbors(data,components_iso, nbors)
print('isomap: ', stats, ' out of ', nbors, ' neighbors')

stats = k_nearest_neighbors(data,components_lle, nbors)
print('lle: ', stats, ' out of ', nbors, ' neighbors')

pca:  3.001333333333333  out of  15  neighbors
kpca:  2.962  out of  15  neighbors
mds:  4.023333333333333  out of  15  neighbors
isomap:  6.16  out of  15  neighbors
lle:  3.033333333333333  out of  15  neighbors
