In [9]:
import matplotlib
import matplotlib.pyplot as plt
import scipy.io
from skfeature.function.similarity_based import lap_score
from skfeature.function.similarity_based import SPEC
from skfeature.function.sparse_learning_based import MCFS
from skfeature.function.sparse_learning_based import NDFS
from skfeature.function.sparse_learning_based import UDFS
from skfeature.function.statistical_based import low_variance
from skfeature.utility.sparse_learning import feature_ranking
from skfeature.utility import construct_W
from skfeature.utility import unsupervised_evaluation
import pandas as pd
import numpy as np
import re
from time import time
matplotlib.use('qt4agg')
from skfeature.utility import construct_W
from skfeature.utility import unsupervised_evaluation
from sklearn import metrics

In [11]:
def lapScore(filename, X, y, Xselected, Nclusters):
    
    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs_W)

    t0 = time()
    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)
    print filename+"_lapScore_scoring_time elapsed: %.2fs" % (time() - t0)
    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)
    df = pd.DataFrame(idx)
    df = df.assign(score=score)
    np.savetxt(filename + "_Ranking_LapScore.csv", df, delimiter=",")
    

    t1 = time()
    NMI = pd.Series()
    ACC = pd.Series()
    nmi_total = 0
    acc_total = 0
    # perform kmeans clustering based on the selected features and repeats ### times
    for i in range(0, Xselected-Nclusters):
        selected_features = X[:, idx[0:i+Nclusters]]
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=Nclusters, y=y)
        NMI = NMI.append(pd.Series(float(nmi)), ignore_index=True)
        ACC = ACC.append(pd.Series(float(acc)), ignore_index=True)
        nmi_total += nmi
        acc_total += acc
    print filename+"_laplace_evaluation_time elapsed: %.2fs" % (time() - t1)
    print 'AVG ACC:', float(acc_total)/Xselected
    print 'AVG NMI:', float(nmi_total)/Xselected
    
    
    plt.figure(1)
    plt.plot(NMI, label='laplace')
    plt.figure(2)
    plt.plot(ACC, label='laplace')
    
    np.savetxt(filename+"_NMI_laplace.csv", NMI, delimiter=",")
    np.savetxt(filename+"_ACC_laplace.csv", ACC, delimiter=",")

In [12]:
def spec(filename, X, y, Xselected, Nclusters):
    
    # specify the second ranking function which uses all except the 1st eigenvalue
    kwargs = {'style': 0}

    # obtain the scores of features
    t0 = time()
    score = SPEC.spec(X, **kwargs)
    print filename+"_spec_scoring_time elapsed: %.2fs" % (time() - t0)
    # sort the feature scores in an ascending order according to the feature scores
    idx = SPEC.feature_ranking(score, **kwargs)
    df = pd.DataFrame(idx)
    df = df.assign(score=score)
    np.savetxt(filename + "_Ranking_spec.csv", df, delimiter=",")
    
    
    t1 = time()
    NMI = pd.Series()
    ACC = pd.Series()
    nmi_total = 0
    acc_total = 0
    # perform kmeans clustering based on the selected features and repeats ### times
    for i in range(0, Xselected-Nclusters):
        selected_features = X[:, idx[0:i+Nclusters]]
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=Nclusters, y=y)
        NMI = NMI.append(pd.Series(float(nmi)), ignore_index=True)
        ACC = ACC.append(pd.Series(float(acc)), ignore_index=True)
        nmi_total += nmi
        acc_total += acc
    print filename+"_spec_evaluation_time elapsed: %.2fs" % (time() - t1)
    print 'AVG ACC:', float(acc_total)/Xselected
    print 'AVG NMI:', float(nmi_total)/Xselected
    
        
    plt.figure(1)
    plt.plot(NMI, label='spec')
    plt.figure(2)
    plt.plot(ACC, label='spec')
    
    np.savetxt(filename+"_NMI_spec.csv", NMI, delimiter=",")
    np.savetxt(filename+"_ACC_spec.csv", ACC, delimiter=",")
    

In [13]:
def mcfs(filename, X, y, Xselected, Nclusters):
    
    # construct affinity matrix
    kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix
    t0 = time()
    Weight = MCFS.mcfs(X, n_selected_features=Xselected, W=W, n_clusters=Nclusters)
    print filename+"_mcfs_scoring_time elapsed: %.2fs" % (time() - t0)
    # sort the feature scores in an ascending order according to the feature scores
    idx = MCFS.feature_ranking(Weight)
#    df = pd.DataFrame(idx)
#    df = df.assign(weight=Weight)
#    np.savetxt(filename + "_Ranking_mcfs.csv", df, delimiter=",")
    
    
    t1 = time()
    NMI = pd.Series()
    ACC = pd.Series()
    nmi_total = 0
    acc_total = 0
    # perform kmeans clustering based on the selected features and repeats ### times
    for i in range(0, Xselected-Nclusters):
        selected_features = X[:, idx[0:i+Nclusters]]
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=Nclusters, y=y)
        NMI = NMI.append(pd.Series(float(nmi)), ignore_index=True)
        ACC = ACC.append(pd.Series(float(acc)), ignore_index=True)
        nmi_total += nmi
        acc_total += acc
    print filename+"_mcfs_evaluation_time elapsed: %.2fs" % (time() - t1)
    print 'AVG ACC:', float(acc_total)/Xselected
    print 'AVG NMI:', float(nmi_total)/Xselected
    
        
    plt.figure(1)
    plt.plot(NMI, label='mcfs')
    plt.figure(2)
    plt.plot(ACC, label='mcfs')
    
    np.savetxt(filename+"_NMI_mcfs.csv", NMI, delimiter=",")
    np.savetxt(filename+"_ACC_mcfs.csv", ACC, delimiter=",")
    

In [16]:
def tda(filename, X, y, Xselected, Nclusters):
    
    csv = pd.read_csv('../data/TDA_'+filename+'.csv')
    noLabel = csv.loc[1:,:]
    #filtered = noLabel.loc[noLabel[' P-value'] < 2]
    replaced = noLabel['Column Name'].map(lambda x: re.sub(r'Column_', '', x))
    idx = pd.to_numeric(replaced).values
    
    t1 = time()
    NMI = pd.Series()
    ACC = pd.Series()
    nmi_total = 0
    acc_total = 0
    # perform kmeans clustering based on the selected features and repeats ### times
    for i in range(0, Xselected-Nclusters):
        selected_features = X[:, idx[0:i+Nclusters]]
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=Nclusters, y=y)
        NMI = NMI.append(pd.Series(float(nmi)), ignore_index=True)
        ACC = ACC.append(pd.Series(float(acc)), ignore_index=True)
        nmi_total += nmi
        acc_total += acc
    print filename+"_tda_evaluation_time elapsed: %.2fs" % (time() - t1)
    print 'AVG ACC:', float(acc_total)/Xselected
    print 'AVG NMI:', float(nmi_total)/Xselected
    
    
    plt.figure(1)
    plt.plot(NMI, label="TDA", markevery=10)   
    plt.figure(2) 
    plt.plot(ACC, label="TDA", markevery=10)

    
    np.savetxt(filename+"_NMI_tda.csv", NMI, delimiter=",")
    np.savetxt(filename+"_ACC_tda.csv", ACC, delimiter=",")

# MAIN

In [None]:
# setting for loading, modeling and evaluation
num_fea = [210, 220, 240, 202, 203, 204, 207, 209, 211, 202, 202, 202]        
n_clusters = [10, 20, 40, 2, 3, 4, 7, 9, 11, 2, 2, 2]
filename = ["USPS","COIL20","ORL", "ALLAML","CLL_SUB_111","TOX_171","lung_discrete","lymphoma", "Carcinom","BASEHOCK", "PCMAC", "RELATHE"]


# linear execution based on the data file
for i in range(0, len(filename)):      

    mat = scipy.io.loadmat('../data/' + filename[i] + '.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    lapScore(filename[i], X, y, num_fea[i], n_clusters[i])
    spec(filename[i], X, y, num_fea[i], n_clusters[i])
    mcfs(filename[i], X, y, num_fea[i], n_clusters[i])
    tda(filename[i], X, y, num_fea[i], n_clusters[i])

    plt.figure(1)
    plt.xlabel("Number of features")
    plt.ylabel("Normalized Mutual Information Score")
    plt.figure(2)
    plt.xlabel("Number of features")
    plt.ylabel("Accuracy")

    plt.figure(1)
    plt.ylim(0,0.9)
    plt.title(filename[i]+'_NMI')
    plt.savefig(filename[i] +'_NMI.png', dpi=200)
    plt.figure(2)
    plt.ylim(0,0.9)
    plt.title(filename[i]+'_ACC')
    plt.savefig(filename[i] +'_ACC.png', dpi=200)

    plt.close('all')

USPS_lapScore_scoring_time elapsed: 1.06s
USPS_laplace_evaluation_time elapsed: 458.29s
AVG ACC: 0.41418789499
AVG NMI: 0.391502618532
USPS_spec_scoring_time elapsed: 253.46s


# Print data in the memory when unexpected error occurs

In [24]:
plt.figure(1)
plt.xlabel("Number of features")
plt.ylabel("Normalized Mutual Information Score")
plt.figure(2)
plt.xlabel("Number of features")
plt.ylabel("Accuracy")

plt.figure(1)
plt.ylim(0,0.9)
plt.title('Carcinom_NMI')
plt.savefig('Carcinom_NMI.png', dpi=200)
plt.figure(2)
plt.ylim(0,0.9)
plt.title('Carcinom_ACC')
plt.savefig('Carcinom_ACC.png', dpi=200)

plt.close('all')