In [2]:
import scipy.io
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold, cross_val_score
from sklearn import svm
from feature_selection.inf_fs import inf_fs, select_inf_fs

In [3]:
# List the dataset files available under ./data (IPython shell escape).
!ls ./data

USPS.mat   leukemia.mat  lung_small.mat  madelon.mat
colon.mat  lung.mat	 lymphoma.mat


## Load dataset

In [4]:
# Load the colon dataset from its MATLAB .mat file.
mat = scipy.io.loadmat('./data/colon.mat')

# 'X' is used below as the feature matrix; 'Y' is indexed as a 2-D array,
# so its first column is taken as the 1-D label vector.
X = mat['X'].astype(float)
y = mat['Y'][:, 0]

# normalization: min-max scale every column to [0, 1]
col_min = X.min(axis=0)
col_range = X.max(axis=0) - col_min
# A constant column has zero range; dividing by it would produce NaN (0/0)
# that then propagates into the feature ranking and the classifier.
# Dividing by 1 instead maps such a column to all zeros.
col_range[col_range == 0] = 1.0
X = (X - col_min) / col_range

## Evaluation

In [5]:
num_features = [10,50,100,150,200]

# Run Inf-FS once on the full feature matrix. The call does not take the
# number of features to keep, so repeating it for every entry of
# `num_features` only recomputed the same thing.
# NOTE(review): assumes inf_fs is deterministic for fixed inputs — confirm.
# `rank` and `score` are the two values returned by inf_fs; presumably the
# feature ordering and the corresponding scores — verify against inf_fs.
rank, score = inf_fs(X, alpha=0.5, factor=0.9)

In [7]:
# Show the `score` array returned by inf_fs in the previous cell
# (a bare last expression is rendered as the cell output).
score

array([11.21034513, 11.12323936, 11.06361462, ...,  7.7075323 ,
        7.65002108,  7.58629822])

In [38]:
# Compare the 5-fold cross-validated ROC AUC of an SVM on the features chosen
# by select_inf_fs with a baseline that keeps the first `num_feat` columns of X.
num_features = [10,50,100,150,200]

# With shuffle=True and an integer random_state, every split() call yields the
# same folds, so one splitter serves both evaluations.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

for num_feat in num_features:
    X_sel = select_inf_fs(X, num_feat, alpha=0.5)
    X_naive = X[:, :num_feat]

    clf = svm.SVC()
    ifs_fold_scores = cross_val_score(clf, X_sel, y, cv=cv, scoring='roc_auc')
    naive_fold_scores = cross_val_score(clf, X_naive, y, cv=cv, scoring='roc_auc')

    # Report the mean over the five folds for each feature set.
    ifs_auc = ifs_fold_scores.mean()
    naive_auc = naive_fold_scores.mean()
    print('AUC @', num_feat, 'inf-fs:', ifs_auc, 'naive:', naive_auc)

AUC @ 10 inf-fs: 0.8012698412698412 naive: 0.482063492063492
AUC @ 50 inf-fs: 0.8457142857142858 naive: 0.8196825396825396
AUC @ 100 inf-fs: 0.8801587301587303 naive: 0.8641269841269843
AUC @ 150 inf-fs: 0.9033333333333333 naive: 0.8361904761904763
AUC @ 200 inf-fs: 0.8976190476190474 naive: 0.8698412698412697


In [39]:
# Compare the 5-fold cross-validated accuracy of an SVM on the features chosen
# by select_inf_fs with a baseline that keeps the first `num_feat` columns of X.
num_features = [10,50,100,150,200]

# With shuffle=True and an integer random_state, every split() call yields the
# same folds, so both feature sets are scored on identical splits.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

for num_feat in num_features:
    X_sel = select_inf_fs(X, num_feat, alpha=0.5)

    clf = svm.SVC()
    # Named *_acc because scoring='accuracy': these values are mean fold
    # accuracies, not AUCs.
    ifs_acc = cross_val_score(clf, X_sel, y, cv=cv, scoring='accuracy').mean()
    naive_acc = cross_val_score(clf, X[:, :num_feat], y, cv=cv, scoring='accuracy').mean()
    print('Accuracy @', num_feat, 'inf-fs:', ifs_acc, 'naive:', naive_acc)

Accuracy @ 10 inf-fs: 0.8346153846153846 naive: 0.5935897435897436
Accuracy @ 50 inf-fs: 0.8179487179487179 naive: 0.7538461538461537
Accuracy @ 100 inf-fs: 0.8358974358974359 naive: 0.7884615384615385
Accuracy @ 150 inf-fs: 0.8358974358974359 naive: 0.7384615384615384
Accuracy @ 200 inf-fs: 0.8358974358974359 naive: 0.7551282051282051
