In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_ogfs import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# glob.glob returns matches in arbitrary, filesystem-dependent order, while the
# cells below pick datasets by position (class_train[1], class_train[3], ...).
# Sorting makes those positional picks reproducible across machines.
class_train = sorted(glob.glob("NIPS/*_train.csv"))
print(class_train)

['NIPS\\arcene_train.csv', 'NIPS\\dexter_train.csv', 'NIPS\\dorothea_train.csv', 'NIPS\\gisette_train.csv', 'NIPS\\madelon_train.csv']


In [3]:
def train_label(fname):
    """Load the label file that sits next to a training CSV.

    The label path is derived from ``fname`` by swapping its trailing
    ``.csv`` extension for ``.labels``; the file is then parsed with
    ``pd.read_csv`` using its default settings.

    Parameters
    ----------
    fname : str or path-like
        Path of the training CSV, e.g. ``"NIPS/dexter_train.csv"``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` returns for the derived ``.labels`` path.
    """
    # Coerce so path-like objects work; on a pathlib.Path, ``.replace`` would
    # mean "rename the file", not a string substitution.
    fname = str(fname)
    suffix = ".csv"
    if fname.endswith(suffix):
        # Only swap the extension; a ".csv" appearing earlier in the path
        # (for example inside a directory name) must be left alone.
        targetname = fname[:-len(suffix)] + ".labels"
    else:
        # Legacy behaviour for names that do not end in ".csv".
        targetname = fname.replace(suffix, ".labels")
    return pd.read_csv(targetname)

In [4]:
def _score_model(mod, X, y):
    """Return accuracy, log-loss and flattened coefficient shape of ``mod`` on ``X``/``y``."""
    return {'accuracy': accuracy_score(y, mod.predict(X)),
            'logloss': log_loss(y, mod.predict_proba(X)),
            'feat_dim': mod.coef_.flatten().shape}


def get_performance(mod, fpath, mod_name):
    """Fit ``mod`` on one dataset and report its scores on that same data.

    The CSV at ``fpath`` is loaded (missing values replaced by 0) and its
    labels are obtained through ``train_label(fpath)``.

    * ``mod_name == 'Base'``: the model is fitted once on all columns of the
      un-normalised frame and scored on it.
    * otherwise: the frame is standardised and the columns are revealed in
      consecutive chunks to simulate a feature stream. The first chunk goes
      to ``mod.fit``; every later step calls ``mod.partial_fit`` with all
      columns revealed so far. For ``'Fast_OSFS'`` the model's
      ``_redundancy(..., mode='all')`` hook is invoked once after the stream.

    Parameters
    ----------
    mod : estimator
        Must provide ``fit``, ``predict``, ``predict_proba`` and ``coef_``;
        streaming models additionally need ``partial_fit``.
    fpath : str
        Path to the training CSV.
    mod_name : str
        Label that selects the code path above and tags progress messages.

    Returns
    -------
    dict
        ``{'accuracy': ..., 'logloss': ..., 'feat_dim': ...}`` computed on the
        training data itself; ``feat_dim`` is the shape tuple of the
        flattened ``coef_``.
    """
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    #mod = GraftingClassifier(max_iter=5)
    if mod_name == 'Base':
        # Baseline sees every feature at once; no streaming, no scaling.
        mod.fit(train1, y)
        return _score_model(mod, train1, y)

    # simulate streaming...
    # try splitting into groups of ~10,
    # if there is no splits, try ~5.
    n_features = train1.shape[1]
    train1_cols = np.array_split(range(n_features), int(n_features/10.0) + 1)
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(n_features), int(n_features/5.0) + 1)

    # lets normalise the dataset...
    # (std is floored at 1 so constant / low-variance columns are not blown up)
    train1 = (train1 - train1.mean())/(np.maximum(train1.std(), 1))

    # Loop invariants: column labels as an array (for positional lookup),
    # the progress-print stride, and whether this model prints at all.
    col_names = np.array(list(train1.columns))
    print_every = int((len(train1_cols)/10)+1)
    verbose = mod_name in ['Fast_OSFS', 'DPP', 'DPP3', 'OGFS']

    seen_cols = []  # labels of every column revealed so far, in stream order
    for idx, collist in enumerate(train1_cols):
        seen_cols.extend(col_names[collist])
        if idx == 0:
            mod.fit(train1[seen_cols], y)
        else:
            mod.partial_fit(train1[seen_cols], y)

        # debugging
        if verbose and idx % print_every == 0:
            print("\tmodel: {}, iter: {}".format(mod_name, idx))

    # for fast osfs
    if mod_name == 'Fast_OSFS':
        mod._redundancy(train1, y, mode='all')

    return _score_model(mod, train1, y)

In [5]:
def create_models():
    """Build a fresh list of ``(name, estimator)`` pairs to benchmark.

    A new set of unfitted estimators is created on every call so that runs
    on different datasets do not share fitted state. The names are the
    ``mod_name`` values understood by ``get_performance``.

    Returns
    -------
    list of (str, estimator)
        Currently Grafting, Fast_OSFS and the SGD logistic-regression
        baseline; the other variants are kept below, commented out.
    """
    # scikit-learn renamed SGDClassifier's logistic loss from 'log' to
    # 'log_loss' (1.1) and later dropped the old spelling (1.3). Pick
    # whichever spelling the installed version knows; if the lookup table is
    # unavailable, assume a modern release.
    known_losses = getattr(SGDClassifier, 'loss_functions', None)
    if known_losses is None or 'log_loss' in known_losses:
        logistic_loss = 'log_loss'
    else:
        logistic_loss = 'log'

    return [
        ('Grafting', GraftingClassifier(max_iter=5, random_state=42)),
        #('DPP', DPPClassifier(max_iter=5, random_state=42)),
        #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
        #('DPP3', DPPClassifier3(max_iter=5, random_state=42)),
        #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
        #('OSFS', OSFSClassifier(max_iter=5, random_state=42, fast_osfs=False)),
        ('Fast_OSFS', OSFSClassifier(max_iter=5, random_state=42)),
        ('Base', SGDClassifier(loss=logistic_loss, max_iter=5, random_state=42)),
    ]

In [6]:
#ex_dat = class_train[0]
#print(ex_dat, pd.read_csv(ex_dat).shape)
#models = create_models()
#for nm, mod in models:
#    print(nm, get_performance(mod, ex_dat, mod_name=nm))

In [None]:
# Benchmark every configured model on the dataset at index 1 of class_train.
ex_dat = class_train[1]
ex_shape = pd.read_csv(ex_dat).shape
print(ex_dat, ex_shape)

# Fresh estimators for this dataset; each is fitted and scored in turn.
models = create_models()
for nm, mod in models:
    scores = get_performance(mod, ex_dat, mod_name=nm)
    print(nm, scores)

NIPS\dexter_train.csv (300, 20000)
Grafting {'accuracy': 1.0, 'logloss': 0.0040602029275617216, 'feat_dim': (159,)}


  f = msb / msw


	model: Fast_OSFS, iter: 0










	model: Fast_OSFS, iter: 201








	model: Fast_OSFS, iter: 402








	model: Fast_OSFS, iter: 603






	model: Fast_OSFS, iter: 804






	model: Fast_OSFS, iter: 1005






	model: Fast_OSFS, iter: 1206






	model: Fast_OSFS, iter: 1407






	model: Fast_OSFS, iter: 1608




	model: Fast_OSFS, iter: 1809




		(300, 268)
		(300, 267)
		(300, 266)
		(300, 265)
		(300, 264)
		(300, 263)
		(300, 262)
		(300, 261)
		(300, 260)
		(300, 259)
		(300, 258)
		(300, 257)
		(300, 256)
		(300, 255)
		(300, 254)
		(300, 253)
		(300, 252)
		(300, 251)
		(300, 250)
		(300, 249)
		(300, 248)
		(300, 247)
		(300, 246)
		(300, 245)
		(300, 245)
		(300, 244)
		(300, 243)
		(300, 242)
		(300, 241)
		(300, 241)
		(300, 240)
		(300, 239)
		(300, 238)
		(300, 237)
		(300, 236)
		(300, 235)
		(300, 234)
		(300, 233)
		(300, 232)
		(300, 231)
		(300, 231)
		(300, 230)
		(300, 229)
		(300, 228)
		(300, 227)
		(300, 226)
		(300, 225)
		(300, 224)
		(300, 224)
		(300, 223)
		(300, 222)
		(300, 221)
		(300, 220)
		(300, 219)
		(300, 218)
		(300, 217)
		(300, 216)
		(300, 215)
		(300, 214)
		(300, 214)
		(300, 213)


  cor = -V_inv[0][1]/(np.sqrt(V_inv[0][0]*V_inv[1][1]))
  cor_m = np.minimum(cor_m, 0.9999)
  cor_m = np.maximum(cor_m, -0.9999)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0


		(300, 213)
		(300, 212)
		(300, 212)
		(300, 211)
		(300, 210)
		(300, 209)
		(300, 208)
		(300, 207)
		(300, 206)
		(300, 205)
		(300, 204)
		(300, 203)


In [None]:
#ex_dat = class_train[2]
#print(ex_dat, pd.read_csv(ex_dat).shape)
#models = create_models()
#for nm, mod in models:
#    print(nm, get_performance(mod, ex_dat, mod_name=nm))

In [None]:
# Benchmark every configured model on the dataset at index 3 of class_train.
ex_dat = class_train[3]
ex_shape = pd.read_csv(ex_dat).shape
print(ex_dat, ex_shape)

# Fresh estimators for this dataset; each is fitted and scored in turn.
models = create_models()
for nm, mod in models:
    scores = get_performance(mod, ex_dat, mod_name=nm)
    print(nm, scores)

In [None]:
# Benchmark every configured model on the dataset at index 4 of class_train.
ex_dat = class_train[4]
ex_shape = pd.read_csv(ex_dat).shape
print(ex_dat, ex_shape)

# Fresh estimators for this dataset; each is fitted and scored in turn.
models = create_models()
for nm, mod in models:
    scores = get_performance(mod, ex_dat, mod_name=nm)
    print(nm, scores)