In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_ogfs import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# Discover the training CSVs. glob returns matches in a filesystem-dependent
# order, and the benchmark cells below pick datasets by position
# (class_train[1], class_train[2], ...), so sort to keep those indices stable.
class_train = sorted(glob.glob("uci/*_train.csv"))
print(class_train)

['uci\\Ionosphere_train.csv', 'uci\\spambase_train.csv', 'uci\\spectf_train.csv', 'uci\\wdbc_train.csv']


In [3]:
def train_label(fname):
    """Load the label file that sits next to a training CSV.

    The label file has the same path as the CSV but ends in ``.labels``
    instead of ``.csv`` (e.g. ``uci/wdbc_train.csv`` ->
    ``uci/wdbc_train.labels``).

    Parameters
    ----------
    fname : str
        Path to the feature CSV; must end in ``.csv``.

    Returns
    -------
    pandas.DataFrame
        The label file as parsed by ``pd.read_csv`` with default options.

    Raises
    ------
    ValueError
        If ``fname`` does not end in ``.csv``; without this check the
        feature file itself would be read back as its own labels.
    """
    suffix = ".csv"
    if not fname.endswith(suffix):
        raise ValueError(
            "expected a path ending in '.csv', got %r" % (fname,))
    # Swap only the trailing extension: str.replace would also rewrite any
    # other ".csv" occurrence in the path (e.g. inside a directory name).
    targetname = fname[:-len(suffix)] + ".labels"
    return pd.read_csv(targetname)

In [4]:
def get_performance(mod, fpath, base=False):
    """Fit ``mod`` on one dataset and score it on that same data.

    Parameters
    ----------
    mod : estimator
        Object exposing ``fit``, ``predict``, ``predict_proba`` and
        ``coef_``; ``partial_fit`` is also required when ``base`` is False.
    fpath : str
        Path to the feature CSV. Labels are loaded via ``train_label(fpath)``.
    base : bool, default False
        If True, fit once on all columns (no normalisation, no streaming).
        If False, normalise the features and feed the columns to the model
        in growing groups to simulate features arriving as a stream.

    Returns
    -------
    dict
        ``'accuracy'`` and ``'logloss'`` computed on the training data, and
        ``'feat_dim'``, the shape of the flattened ``mod.coef_``.
    """
    features = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    def _summarise(frame):
        # Metrics are in-sample: the model is scored on the data it saw.
        return {'accuracy': accuracy_score(y, mod.predict(frame)),
                'logloss': log_loss(y, mod.predict_proba(frame)),
                'feat_dim': mod.coef_.flatten().shape}

    if base:
        # NOTE(review): the baseline is fitted on the raw (un-normalised)
        # features, unlike the streaming path below -- confirm this is the
        # intended comparison.
        mod.fit(features, y)
        return _summarise(features)

    # Simulate streaming: aim for column groups of roughly 10; if that
    # yields a single group, fall back to groups of roughly 5.
    n_features = features.shape[1]
    col_groups = np.array_split(range(n_features), int(n_features/10.0) + 1)
    if len(col_groups) == 1:
        col_groups = np.array_split(range(n_features), int(n_features/5.0) + 1)

    # Centre each column and divide by its std, but never by less than 1.
    features = (features - features.mean())/(np.maximum(features.std(), 1))

    col_names = np.array(list(features.columns))
    seen = []  # positional indices of every column revealed so far
    for step, group in enumerate(col_groups):
        seen.extend(list(group))
        if step == 0:
            # The first group initialises the model.
            mod.fit(features[list(col_names[group])], y)
        else:
            # Later groups are appended; the model sees all columns so far.
            mod.partial_fit(features[list(col_names[seen])], y)

    return _summarise(features)

In [5]:
def create_models():
    """Return a fresh list of ``(name, estimator)`` pairs to benchmark.

    Every estimator is built with ``max_iter=5`` and ``random_state=42``.
    The entry named ``'Base'`` is a plain ``SGDClassifier`` with the
    logistic loss; the others are the streaming feature-selection models.

    Currently disabled (re-add as a ``(name, estimator)`` pair to enable):
    'DPP' (DPPClassifier), 'DPP2' (DPPClassifier2), 'DPP3' (DPPClassifier3),
    'OGFS' (OGFSClassifier).
    """
    shared = dict(max_iter=5, random_state=42)
    models = [
        ('Grafting', GraftingClassifier(**shared)),
        ('OSFS', OSFSClassifier(**shared)),
    ]
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
    # verify against the installed version before upgrading.
    models.append(('Base', SGDClassifier(loss='log', **shared)))
    return models

In [6]:
# ex_dat = class_train[0]
# print(ex_dat, pd.read_csv(ex_dat).shape)
# models = create_models()
# for nm, mod in models:
#     if nm != 'Base':
#         print(nm, get_performance(mod, ex_dat))
#     else:
#         print(nm, get_performance(mod, ex_dat, base=True))

In [7]:
# Benchmark every model on the dataset at position 1 of class_train.
ex_dat = class_train[1]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    # Only the 'Base' entry takes the non-streaming (base=True) path.
    print(nm, get_performance(mod, ex_dat, base=(nm == 'Base')))

uci\spambase_train.csv (4601, 57)
Grafting {'accuracy': 0.92088676374701151, 'logloss': 0.2698326804341154, 'feat_dim': (50,)}


n_components was set to n_samples, which results in inefficient evaluation of the full kernel.
  d_inv = np.sqrt(np.diag(np.diag(K)))
  cor = -V_inv[0][1]/(np.sqrt(V_inv[0][0]*V_inv[1][1]))
  cor_m = np.minimum(cor_m, 0.9999)
  cor_m = np.maximum(cor_m, -0.9999)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0
  cor = -V_inv[0][1]/(np.sqrt(V_inv[0][0]*V_inv[1][1]))


OSFS {'accuracy': 0.77548359052379923, 'logloss': 0.53692952801399851, 'feat_dim': (6,)}
Base {'accuracy': 0.708324277331015, 'logloss': 10.070713136799997, 'feat_dim': (57,)}


  np.exp(prob, prob)


In [8]:
# Benchmark every model on the dataset at position 2 of class_train.
ex_dat = class_train[2]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    # Only the 'Base' entry takes the non-streaming (base=True) path.
    print(nm, get_performance(mod, ex_dat, base=(nm == 'Base')))

uci\spectf_train.csv (267, 44)
Grafting {'accuracy': 0.797752808988764, 'logloss': 0.5767922374772857, 'feat_dim': (37,)}
OSFS {'accuracy': 0.79400749063670417, 'logloss': 0.93774189105411632, 'feat_dim': (1,)}
Base {'accuracy': 0.79400749063670417, 'logloss': 7.1147292199254215, 'feat_dim': (44,)}


In [9]:
# Benchmark every model on the dataset at position 3 of class_train.
ex_dat = class_train[3]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    # Only the 'Base' entry takes the non-streaming (base=True) path.
    print(nm, get_performance(mod, ex_dat, base=(nm == 'Base')))

uci\wdbc_train.csv (569, 30)
Grafting {'accuracy': 0.94200351493848855, 'logloss': 0.19962258813204223, 'feat_dim': (24,)}
OSFS {'accuracy': 0.91739894551845347, 'logloss': 0.18703953784697067, 'feat_dim': (3,)}
Base {'accuracy': 0.91564147627416526, 'logloss': 2.9136401879713767, 'feat_dim': (30,)}


  np.exp(prob, prob)
