In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_ogfs import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order, but the
# cells below pick datasets by position (class_train[0], [1], [2]).  Sorting
# makes those indices stable across machines and re-runs.
class_train = sorted(glob.glob("uci/*_train.csv"))
print(class_train)

['uci\\BreastCancer_train.csv', 'uci\\Ionosphere_train.csv', 'uci\\PimaIndiansDiabetes_train.csv']


In [3]:
def train_label(fname):
    """Load the label file that accompanies a feature CSV.

    The label path is derived from ``fname`` by swapping its trailing
    ``.csv`` extension for ``.labels``; the result is read with
    ``pd.read_csv``.

    Parameters
    ----------
    fname : str
        Path to the feature file, e.g. ``"uci/Foo_train.csv"``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the derived ``.labels`` path.
    """
    suffix = ".csv"
    if fname.endswith(suffix):
        # Swap only the trailing extension: a blanket str.replace would also
        # rewrite a ".csv" that appears earlier in the path (e.g. inside a
        # directory name) and point at a file that does not exist.
        targetname = fname[:-len(suffix)] + ".labels"
    else:
        # No trailing ".csv": keep the historical substitution behaviour.
        targetname = fname.replace(suffix, ".labels")
    return pd.read_csv(targetname)

In [4]:
def get_performance(mod, fpath, base=False):
    """Fit ``mod`` on the dataset at ``fpath`` and score it on that same data.

    Parameters
    ----------
    mod : estimator
        Object providing ``fit``, ``predict``, ``predict_proba`` and ``coef_``;
        ``partial_fit`` is also required when ``base`` is false.
    fpath : str
        Path of the feature CSV.  Missing values are filled with 0 and the
        labels are obtained through ``train_label(fpath)``.
    base : bool, default False
        When true, fit once on every column.  Otherwise the columns are fed
        to the model in growing groups to simulate a feature stream.

    Returns
    -------
    dict
        ``'accuracy'`` and ``'logloss'`` computed on the fitting data, plus
        ``'feat_dim'``, the shape of the flattened ``mod.coef_``.
    """
    features = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    if base:
        # Baseline: a single fit with all columns available at once.
        mod.fit(features, y)
    else:
        # Simulate streaming: aim for groups of ~10 columns, and fall back to
        # ~5 when the first attempt yields only one group.
        n_cols = features.shape[1]
        col_groups = np.array_split(range(n_cols), int(n_cols / 10.0) + 1)
        if len(col_groups) == 1:
            col_groups = np.array_split(range(n_cols), int(n_cols / 5.0) + 1)

        col_names = np.array(list(features.columns))
        seen = []
        for step, group in enumerate(col_groups):
            seen.extend(list(group))
            if step == 0:
                # First group initialises the model.
                mod.fit(features[list(col_names[group])], y)
            else:
                # Later steps see every column released so far.
                mod.partial_fit(features[list(col_names[seen])], y)

    return {
        'accuracy': accuracy_score(y, mod.predict(features)),
        'logloss': log_loss(y, mod.predict_proba(features)),
        'feat_dim': mod.coef_.flatten().shape,
    }

In [18]:
def create_models():
    """Build a fresh list of ``(name, estimator)`` pairs to benchmark.

    Every estimator is constructed with ``max_iter=5`` and ``random_state=42``.
    The final entry, ``'Base'``, is a plain ``SGDClassifier`` with
    ``loss='log'``.
    """
    common = {'max_iter': 5, 'random_state': 42}
    streaming = [
        ('Grafting', GraftingClassifier),
        ('DPP', DPPClassifier),
        ('DPP2', DPPClassifier2),
        ('DPP3', DPPClassifier3),
        ('OGFS', OGFSClassifier),
        ('OSFS', OSFSClassifier),
    ]
    models = [(label, factory(**common)) for label, factory in streaming]
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version before upgrading.
    models.append(('Base', SGDClassifier(loss='log', **common)))
    return models

In [6]:
# Benchmark every model on the third entry of class_train; only the 'Base'
# model takes the single-fit (non-streaming) path.
ex_dat = class_train[2]
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, base=(nm == 'Base')))

  np.exp(prob, prob)
  V_norm = V[choose_item, :]/Vj[choose_item]
  choose_item = np.random.choice(range(len(P_list)), 1, p=P_norm)[0]
  z = (T - mn - correction) / se
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Grafting {'accuracy': 0.65104166666666663, 'logloss': 12.052593846140708, 'feat_dim': (8,)}
DPP {'accuracy': 0.65494791666666663, 'logloss': 11.917676750848088, 'feat_dim': (4,)}
DPP2 {'accuracy': 0.64973958333333337, 'logloss': 12.097566211238245, 'feat_dim': (5,)}
DPP3 {'accuracy': 0.63411458333333337, 'logloss': 12.571086838711411, 'feat_dim': (7,)}
OGFS {'accuracy': 0.50520833333333337, 'logloss': 17.058408630870893, 'feat_dim': (6,)}
OSFS {'accuracy': 0.65494791666666663, 'logloss': 11.570228328980377, 'feat_dim': (3,)}
Base {'accuracy': 0.63541666666666663, 'logloss': 12.592262227311187, 'feat_dim': (8,)}


  z_score = 0.5*np.log((1+cor_m)/(1-cor_m))


In [14]:
# Benchmark every model on the first entry of class_train; only the 'Base'
# model takes the single-fit (non-streaming) path.
ex_dat = class_train[0]
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, base=(nm == 'Base')))

Grafting {'accuracy': 0.97138769670958514, 'logloss': 0.91856623906017976, 'feat_dim': (9,)}
DPP {'accuracy': 0.94706723891273248, 'logloss': 1.6185424629236025, 'feat_dim': (5,)}
DPP2 {'accuracy': 0.9570815450643777, 'logloss': 1.4133663631023548, 'feat_dim': (6,)}
DPP3 {'accuracy': 0.87267525035765381, 'logloss': 3.9070534432835231, 'feat_dim': (9,)}


  z = (T - mn - correction) / se
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


OGFS {'accuracy': 0.94992846924177399, 'logloss': 1.5791605522077283, 'feat_dim': (7,)}
OSFS {'accuracy': 0.90414878397711018, 'logloss': 2.998981862509567, 'feat_dim': (2,)}
Base {'accuracy': 0.96995708154506433, 'logloss': 0.9308724123193598, 'feat_dim': (9,)}


  z_score = 0.5*np.log((1+cor_m)/(1-cor_m))


In [17]:
# Benchmark every model on the second entry of class_train; only the 'Base'
# model takes the single-fit (non-streaming) path.
ex_dat = class_train[1]
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, base=(nm == 'Base')))

Grafting {'accuracy': 0.89743589743589747, 'logloss': 1.9542522745571684, 'feat_dim': (33,)}
DPP {'accuracy': 0.88319088319088324, 'logloss': 1.3414946864222554, 'feat_dim': (22,)}
DPP2 {'accuracy': 0.85470085470085466, 'logloss': 1.1251716041792057, 'feat_dim': (22,)}


  S_c2 = np.diag(s_b)/np.diag(s_w)   # OGFS (criterion2)
  X = s_b/s_w
  prev_score = np.sum(curr_u1)/np.sum(curr_u2)
  score = ((np.sum(test_u1)/np.sum(test_u2)) - prev_score)
  score = ((np.sum(test_u1)/np.sum(test_u2)) - prev_score)
  eval2 = s_b/s_w
  eval2 = np.diag(s_b)/np.diag(s_w)
  x = asanyarray(arr - arrmean)
  return (self.a <= x) & (x <= self.b)
  return (self.a <= x) & (x <= self.b)


DPP3 {'accuracy': 0.86609686609686609, 'logloss': 1.1260011462820467, 'feat_dim': (21,)}
OGFS {'accuracy': 0.89743589743589747, 'logloss': 1.2220482424845152, 'feat_dim': (25,)}


  z_score = 0.5*np.log((1+cor_m)/(1-cor_m))


OSFS {'accuracy': 0.88888888888888884, 'logloss': 1.2373781425976114, 'feat_dim': (21,)}
Base {'accuracy': 0.91737891737891741, 'logloss': 1.9154349277794036, 'feat_dim': (34,)}
