In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_ogfs import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# Collect every training CSV in the microarray folder.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to give the list a deterministic order on every machine.
class_train = sorted(glob.glob("microarray/*_train.csv"))
print(class_train)

['microarray\\colon_train.csv', 'microarray\\leukemia_train.csv', 'microarray\\lung_cancer_train.csv', 'microarray\\prostate_train.csv']


In [3]:
def train_label(fname):
    """Read the label file that sits next to a training CSV.

    The label path is derived from ``fname`` by replacing every occurrence
    of ``".csv"`` with ``".labels"``; the resulting file is parsed with
    ``pd.read_csv`` using its default options.

    Parameters
    ----------
    fname : str
        Path of the training CSV.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the ``.labels`` file.
    """
    return pd.read_csv(fname.replace(".csv", ".labels"))

In [4]:
def _score_model(mod, features, y):
    """Summarise a fitted model on ``features``: accuracy, log-loss and coefficient count."""
    return {'accuracy': accuracy_score(y, mod.predict(features)),
            'logloss': log_loss(y, mod.predict_proba(features)),
            'feat_dim': mod.coef_.flatten().shape}


def get_performance(mod, fpath, mod_name):
    """Fit ``mod`` on one dataset and report its scores on that same data.

    The features are read from ``fpath`` (missing values filled with 0) and
    the labels come from ``train_label(fpath)``, flattened to 1-D.

    * ``mod_name == 'Base'``: the model is fitted once on all raw
      (un-normalised) columns.
    * any other name: the features are normalised and then fed to the model
      as a growing set of columns to simulate a feature stream. The first
      chunk goes through ``mod.fit``; every later step calls
      ``mod.partial_fit`` with all columns seen so far.

    Note that the metrics are computed on the data the model was fitted on,
    so they describe training-set performance only.

    Parameters
    ----------
    mod : estimator
        Object providing ``fit``, ``predict``, ``predict_proba`` and
        ``coef_``; streaming models must also provide ``partial_fit``.
    fpath : str
        Path of the training CSV.
    mod_name : str
        Label of the model. ``'Base'`` selects the one-shot path,
        ``'Fast_OSFS'`` triggers an extra ``_redundancy`` call after the
        stream, and a few names enable progress printing.

    Returns
    -------
    dict
        ``{'accuracy': float, 'logloss': float, 'feat_dim': tuple}`` where
        ``feat_dim`` is the shape of the flattened ``mod.coef_``.
    """
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    #mod = GraftingClassifier(max_iter=5)
    if mod_name == 'Base':
        # Baseline needs no column chunks, so return before building them.
        mod.fit(train1, y)
        return _score_model(mod, train1, y)

    # simulate streaming...
    # try splitting into groups of ~10 columns; if that gives a single group
    # (fewer than 10 columns), retry with groups of ~5.
    n_cols = train1.shape[1]
    train1_cols = np.array_split(range(n_cols), int(n_cols/10.0) + 1)
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(n_cols), int(n_cols/5.0) + 1)

    # lets normalise the dataset...
    # The divisor is floored at 1, so columns with std < 1 are only centred.
    train1 = (train1 - train1.mean())/(np.maximum(train1.std(), 1))

    # Loop invariants, computed once instead of on every chunk.
    col_names = np.array(list(train1.columns))
    print_every = int((len(train1_cols)/10)+1)
    verbose = mod_name in ['Fast_OSFS', 'DPP', 'DPP3', 'OGFS']

    all_cols = []
    for idx, collist in enumerate(train1_cols):
        # Positions of every column revealed so far, in arrival order.
        all_cols.extend(list(collist))
        column_list = list(col_names[all_cols])
        if idx == 0:
            mod.fit(train1[column_list], y)
        else:
            mod.partial_fit(train1[column_list], y)

        # debugging: report progress roughly ten times over the stream
        if verbose and idx % print_every == 0:
            print("\tmodel: {}, iter: {}".format(mod_name, idx))

    # for fast osfs
    # NOTE(review): `_redundancy` is defined by the project classifier; it is
    # presumably a final redundancy-pruning pass over all features - confirm.
    if mod_name == 'Fast_OSFS':
        mod._redundancy(train1, y, mode='all')

    return _score_model(mod, train1, y)

In [5]:
def create_models():
    """Build a fresh list of ``(name, estimator)`` pairs to benchmark.

    Every call constructs new, unfitted estimators. The names matter:
    ``get_performance`` branches on ``'Base'`` and ``'Fast_OSFS'``.

    Returns
    -------
    list of (str, estimator)
        Grafting, Fast_OSFS and the SGD logistic baseline, in that order.
    """
    # Settings shared by every estimator in the comparison.
    shared = {'max_iter': 5, 'random_state': 42}

    models = [
        ('Grafting', GraftingClassifier(**shared)),
        ('Fast_OSFS', OSFSClassifier(**shared)),
        # NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
        # confirm against the installed version before upgrading.
        ('Base', SGDClassifier(loss='log', **shared)),
    ]

    # Currently disabled candidates (add to the list above to re-enable):
    #   ('DPP', DPPClassifier(max_iter=5, random_state=42))
    #   ('DPP2', DPPClassifier2(max_iter=5, random_state=42))
    #   ('DPP3', DPPClassifier3(max_iter=5, random_state=42))
    #   ('OGFS', OGFSClassifier(max_iter=5, random_state=42))
    #   ('OSFS', OSFSClassifier(max_iter=5, random_state=42, fast_osfs=False))
    return models

In [6]:
# Pick the example dataset by file name rather than by list position, so the
# choice does not depend on the order in which the paths were discovered.
EXAMPLE_FILE = "prostate_train.csv"
ex_dat = next((path for path in class_train if path.endswith(EXAMPLE_FILE)), None)
if ex_dat is None:
    raise FileNotFoundError("{} not found among {}".format(EXAMPLE_FILE, class_train))

print(ex_dat, pd.read_csv(ex_dat).shape)

# Run every configured model on the example dataset and print its scores.
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, mod_name=nm))

microarray\prostate_train.csv (102, 12600)
Grafting {'accuracy': 1.0, 'logloss': 0.0015230494955966597, 'feat_dim': (153,)}
	model: Fast_OSFS, iter: 0
	model: Fast_OSFS, iter: 127
	model: Fast_OSFS, iter: 254
	model: Fast_OSFS, iter: 381
	model: Fast_OSFS, iter: 508
	model: Fast_OSFS, iter: 635
	model: Fast_OSFS, iter: 762
	model: Fast_OSFS, iter: 889
	model: Fast_OSFS, iter: 1016
	model: Fast_OSFS, iter: 1143
		(102, 5537)
		(102, 5536)
		(102, 5535)
		(102, 5534)
		(102, 5533)
		(102, 5532)
		(102, 5531)
		(102, 5530)
		(102, 5529)
		(102, 5528)
		(102, 5527)
		(102, 5526)
		(102, 5525)
		(102, 5524)
		(102, 5523)
		(102, 5522)
		(102, 5521)
		(102, 5520)
		(102, 5519)
		(102, 5518)
		(102, 5517)
		(102, 5516)
		(102, 5515)
		(102, 5514)
		(102, 5513)
		(102, 5512)
		(102, 5511)
		(102, 5510)
		(102, 5509)
		(102, 5508)
		(102, 5507)
		(102, 5506)
		(102, 5505)
		(102, 5504)
		(102, 5503)
		(102, 5502)
		(102, 5501)
		(102, 5500)
		(102, 5499)
		(102, 5498)
		(102, 5497)
		(102, 5496

		(102, 4986)
		(102, 4985)
		(102, 4984)
		(102, 4983)
		(102, 4982)
		(102, 4981)
		(102, 4980)
		(102, 4979)
		(102, 4978)
		(102, 4977)
		(102, 4976)
		(102, 4975)
		(102, 4974)
		(102, 4973)
		(102, 4972)
		(102, 4971)
		(102, 4970)
		(102, 4969)
		(102, 4968)
		(102, 4967)
		(102, 4966)
		(102, 4965)
		(102, 4964)
		(102, 4963)
		(102, 4962)
		(102, 4961)
		(102, 4960)
		(102, 4959)
		(102, 4958)
		(102, 4957)
		(102, 4956)
		(102, 4955)
		(102, 4954)
		(102, 4953)
		(102, 4952)
		(102, 4951)
		(102, 4950)
		(102, 4949)
		(102, 4948)
		(102, 4947)
		(102, 4946)
		(102, 4945)
		(102, 4944)
		(102, 4943)
		(102, 4942)
		(102, 4941)
		(102, 4940)
		(102, 4939)
		(102, 4938)
		(102, 4937)
		(102, 4936)
		(102, 4935)
		(102, 4934)
		(102, 4933)
		(102, 4932)
		(102, 4931)
		(102, 4930)
		(102, 4929)
		(102, 4928)
		(102, 4927)
		(102, 4926)
		(102, 4925)
		(102, 4924)
		(102, 4923)
		(102, 4922)
		(102, 4921)
		(102, 4920)
		(102, 4919)
		(102, 4918)
		(102, 4917)
		(102, 4916)
		(102

		(102, 4405)
		(102, 4404)
		(102, 4403)
		(102, 4402)
		(102, 4401)
		(102, 4400)
		(102, 4399)
		(102, 4398)
		(102, 4397)
		(102, 4396)
		(102, 4395)
		(102, 4394)
		(102, 4393)
		(102, 4392)
		(102, 4391)
		(102, 4390)
		(102, 4389)
		(102, 4388)
		(102, 4387)
		(102, 4386)
		(102, 4385)
		(102, 4384)
		(102, 4383)
		(102, 4382)
		(102, 4381)
		(102, 4380)
		(102, 4379)
		(102, 4378)
		(102, 4377)
		(102, 4376)
		(102, 4375)
		(102, 4374)
		(102, 4374)
		(102, 4373)
		(102, 4372)
		(102, 4371)
		(102, 4370)
		(102, 4369)
		(102, 4368)
		(102, 4367)
		(102, 4366)
		(102, 4365)
		(102, 4364)
		(102, 4363)
		(102, 4362)
		(102, 4361)
		(102, 4360)
		(102, 4359)
		(102, 4358)
		(102, 4357)
		(102, 4356)
		(102, 4355)
		(102, 4354)
		(102, 4353)
		(102, 4352)
		(102, 4351)
		(102, 4350)
		(102, 4349)
		(102, 4348)
		(102, 4347)
		(102, 4346)
		(102, 4345)
		(102, 4344)
		(102, 4343)
		(102, 4342)
		(102, 4341)
		(102, 4340)
		(102, 4339)
		(102, 4338)
		(102, 4337)
		(102, 4336)
		(102

		(102, 3822)
		(102, 3821)
		(102, 3820)
		(102, 3819)
		(102, 3818)
		(102, 3817)
		(102, 3816)
		(102, 3815)
		(102, 3814)
		(102, 3813)
		(102, 3812)
		(102, 3811)
		(102, 3810)
		(102, 3809)
		(102, 3808)
		(102, 3807)
		(102, 3806)
		(102, 3805)
		(102, 3804)
		(102, 3803)
		(102, 3802)
		(102, 3801)
		(102, 3800)
		(102, 3799)
		(102, 3798)
		(102, 3797)
		(102, 3796)
		(102, 3795)
		(102, 3794)
		(102, 3793)
		(102, 3792)
		(102, 3791)
		(102, 3790)
		(102, 3789)
		(102, 3788)
		(102, 3787)
		(102, 3786)
		(102, 3785)
		(102, 3784)
		(102, 3783)
		(102, 3782)
		(102, 3781)
		(102, 3780)
		(102, 3779)
		(102, 3778)
		(102, 3777)
		(102, 3776)
		(102, 3775)
		(102, 3774)
		(102, 3773)
		(102, 3772)
		(102, 3771)
		(102, 3770)
		(102, 3769)
		(102, 3768)
		(102, 3767)
		(102, 3766)
		(102, 3765)
		(102, 3764)
		(102, 3763)
		(102, 3762)
		(102, 3761)
		(102, 3760)
		(102, 3759)
		(102, 3758)
		(102, 3757)
		(102, 3756)
		(102, 3756)
		(102, 3755)
		(102, 3754)
		(102, 3753)
		(102

		(102, 3247)
		(102, 3246)
		(102, 3245)
		(102, 3244)
		(102, 3243)
		(102, 3242)
		(102, 3241)
		(102, 3240)
		(102, 3239)
		(102, 3238)
		(102, 3237)
		(102, 3236)
		(102, 3235)
		(102, 3234)
		(102, 3233)
		(102, 3232)
		(102, 3231)
		(102, 3230)
		(102, 3229)
		(102, 3228)
		(102, 3228)
		(102, 3227)
		(102, 3226)
		(102, 3225)
		(102, 3224)
		(102, 3223)
		(102, 3222)
		(102, 3221)
		(102, 3220)
		(102, 3219)
		(102, 3218)
		(102, 3217)
		(102, 3216)
		(102, 3215)
		(102, 3214)
		(102, 3213)
		(102, 3212)
		(102, 3211)
		(102, 3210)
		(102, 3209)
		(102, 3208)
		(102, 3207)
		(102, 3206)
		(102, 3205)
		(102, 3204)
		(102, 3203)
		(102, 3202)
		(102, 3201)
		(102, 3200)
		(102, 3199)
		(102, 3198)
		(102, 3197)
		(102, 3196)
		(102, 3195)
		(102, 3194)
		(102, 3193)
		(102, 3192)
		(102, 3191)
		(102, 3190)
		(102, 3189)
		(102, 3188)
		(102, 3187)
		(102, 3186)
		(102, 3185)
		(102, 3184)
		(102, 3183)
		(102, 3182)
		(102, 3181)
		(102, 3180)
		(102, 3179)
		(102, 3178)
		(102

		(102, 2663)
		(102, 2662)
		(102, 2661)
		(102, 2660)
		(102, 2659)
		(102, 2658)
		(102, 2657)
		(102, 2656)
		(102, 2655)
		(102, 2654)
		(102, 2653)
		(102, 2652)
		(102, 2651)
		(102, 2650)
		(102, 2649)
		(102, 2648)
		(102, 2647)
		(102, 2646)
		(102, 2645)
		(102, 2644)
		(102, 2644)
		(102, 2643)
		(102, 2642)
		(102, 2641)
		(102, 2640)
		(102, 2639)
		(102, 2638)
		(102, 2637)
		(102, 2636)
		(102, 2635)
		(102, 2634)
		(102, 2633)
		(102, 2632)
		(102, 2631)
		(102, 2630)
		(102, 2629)
		(102, 2628)
		(102, 2627)
		(102, 2626)
		(102, 2625)
		(102, 2624)
		(102, 2623)
		(102, 2622)
		(102, 2621)
		(102, 2620)
		(102, 2619)
		(102, 2618)
		(102, 2617)
		(102, 2616)
		(102, 2615)
		(102, 2614)
		(102, 2613)
		(102, 2612)
		(102, 2611)
		(102, 2610)
		(102, 2609)
		(102, 2608)
		(102, 2607)
		(102, 2606)
		(102, 2605)
		(102, 2604)
		(102, 2603)
		(102, 2602)
		(102, 2601)
		(102, 2600)
		(102, 2599)
		(102, 2598)
		(102, 2597)
		(102, 2596)
		(102, 2595)
		(102, 2594)
		(102

		(102, 2080)
		(102, 2079)
		(102, 2078)
		(102, 2077)
		(102, 2076)
		(102, 2076)
		(102, 2075)
		(102, 2074)
		(102, 2073)
		(102, 2072)
		(102, 2071)
		(102, 2070)
		(102, 2069)
		(102, 2068)
		(102, 2067)
		(102, 2066)
		(102, 2065)
		(102, 2064)
		(102, 2063)
		(102, 2062)
		(102, 2061)
		(102, 2060)
		(102, 2059)
		(102, 2058)
		(102, 2057)
		(102, 2056)
		(102, 2055)
		(102, 2054)
		(102, 2053)
		(102, 2052)
		(102, 2051)
		(102, 2050)
		(102, 2049)
		(102, 2048)
		(102, 2047)
		(102, 2046)
		(102, 2045)
		(102, 2044)
		(102, 2043)
		(102, 2042)
		(102, 2041)
		(102, 2040)
		(102, 2039)
		(102, 2038)
		(102, 2037)
		(102, 2036)
		(102, 2035)
		(102, 2034)
		(102, 2033)
		(102, 2032)
		(102, 2031)
		(102, 2030)
		(102, 2029)
		(102, 2028)
		(102, 2027)
		(102, 2026)
		(102, 2025)
		(102, 2024)
		(102, 2023)
		(102, 2022)
		(102, 2021)
		(102, 2020)
		(102, 2019)
		(102, 2018)
		(102, 2017)
		(102, 2016)
		(102, 2015)
		(102, 2014)
		(102, 2013)
		(102, 2012)
		(102, 2011)
		(102

		(102, 1502)
		(102, 1501)
		(102, 1500)
		(102, 1499)
		(102, 1498)
		(102, 1497)
		(102, 1496)
		(102, 1495)
		(102, 1494)
		(102, 1493)
		(102, 1492)
		(102, 1491)
		(102, 1490)
		(102, 1489)
		(102, 1488)
		(102, 1487)
		(102, 1486)
		(102, 1485)
		(102, 1484)
		(102, 1483)
		(102, 1482)
		(102, 1481)
		(102, 1480)
		(102, 1479)
		(102, 1478)
		(102, 1477)
		(102, 1476)
		(102, 1475)
		(102, 1474)
		(102, 1473)
		(102, 1472)
		(102, 1471)
		(102, 1470)
		(102, 1469)
		(102, 1468)
		(102, 1467)
		(102, 1466)
		(102, 1465)
		(102, 1464)
		(102, 1463)
		(102, 1462)
		(102, 1461)
		(102, 1460)
		(102, 1459)
		(102, 1458)
		(102, 1457)
		(102, 1456)
		(102, 1455)
		(102, 1454)
		(102, 1453)
		(102, 1452)
		(102, 1451)
		(102, 1450)
		(102, 1449)
		(102, 1448)
		(102, 1447)
		(102, 1446)
		(102, 1445)
		(102, 1444)
		(102, 1443)
		(102, 1442)
		(102, 1441)
		(102, 1440)
		(102, 1439)
		(102, 1438)
		(102, 1437)
		(102, 1436)
		(102, 1435)
		(102, 1434)
		(102, 1433)
		(102, 1432)
		(102

		(102, 912)
		(102, 911)
		(102, 910)
		(102, 909)
		(102, 908)
		(102, 907)
		(102, 906)
		(102, 905)
		(102, 904)
		(102, 903)
		(102, 902)
		(102, 901)
		(102, 900)
		(102, 899)
		(102, 898)
		(102, 897)
		(102, 896)
		(102, 895)
		(102, 894)
		(102, 893)
		(102, 892)
		(102, 891)
		(102, 890)
		(102, 890)
		(102, 889)
		(102, 888)
		(102, 887)
		(102, 886)
		(102, 886)
		(102, 885)
		(102, 884)
		(102, 883)
		(102, 882)
		(102, 881)
		(102, 880)
		(102, 879)
		(102, 878)
		(102, 877)
		(102, 876)
		(102, 875)
		(102, 874)
		(102, 873)
		(102, 872)
		(102, 871)
		(102, 870)
		(102, 869)
		(102, 868)
		(102, 867)
		(102, 866)
		(102, 865)
		(102, 864)
		(102, 863)
		(102, 862)
		(102, 861)
		(102, 860)
		(102, 859)
		(102, 858)
		(102, 857)
		(102, 856)
		(102, 855)
		(102, 854)
		(102, 853)
		(102, 852)
		(102, 851)
		(102, 850)
		(102, 849)
		(102, 848)
		(102, 847)
		(102, 846)
		(102, 845)
		(102, 844)
		(102, 843)
		(102, 842)
		(102, 841)
		(102, 840)
		(102, 839)
		(102, 838)

		(102, 288)
		(102, 287)
		(102, 286)
		(102, 285)
		(102, 284)
		(102, 283)
		(102, 282)
		(102, 281)
		(102, 280)
		(102, 279)
		(102, 278)
		(102, 277)
		(102, 276)
		(102, 275)
		(102, 274)
		(102, 273)
		(102, 272)
		(102, 271)
		(102, 270)
		(102, 269)
		(102, 268)
		(102, 267)
		(102, 266)
		(102, 265)
		(102, 264)
		(102, 263)
		(102, 262)
		(102, 261)
		(102, 260)
		(102, 259)
		(102, 258)
		(102, 257)
		(102, 256)
		(102, 255)
		(102, 254)
		(102, 253)
		(102, 252)
		(102, 251)
		(102, 250)
		(102, 249)
		(102, 248)
		(102, 247)
		(102, 246)
		(102, 245)
		(102, 244)
		(102, 243)
		(102, 242)
		(102, 241)
		(102, 240)
		(102, 239)
		(102, 238)
		(102, 237)
		(102, 236)
		(102, 235)
		(102, 234)
		(102, 233)
		(102, 232)
		(102, 231)
		(102, 230)
		(102, 229)
		(102, 228)
		(102, 227)
		(102, 226)
		(102, 225)
		(102, 224)
		(102, 223)
		(102, 222)
		(102, 221)
		(102, 220)
		(102, 219)
		(102, 218)
		(102, 217)
		(102, 216)
		(102, 215)
		(102, 214)
		(102, 213)
		(102, 212)

  d_inv = np.sqrt(np.diag(np.diag(K)))
  cor = -V_inv[0][1]/(np.sqrt(V_inv[0][0]*V_inv[1][1]))
  cor_m = np.minimum(cor_m, 0.9999)
  cor_m = np.maximum(cor_m, -0.9999)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0


		(102, 98)
		(102, 98)
		(102, 98)
		(102, 98)
		(102, 98)
		(102, 98)


  cor = -V_inv[0][1]/(np.sqrt(V_inv[0][0]*V_inv[1][1]))


		(102, 98)
		(102, 98)
		(102, 98)
		(102, 98)
		(102, 97)


n_components was set to n_samples, which results in inefficient evaluation of the full kernel.


		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
		(102, 97)
Fast_OSFS {'accuracy': 0.41176470588235292, 'logloss': 10.777240152258324, 'feat_dim': (3220,)}
Base {'accuracy': 0.91176470588235292, 'logloss': 3.0475390936685902, 'feat_dim': (12600,)}


  np.exp(prob, prob)
