In [24]:
import pandas as pd
from glob import glob
from tqdm import tqdm_notebook
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from evoml.subspacing import FeatureStackerFECV

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

def check_for_benchmarks(X_train, X_test, y_train, y_test, n_estimators):
    
    # The exploration of the dataset by benchmark algorithms
    clf = DecisionTreeClassifier(random_state=34092)
    clf.fit(X_train, y_train)
    pred_DTC = clf.predict(X_test)
    a = clf.score(X_test, y_test)
#     print('Base DecisionTreeClassifier accuracy: {}'.format(clf.score(X_test, y_test)))

    clf = RandomForestClassifier(random_state=34092, n_estimators=n_estimators)
    clf.fit(X_train, y_train)
    pred_RFC = clf.predict(X_test)
    b = clf.score(X_test, y_test)
#     print('Base RandomForestClassifier accuracy: {}'.format(clf.score(X_test, y_test)))

    clf = GradientBoostingClassifier(random_state=34092, n_estimators=n_estimators)
    clf.fit(X_train, y_train)
    pred_GBC = clf.predict(X_test)
    c = clf.score(X_test, y_test)
#     print('Base GradientBoostingClassifier accuracy: {}'.format(clf.score(X_test, y_test)))

#     print('')
    return a,b,c

In [28]:
evoml_params={
    'name': 'Experiment',
    'n_estimators': 10,
    'cv': 5,
    'n_population': 30,
    'ngen' : 5
}

In [33]:
logdump = []

for i,dataset in tqdm_notebook(enumerate(glob('C:/Users/harshnisar/Programming/data/*'))):
    if i<5: #Max number of datasets you want to do
        continue
        
    input_data = pd.read_csv(dataset, compression='gzip', sep='\t')
    X, y = input_data.iloc[:,:-1], input_data.iloc[:,-1] 
    
    
    for seed in tqdm_notebook(range(0,5)): #How many per dataset?
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed, stratify=y)
        dt_score, rf_score, gb_score = check_for_benchmarks(X_train, X_test, y_train, y_test, n_estimators=10)
        
        clf_dt = DecisionTreeClassifier(max_depth=None, random_state=34092)
        
        clf = FeatureStackerFECV(base_estimator=clf_dt, model_type = 'classification',
                                 N_individual=evoml_params['n_estimators'],
                                 ngen=evoml_params['ngen'], verbose_flag = False, N_population=evoml_params['n_population'],
                                 maxOrMin = 1,
                                 featMax = None, featMin=1, folds_CV=evoml_params['cv'])
        
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        final_test_score = accuracy_score(pred,y_test)

        
        final_train_fitness = clf.hof[0].fitness
        
        
        log ={
            'dataset': dataset,
            'seed': seed,
            'final_train_fitness': final_train_fitness,
            'final_test_score': final_test_score,
            'RF_holdout': rf_score,
            'DT_holdout': dt_score,
            'GB_holdout': gb_score,
            'nrows': X.shape[0],
            'ncols': X.shape[1]
        }
        
        logdump.append(log)






Exception ignored in: <bound method tqdm.__del__ of 29it [00:00, 370.92it/s]>
Traceback (most recent call last):
  File "C:\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 645, in __del__
    self.close()
  File "C:\Anaconda3\lib\site-packages\tqdm\_tqdm_notebook.py", line 208, in close
    if self.n < self.total:
TypeError: unorderable types: int() < NoneType()


ValueError: empty range for randrange() (1,1, 0)

In [35]:
logs = pd.DataFrame(logdump)

In [36]:
logs.dataset.unique().shape

(24,)

In [38]:
logs

Unnamed: 0,DT_holdout,GB_holdout,RF_holdout,dataset,final_test_score,final_train_fitness,ncols,nrows,seed
0,0.988335,0.983033,0.980912,C:/Users/harshnisar/Programming/data\allrep.cs...,0.992577,"(0.98515773110142324,)",29,3772,0
1,0.976670,0.979852,0.974549,C:/Users/harshnisar/Programming/data\allrep.cs...,0.978791,"(0.98267736658972571,)",29,3772,1
2,0.981972,0.972428,0.973489,C:/Users/harshnisar/Programming/data\allrep.cs...,0.983033,"(0.98409203614828145,)",29,3772,2
3,0.978791,0.971368,0.972428,C:/Users/harshnisar/Programming/data\allrep.cs...,0.981972,"(0.98409884048181928,)",29,3772,3
4,0.986214,0.985154,0.980912,C:/Users/harshnisar/Programming/data\allrep.cs...,0.987275,"(0.98515952134318341,)",29,3772,4
5,0.846154,0.692308,0.692308,C:/Users/harshnisar/Programming/data\analcatda...,0.846154,"(0.67500000000000004,)",4,50,0
6,0.615385,0.615385,0.692308,C:/Users/harshnisar/Programming/data\analcatda...,0.769231,"(0.67500000000000004,)",4,50,1
7,0.307692,0.461538,0.615385,C:/Users/harshnisar/Programming/data\analcatda...,0.461538,"(0.74285714285714288,)",4,50,2
8,0.538462,0.615385,0.538462,C:/Users/harshnisar/Programming/data\analcatda...,0.615385,"(0.69999999999999996,)",4,50,3
9,0.769231,0.692308,0.615385,C:/Users/harshnisar/Programming/data\analcatda...,0.692308,"(0.62976190476190474,)",4,50,4
