In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import (accuracy_score, precision_score, classification_report,
                             recall_score, f1_score, log_loss)

In [2]:
from pattern_clf import *
from datasets import *

# Estimation

In [3]:
# Estimator under evaluation, constructed with no arguments (i.e. its own
# defaults). LazyPatternClassifier is not defined in this notebook; presumably
# it is provided by the `from pattern_clf import *` star import - verify.
clf = LazyPatternClassifier()

In [7]:
# Hyper-parameter search space.
# NOTE: np.logspace(-8, -6, 1) asks for a single sample, so only 1e-8 is
# produced and the stop exponent (-6) is never reached; the grid therefore
# has 1 x 3 x 2 = 6 candidates.
param_grid = {
    'tolerance': np.logspace(-8, -6, 1),
    'weights_iters': [0, 1, 3],
    'weight_classifiers': [False, True],
}

# Seeded, shuffled 3-fold split so the folds are the same on every run.
folds = KFold(n_splits=3, shuffle=True, random_state=495)

# Exhaustive search scored on both F1 and accuracy. refit=False: no final
# estimator is refit on the full data after the search.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=['f1', 'accuracy'],
    cv=folds,
    refit=False,
    n_jobs=1,
    verbose=2,
    return_train_score=True,
)

In [5]:
# (display name, loader callable) pairs for the datasets to evaluate.
# Entries that are commented out are skipped; uncomment a line to include
# that dataset in the run. The get_* callables are not defined in this
# notebook - presumably they come from the `datasets` star import (verify).
loaders = [
    #('Breast Cancer', get_breast_cancer),
    ('Heart Disease', get_heart_disease),
    #('Mammographic Mass', get_mammographic_mass),
    #('Seismic Bumps', get_seismic_bumps),
    #('Titanic', get_titanic),
    #('Breast GSE', get_breast_GSE),
    #('Liver GSE', get_liver_GSE),
    #('Prostate GSE', get_prostate_GSE),
]

In [9]:
# Run the grid search once per selected dataset and keep each search's
# cv_results_ table, keyed by the dataset's display name.
result = dict()
for dataset_name, load_dataset in loaders:
    X, y = load_dataset()
    grid.fit(X, y)
    # `grid` is reused for the next dataset, so store a shallow copy of the
    # results produced by this fit.
    result[dataset_name] = dict(grid.cv_results_)
    print('Completed:', dataset_name)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] tolerance=1e-08, weight_classifiers=False, weights_iters=0 ......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tolerance=1e-08, weight_classifiers=False, weights_iters=0, total=   8.8s
[CV] tolerance=1e-08, weight_classifiers=False, weights_iters=0 ......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.3s remaining:    0.0s


[CV]  tolerance=1e-08, weight_classifiers=False, weights_iters=0, total=   8.6s
[CV] tolerance=1e-08, weight_classifiers=False, weights_iters=0 ......
[CV]  tolerance=1e-08, weight_classifiers=False, weights_iters=0, total=   9.1s
[CV] tolerance=1e-08, weight_classifiers=False, weights_iters=1 ......
[CV]  tolerance=1e-08, weight_classifiers=False, weights_iters=1, total=   9.7s
[CV] tolerance=1e-08, weight_classifiers=False, weights_iters=1 ......
[CV]  tolerance=1e-08, weight_classifiers=False, weights_iters=1, total=   9.9s
[CV] tolerance=1e-08, weight_classifiers=False, weights_iters=1 ......
[CV]  tolerance=1e-08, weight_classifiers=False, weights_iters=1, total=  10.0s
[CV] tolerance=1e-08, weight_classifiers=False, weights_iters=3 ......
[CV]  tolerance=1e-08, weight_classifiers=False, weights_iters=3, total=   9.0s
[CV] tolerance=1e-08, weight_classifiers=False, weights_iters=3 ......
[CV]  tolerance=1e-08, weight_classifiers=False, weights_iters=3, total=   9.7s
[CV] tolerance

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  4.6min finished


In [10]:
# Mean cross-validated F1 score, one entry per parameter candidate.
# `grid` is re-fitted for every dataset in the loop above, so this shows the
# most recently fitted dataset only.
grid.cv_results_['mean_test_f1']

array([0.8182409 , 0.8182409 , 0.8182409 , 0.8182409 , 0.65124579,
       0.69435626])

In [12]:
import os
import pickle
import stat

# Persist the raw grid-search results so the evaluation can be redone without
# repeating the (minutes-long) search.
results_file = 'results.pkl'
with open(results_file, 'wb') as f:
    pickle.dump(result, f)

# Make the saved file read-only. This is a pure-Python replacement for the
# `!chmod -w` shell escape, so the cell is valid Python and also works where
# no `chmod` binary exists.
# NOTE: once the file is read-only, re-running this cell will normally fail
# with PermissionError when opening it for writing. That presumably is the
# intended guard against overwriting the results; restore write permission
# (or delete the file) by hand to save a new run.
write_bits = stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH
current_mode = stat.S_IMODE(os.stat(results_file).st_mode)
os.chmod(results_file, current_mode & ~write_bits)

# Evaluation

In [16]:
import pickle

# The evaluation section starts from the results saved on disk, so it can be
# run without repeating the grid search.
# The previous version of this cell re-opened the file with 'wb' and dumped
# `result` again; since results.pkl is made read-only right after it is first
# written, that write normally fails with PermissionError.
# NOTE: pickle.load can execute arbitrary code - only load a results.pkl that
# was produced by this notebook.
with open('results.pkl', 'rb') as f:
    result = pickle.load(f)

In [18]:
# One DataFrame per dataset, built from that dataset's stored results table.
frames = {
    dataset_name: pd.DataFrame(cv_results)
    for dataset_name, cv_results in result.items()
}

In [19]:
# Tag each per-dataset frame (in place) with the dataset it belongs to ...
for dataset_name in frames:
    frames[dataset_name]['dataset'] = dataset_name

# ... then stack them into one long table with a fresh 0..n-1 index.
stacked = pd.concat(list(frames.values()))
df = stacked.reset_index(drop=True)

# Peek at five random rows.
df.sample(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tolerance,param_use_softmax,param_weight_classifiers,param_weights_iters,param_weights_strategy,params,...,split2_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,mean_train_accuracy,std_train_accuracy,dataset
47,0.015586,0.011309,0.0,0.0,1e-06,True,True,5,uniform,"{'tolerance': 1e-06, 'use_softmax': True, 'wei...",...,,,,16,,,,,,Mammographic Mass
30,0.004163,0.000248,0.0,0.0,1e-06,True,True,5,from_objects,"{'tolerance': 1e-06, 'use_softmax': True, 'wei...",...,,,,15,,,,,,Heart Disease
11,0.004342,0.000244,0.0,0.0,1e-06,True,False,5,uniform,"{'tolerance': 1e-06, 'use_softmax': True, 'wei...",...,,,,12,,,,,,Breast Cancer
41,0.004466,5.6e-05,0.0,0.0,1e-06,True,False,1,uniform,"{'tolerance': 1e-06, 'use_softmax': True, 'wei...",...,,,,10,,,,,,Mammographic Mass
55,0.007545,0.002996,0.0,0.0,1e-08,True,True,5,uniform,"{'tolerance': 1e-08, 'use_softmax': True, 'wei...",...,,,,8,,,,,,Titanic


In [20]:
# Keep only the searched hyper-parameters, the dataset label and the two mean
# test scores; every other results column is dropped from `df`.
keep_columns = [
    'param_tolerance',
    'param_weight_classifiers',
    'param_weights_iters',
    'param_weights_strategy',
    'dataset',
    'mean_test_accuracy',
    'mean_test_f1',
]
df = df[keep_columns]
df

Unnamed: 0,param_tolerance,param_weight_classifiers,param_weights_iters,param_weights_strategy,dataset,mean_test_accuracy,mean_test_f1
0,1e-08,False,1,from_objects,Breast Cancer,,
1,1e-08,False,1,uniform,Breast Cancer,,
2,1e-08,False,5,from_objects,Breast Cancer,,
3,1e-08,False,5,uniform,Breast Cancer,,
4,1e-08,True,1,from_objects,Breast Cancer,,
5,1e-08,True,1,uniform,Breast Cancer,,
6,1e-08,True,5,from_objects,Breast Cancer,,
7,1e-08,True,5,uniform,Breast Cancer,,
8,1e-06,False,1,from_objects,Breast Cancer,,
9,1e-06,False,1,uniform,Breast Cancer,,


In [21]:
# Hyper-parameter columns that identify one grid candidate, in the order in
# which they become index levels of the pivoted table.
params = [
    'param_tolerance',
    'param_weight_classifiers',
    'param_weights_iters',
    'param_weights_strategy',
]

# Score column to compare across datasets; swap in the line below to use F1.
target = 'mean_test_accuracy'
#target = 'mean_test_f1'

In [None]:
# Pivot the long results table into one row per hyper-parameter combination
# and one column per dataset, holding the `target` score.

# Index by all hyper-parameter columns at once. set_index returns a new frame,
# so `df` itself is left untouched.
df2 = df.set_index(params)

# Assumes every dataset contributes the same parameter combinations in the
# same order (one contiguous block of rows per dataset), so the index of the
# first block labels the rows for all datasets. If a dataset has a different
# number of rows, the DataFrame constructor below raises ValueError.
rows_per_dataset = df2.shape[0] // df2['dataset'].nunique()

# Build the table as a fresh DataFrame rather than dropping and assigning
# columns on a slice of df2, which is chained assignment and triggers
# SettingWithCopyWarning.
df3 = pd.DataFrame(
    {
        dataset: df2.loc[df2['dataset'] == dataset, target].to_numpy()
        for dataset in df2['dataset'].unique()
    },
    index=df2.index[:rows_per_dataset],
)
df3

In [24]:
# Display the pivoted score table: rows are hyper-parameter combinations,
# columns are datasets.
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Breast Cancer,Heart Disease,Mammographic Mass,Titanic
param_tolerance,param_weight_classifiers,param_weights_iters,param_weights_strategy,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1e-08,False,1,from_objects,0.704753,0.792079,0.777111,0.774411
1e-08,False,1,uniform,0.704753,0.792079,0.777111,0.774411
1e-08,False,5,from_objects,0.704753,0.792079,0.777111,0.774411
1e-08,False,5,uniform,0.704753,0.792079,0.777111,0.774411
1e-08,True,1,from_objects,0.372598,0.610561,0.78073,0.755331
1e-08,True,1,uniform,0.704753,0.792079,0.777111,0.774411
1e-08,True,5,from_objects,0.372598,0.610561,0.78073,0.755331
1e-08,True,5,uniform,0.704753,0.792079,0.777111,0.774411
1e-06,False,1,from_objects,0.704753,0.792079,0.777111,0.774411
1e-06,False,1,uniform,0.704753,0.792079,0.777111,0.774411


In [29]:
# View the score table as an N-d array: one axis per hyper-parameter (in index
# level order) followed by a final dataset axis. The shape is derived from the
# index levels and the column count instead of being hard-coded, so it follows
# the grid and the number of datasets (2, 2, 2, 2, 4 for the table above).
# Relies on df3 holding the full parameter grid with the last index level
# varying fastest; otherwise reshape raises ValueError or misplaces values.
val = df3.values.reshape(df3.index.levshape + (df3.shape[1],))

In [51]:
# Best score per dataset over the first three parameter axes, restricted to
# the first value (index 0) of the fourth parameter axis (weights_strategy).
val[..., [0], :].max(axis=(0, 1, 2))

array([[0.704753, 0.792079, 0.78073 , 0.774411]])

In [50]:
# All scores for the second value (index 1) of the fourth parameter axis
# (weights_strategy), keeping that axis with length 1.
val[..., [1], :]

array([[[[[0.704753, 0.792079, 0.777111, 0.774411]],

         [[0.704753, 0.792079, 0.777111, 0.774411]]],


        [[[0.704753, 0.792079, 0.777111, 0.774411]],

         [[0.704753, 0.792079, 0.777111, 0.774411]]]],



       [[[[0.704753, 0.792079, 0.777111, 0.774411]],

         [[0.704753, 0.792079, 0.777111, 0.774411]]],


        [[[0.704753, 0.792079, 0.777111, 0.774411]],

         [[0.704753, 0.792079, 0.777111, 0.774411]]]]])