In [1]:
# import usual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# our system imports
from aideme import *

%matplotlib inline

In [2]:
def plot_fscore(metrics):
    """Plot the F-score averaged over several exploration runs.

    Parameters
    ----------
    metrics : iterable of iterables of dict-like
        One entry per run. Each run yields one metrics mapping per position;
        the position (0, 1, 2, ...) becomes the x-axis value. Positions whose
        mapping has no 'fscore' value are left out of that run's curve.

    Raises
    ------
    ValueError
        If `metrics` contains no runs (there is nothing to average).
    """
    df_list = [pd.DataFrame.from_dict({i: metric for i, metric in enumerate(ls)}, orient='index') for ls in metrics]
    if not df_list:
        # Without this guard the average below degenerates to 0 / 0.
        raise ValueError('plot_fscore needs the metrics of at least one run')

    # Series addition aligns on the index, so a position is averaged only when
    # every run reported an F-score there; otherwise it is NaN and not drawn.
    avg = sum(df['fscore'].dropna() for df in df_list) / len(df_list)

    ax = avg.plot(ylim=[0, 1], marker='o')
    ax.set(xlabel='iteration', ylabel='average f-score')
    plt.show()

In [3]:
# SYNTHETIC DATA SET
N = 200_000  # number of points
dim = 2  # number of features
PARTITION = [[0], [1]]  # column indices of each subspace; alternative: [[0, 1], [2, 3], [4], [5]]
limit = 0.06  # alternative: 0.63
rng = np.random.RandomState(0)

# Remember to standardize the data in general; for this uniform distribution
# it should be fine without it.
X = rng.uniform(low=-2, high=2, size=(N, dim))

# Partial labels, one column per subspace: 1.0 when every coordinate of the
# subspace has absolute value below `limit`, 0.0 otherwise.
inside_region = [np.all(np.abs(X[:, p]) < limit, axis=1) for p in PARTITION]
y_subspace = np.vstack(inside_region).T.astype('float')

# Final label: a point is positive only if all of its partial labels are.
y = np.min(y_subspace, axis=1)

labeled_set = LabeledSet(y, y_subspace)

# Report the percentage of positive points.
selectivity = 100 * y.sum() / len(y)
print('selectivity :', selectivity, '%')

# Optional: visualize the data distribution (blue = positive, red = negative).
#plt.figure(figsize=(10,8))
#plt.scatter(X[:, 0], X[:, 1], s=0.05, c=['b' if lb else 'r' for lb in y])
#plt.show()

selectivity : 0.093 %


In [5]:
# EXPLORATION SETTINGS
REPEAT = 1  # how many times the exploration process is repeated
NUMBER_OF_ITERATIONS = 50  # number of points to be labeled by the user

SUBSAMPLING = None

# Initial sampling: one random positive and one random negative point.
INITIAL_SAMPLER = stratified_sampler(labeled_set, pos=1, neg=1)
#INITIAL_SAMPLER = random_sampler(10)

# Callbacks invoked at the end of an iteration.
CALLBACK = [
    classification_metrics(X, labeled_set.labels, ['fscore']),
    #three_set_metric,
]
CALLBACK_SKIP = 5

# Stopping rules for a run.
CONVERGENCE_CRITERIA = [
    max_iter_reached(NUMBER_OF_ITERATIONS),
    #all_points_are_known,
    #metric_reached_threshold('fscore', 0.8),
    #metric_reached_threshold('tsm', 0.9),
]

#NOISE_INJECTOR = random_noise_injector(0)  # None, random_noise_injector, gaussian_noise_injector

SEED = [0]  # passed to explore.run as `seeds`

explore = PoolBasedExploration(
    INITIAL_SAMPLER,
    SUBSAMPLING,
    CALLBACK,
    CALLBACK_SKIP,
    CONVERGENCE_CRITERIA,
)

# ACTIVE LEARNER (exactly one line should be active; the rest are alternatives)
#learner = RandomSampler(SVC(C=1e5, kernel='rbf', gamma='auto'))  # choose a random point
#learner = SimpleMargin(C=1e7, kernel='rbf')  # choose point closest to SVM decision boundary
#learner = DualSpaceModel(learner, sample_unknown_proba=0.5, mode='positive')  # Dual Space model
#learner = KernelVersionSpace(n_samples=16, warmup=100, thin=100, strategy='opt', rounding_cache=True)  # version space algorithm
#learner = BayesianKernelVersionSpace(n_samples=16, warmup=100, thin=100, sampler='stan', prior='improper', prior_std=100)  # version space algorithm
learner = BayesianKernelVersionSpace(
    sampler='approximate',
    prior='improper',
    n_samples=100,
    prior_std=1000,
)  # version space algorithm

# FACTORIZED ALTERNATIVES
#PARTITION = [[0], [1]]
#learner = FactorizedDualSpaceModel(SimpleMargin(C=1024, kernel='rbf'), partition=PARTITION, mode='positive', sample_unknown_proba=0.5)  # Dual Space model
#learner = SubspatialVersionSpace(n_samples=8, warmup=100, thin=100, rounding=True, rounding_cache=True, z_cut=True, use_cython=True, strategy='opt', partition=PARTITION, label_function='AND', loss='GREEDY')
#learner = SubspatialSimpleMargin(C=1024, kernel='rbf', gamma=5, partition=PARTITION, label_function='AND')


# RUN THE EXPLORATION
# Non-generator variant:
#metrics = explore.run(X, labeled_set, learner, repeat=REPEAT, seeds=SEED)  # 'repeat' specifies how many times to repeat the exploration process
dfs = []

#import logging
#logging.getLogger('pystan').setLevel(logging.ERROR)

# Stream the metrics of each run and report only the entries that carry an F-score.
for run in explore.run(
    X,
    labeled_set,
    learner,
    repeat=REPEAT,
    seeds=SEED,
    return_generator=True,
):
    for i, m in enumerate(run):
        if 'fscore' not in m:
            continue
        print('iter:', i, ', fscore:', m['fscore'], 'iter_time:', m['iter_time'])

# COMPUTE AVERAGE F-SCORE OVER ALL REPEATS AND PLOT
# TODO: the metrics are only printed above and never stored, so nothing is
# averaged or plotted here and `dfs` stays empty.
#dfs[0]

iter: 1 , fscore: 0.00262786097767731 iter_time: 0.03423679700000193
iter: 6 , fscore: 0.01840855106888361 iter_time: 0.06291854200000557
iter: 11 , fscore: 0.05808869456589631 iter_time: 0.08314439999999479
iter: 16 , fscore: 0.07261853908586074 iter_time: 0.10810439400000149
iter: 21 , fscore: 0.2884012539184953 iter_time: 0.13141552899999454
iter: 26 , fscore: 0.42448979591836733 iter_time: 0.15689795000000117
iter: 31 , fscore: 0.8054298642533937 iter_time: 0.19642608899999914
iter: 36 , fscore: 0.9043927648578811 iter_time: 0.21955241700000272
iter: 41 , fscore: 0.9067357512953368 iter_time: 0.2522048689999963
iter: 46 , fscore: 0.9114583333333334 iter_time: 0.28106112399999716
iter: 51 , fscore: 0.9114583333333334 iter_time: 0.30975556300000306
