In [1]:
# import usual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# our system imports
from aideme import *

%matplotlib inline

In [2]:
def plot_fscore(metrics):
    """
    Plot the f-score averaged over several exploration runs.

    :param metrics: iterable of runs; each run is a sequence of per-iteration
        metric dicts containing (at least in some iterations) a 'fscore' entry.

    Iterations whose 'fscore' is NaN are dropped from each run before averaging.
    The per-run series are added with index alignment and divided by the number
    of runs, then drawn on a [0, 1] y-axis with circle markers.
    """
    fscores_per_run = []
    for run in metrics:
        # One row per iteration, one column per metric name.
        run_frame = pd.DataFrame.from_dict(dict(enumerate(run)), orient='index')
        fscores_per_run.append(run_frame['fscore'].dropna())

    mean_fscore = sum(fscores_per_run) / len(fscores_per_run)
    mean_fscore.plot(ylim=[0, 1], marker='o')
    plt.show()


def train_score(dataset, active_learner):
    """
    Callback returning the F1 score of the learner on the data it was trained on.

    :param dataset: object whose ``training_set()`` returns a ``(features, labels)`` pair
    :param active_learner: object exposing ``predict(features)``
    :return: dict with the single key 'train_score' holding the F1 score of the
        predictions against the labels returned by ``training_set()``
    """
    features, true_labels = dataset.training_set()
    predicted_labels = active_learner.predict(features)
    score = f1_score(true_labels, predicted_labels)
    return dict(train_score=score)

In [3]:
# DUMMY DATA
# Synthetic pool of N points drawn uniformly from [-2, 2]^dim with a fixed seed.
N = int(2e5)
dim = 2
PARTITION = [[0], [1]]  # alternative noted by the author: [[0, 1], [2, 3], [4], [5]]
limit = 2 * 0.001 ** (1 / dim)  # ~0.063 for dim = 2; author's notes: 0.06, 0.63
rng = np.random.RandomState(0)
# do not forget to standardize the data. For this distribution, it should be fine without it.
X = rng.uniform(-2, 2, (N, dim))

# Partial labels (one column per subspace of PARTITION): 0 when every coordinate
# of the subspace satisfies |x| < limit, 1 otherwise.
band_masks = np.vstack([(np.abs(X[:, cols]) < limit).all(axis=1) for cols in PARTITION])
y_subspace = 1 - band_masks.T.astype('float')
# Final label: 1 only when all partial labels are 1.
y = np.min(y_subspace, axis=1)

labeled_set = LabeledSet(y, y_subspace)

# visualize data distribution
print('selectivity :', 100 * y.sum() / len(y), '%')

#plt.figure(figsize=(10,8))
#plt.scatter(X[:, 0], X[:, 1], s=0.05, c=['b' if lb else 'r' for lb in y])
#plt.show()

selectivity : 93.8215 %


In [5]:
# SET-UP EXPLORATION CONFIGURATION
# Everything in this section is handed to PoolBasedExploration / explore.run below.
# The helpers used here come from `from aideme import *`; their exact semantics are
# not visible in this notebook, so notes marked "presumably" should be confirmed
# against the aideme documentation.
REPEAT = 1  # passed as `repeat=` to explore.run: how many times the exploration is repeated
NUMBER_OF_ITERATIONS = 10  # number of points to be labeled by the user

SUBSAMPLING = None  # presumably: no subsampling of the pool -- TODO confirm

INITIAL_SAMPLER = stratified_sampler(labeled_set, pos=1, neg=1)  # start with one random positive sample and one random negative sample
#INITIAL_SAMPLER = random_sampler(10)


CALLBACK = [ # callback functions to be called at the end of each iteration
    classification_metrics(X, labeled_set.labels, ['fscore']), 
    train_score,
    #three_set_metric,
]
# presumably: callbacks are evaluated only every CALLBACK_SKIP iterations -- in the
# recorded output below, 'fscore' / 'train_score' are filled at rows 0 and 5 only.
CALLBACK_SKIP = 5


# Stopping rules for the exploration; only the iteration cap is active here.
CONVERGENCE_CRITERIA = [
    max_iter_reached(NUMBER_OF_ITERATIONS),
    #all_points_are_known,
    #metric_reached_threshold('fscore', 0.8),
    #metric_reached_threshold('tsm', 0.9),
]

# NOTE(review): with noise=1. the recorded output shows every exploration label
# flipped (noisy_final_labels vs final_labels) -- confirm this is intended.
NOISE_INJECTOR = random_noise_injector(noise=1., skip_initial=0)  # None, random_noise_injector, gaussian_noise_injector

SEED = [0]  # passed as `seeds=` to explore.run; presumably one seed per repeat

explore = PoolBasedExploration(INITIAL_SAMPLER, SUBSAMPLING, CALLBACK, CALLBACK_SKIP, CONVERGENCE_CRITERIA, NOISE_INJECTOR)

# ACTIVE LEARNING ALGORITHMS
# Exactly one `learner = ...` line is active; the commented lines are alternatives.
#learner = RandomSampler(SVC(C=1e5, kernel='rbf', gamma='auto'))  # choose a random point
#learner = SimpleMargin(C=1e7, kernel='rbf')  # choose point closest to SVM decision boundary
#learner = DualSpaceModel(learner, sample_unknown_proba=0.5, mode='positive')  # Dual Space model
#learner = KernelVersionSpace(n_samples=16, warmup=100, thin=100, strategy='opt', rounding_cache=True)  # version space algorithm

#learner = BayesianKernelVersionSpace(n_samples=8, warmup=1000, thin=100, sampler='stan', prior='improper', prior_std=100)  # version space algorithm
#learner = BayesianKernelVersionSpace(n_samples=32, sampler='laplace', prior='improper', prior_std=1000)  # version space algorithm
learner = BayesianKernelVersionSpace(sampler='laplace', prior='gaussian', prior_std=1e1, add_intercept=False)  # version space algorithm
#learner = BayesianKernelVersionSpace(sampler='kernel-laplace', prior_std=1e6)  # version space algorithm

#learner = BayesianLinearVersionSpace(n_samples=10000, sampler='laplace', prior='improper', add_intercept=True)  # version space algorithm
#learner = BayesianLinearVersionSpace(sampler='stan', prior='improper', prior_std=100)  # version space algorithm

# FACTORIZED ALGORITHMS
#PARTITION = [[0], [1]]
#learner = FactorizedDualSpaceModel(SimpleMargin(C=1024, kernel='rbf'), partition=PARTITION, mode='positive', sample_unknown_proba=0.5)  # Dual Space model
#learner = SubspatialVersionSpace(n_samples=8, warmup=100, thin=100, rounding=True, rounding_cache=True, z_cut=True, use_cython=True, strategy='opt', partition=PARTITION, label_function='AND', loss='GREEDY')
#learner = SubspatialSimpleMargin(C=1024, kernel='rbf', gamma=5, partition=PARTITION, label_function='AND')


# RUN EXPLORATION
#metrics = explore.run(X, labeled_set, learner, repeat=REPEAT, seeds=SEED)  # 'repeat' specifies how many times to repeat the exploration process
dfs = []  # one DataFrame of per-iteration metrics for each run yielded by explore.run

#import logging
#logging.getLogger('pystan').setLevel(logging.ERROR)


# With return_generator=True the runs are iterated one at a time; each `run` is
# turned into a DataFrame and collected in `dfs`.
for run in explore.run(X, labeled_set, learner, repeat=REPEAT, seeds=SEED, return_generator=True):
    dfs.append(pd.DataFrame(run))
    #for i, m in enumerate(run): 
    #    print(m)
    #    if 'fscore' in m:
    #        print('ITER:', i, ', FSCORE-TRAIN:', m['train_score'], ', FSCORE-TEST:', m['fscore'], ', ITER_TIME:', m['iter_time'])
    #        print('ITER:', i, ', FSCORE-TRAIN:', m['train_score'], ', FSCORE-TEST:', m['fscore'], ', ITER_TIME:', m['iter_time'], ', FIT_TIME:', m['fit_time'], ', GET_NEXT_TIME:', m['get_next_time'])


# DISPLAY THE METRICS TABLE OF THE FIRST RUN
# (no averaging or plotting happens here: plot_fscore is defined above but not called in this cell)
dfs[0]

Unnamed: 0,phase,labeled_indexes,final_labels,partial_labels,get_next_time,fit_time,iter_time,callback_time,fscore,train_score,noisy_final_labels,noisy_partial_labels,min_rank
0,initial_sampling,"[152731, 17800]","[1.0, 0.0]","[[1.0, 1.0], [1.0, 0.0]]",0.004189,0.000786,0.005047,0.805298,0.414127,1.0,,,
1,exploration,[12138],[0.0],"[[1.0, 0.0]]",0.026848,0.001194,0.028237,,,,[1.0],"[[1.0, 1.0]]",9.3515e-06
2,exploration,[37082],[1.0],"[[1.0, 1.0]]",0.029676,0.001017,0.030886,,,,[0.0],"[[1.0, 0.0]]",4.902023e-06
3,exploration,[60944],[1.0],"[[1.0, 1.0]]",0.034116,0.001229,0.035538,,,,[0.0],"[[1.0, 0.0]]",2.868565e-06
4,exploration,[144143],[1.0],"[[1.0, 1.0]]",0.037623,0.00141,0.039275,,,,[0.0],"[[1.0, 0.0]]",1.774498e-06
5,exploration,[171473],[1.0],"[[1.0, 1.0]]",0.041702,0.001316,0.04321,0.820286,0.284363,1.0,[0.0],"[[1.0, 0.0]]",1.113348e-05
6,exploration,[51991],[1.0],"[[1.0, 1.0]]",0.051243,0.001388,0.052827,,,,[0.0],"[[0.0, 1.0]]",1.162867e-05
7,exploration,[78093],[1.0],"[[1.0, 1.0]]",0.053495,0.001664,0.055352,,,,[0.0],"[[1.0, 0.0]]",5.817049e-07
8,exploration,[17003],[1.0],"[[1.0, 1.0]]",0.057985,0.001519,0.059701,,,,[0.0],"[[1.0, 0.0]]",2.946266e-06
9,exploration,[194418],[1.0],"[[1.0, 1.0]]",0.06223,0.001578,0.064006,,,,[0.0],"[[0.0, 1.0]]",8.158481e-06
