In [1]:
# Make the notebook's parent directory importable (presumably where the project sources live).
import os
import sys
module_path = os.path.abspath(os.pardir)  # absolute path of '..', resolved against the current working directory
if sys.path.count(module_path) == 0:  # avoid appending a duplicate entry when the cell is re-run
    sys.path.append(module_path)


# import usual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# our system imports
from aideme import *

%matplotlib inline

In [2]:
def plot_fscore(metrics):
    """Plot the F-score averaged over every run contained in ``metrics``.

    ``metrics`` is iterated as a collection of runs; each run is enumerated and
    turned into a DataFrame with one row per entry (assumes each entry is a
    mapping of metric name -> value, with an 'fscore' key on at least some rows).
    Rows whose 'fscore' is missing are dropped before averaging. The curve is
    drawn on a [0, 1] y-axis and displayed immediately.
    """
    fscores = []
    for run in metrics:
        run_df = pd.DataFrame.from_dict(dict(enumerate(run)), orient='index')
        fscores.append(run_df['fscore'].dropna())

    # Series are summed with index alignment, then divided by the number of runs.
    avg = sum(fscores) / len(fscores)
    avg.plot(ylim=[0,1], marker='o')
    plt.show()

In [3]:
# SYNTHETIC DATA
N = 10 ** 5
dim = 2
# Half-width of the positive interval on each axis; with points uniform on [-2, 2]^dim
# the all-axes-positive region covers a fraction (limit / 2)**dim = 0.001 of the space.
limit = 2 * 0.001 ** (1.0 / dim)
rng = np.random.RandomState(seed=0)
# Remember to standardize real data; this symmetric uniform sample is used without standardization.
X = rng.uniform(-2, 2, (N, dim))
# Partial labels: column i is 1.0 when |X[:, i]| < limit, else 0.0 (one column per subspace)
y_subspace = (np.abs(X) < limit).astype(float)
# Final label: 1.0 only when every partial label is 1.0
y = np.min(y_subspace, axis=1)

# Custom row identifiers 0, -10, -20, ... (one per row of X), deliberately different from row positions
index = np.arange(len(X)) * -10
# Bundle final labels, per-subspace partial labels and the row identifiers
labeled_set = LabeledSet(y, y_subspace, index)

# report the percentage of positive points
print('selectivity :', 100 * y.sum() / len(y), '%')

# optional scatter plot of the data distribution (blue = positive, red = negative)
#plt.figure(figsize=(10,8))
#plt.scatter(X[:, 0], X[:, 1], s=0.5, c=['b' if lb else 'r' for lb in y])
#plt.show()

selectivity : 0.105 %


# NO FACTORIZATION EXAMPLE

In [4]:
# EXPLORATION SETTINGS
NUMBER_OF_ITERATIONS = 100  # labeling budget: how many points the user is asked to label
SUBSAMPLING = None

# Initial sample: one random positive point plus one random negative point.
INITIAL_SAMPLER = stratified_sampler(labeled_set, pos=1, neg=1)
#INITIAL_SAMPLER = random_sampler(10)

# Callback functions, called at the end of an iteration.
CALLBACK = [classification_metrics(y, 'fscore'), three_set_metric]
CALLBACK_SKIP = 10
PRINT_CALLBACK_RESULT = True

CONVERGENCE_CRITERIA = [
    max_iter_reached(NUMBER_OF_ITERATIONS),
    # other criteria that can be enabled:
    #all_points_are_known,
    #metric_reached_threshold('fscore', 0.8),
    #metric_reached_threshold('tsm', 0.9),
]

explore = PoolBasedExploration(
    INITIAL_SAMPLER,
    SUBSAMPLING,
    CALLBACK,
    CALLBACK_SKIP,
    PRINT_CALLBACK_RESULT,
    CONVERGENCE_CRITERIA,
)

# ACTIVE LEARNER
# Alternatives:
#learner = RandomSampler(SVC(C=1e5, kernel='rbf', gamma='auto'))  # choose a random point
#learner = KernelQueryByCommittee(kernel='rbf', sampling='deterministic', n_samples=8, warmup=1000, thin=100, rounding=True)  # version space algorithm
# Dual Space model wrapped around Simple Margin (which chooses the point closest to the SVM decision boundary)
learner = DualSpaceModel(SimpleMargin(C=1024, kernel='rbf'), sample_unknown_proba=0.5, mode='positive')


# RUN EXPLORATION
# 'repeat' specifies how many times to repeat the exploration process
metrics = explore.run(X, labeled_set, learner, repeat=1)

# Average F-score over all repeats can be plotted with:
#plot_fscore(metrics)

# Tabulate the first run: one row per recorded entry, one column per metric key
df = pd.DataFrame.from_dict(dict(enumerate(metrics[0])), orient='index')
df

iter: 1, fscore: 0.002870617182694279, tsm: 0.0
iter: 11, fscore: 0.02484325091683426, tsm: 0.0
iter: 21, fscore: 0.09134406263592866, tsm: 0.0
iter: 31, fscore: 0.0, tsm: 0.0
iter: 41, fscore: 0.3464566929133858, tsm: 0.0010124718118643288
iter: 51, fscore: 0.5895953757225434, tsm: 0.00546448087431694
iter: 61, fscore: 0.7220216606498195, tsm: 0.09251101321585903
iter: 71, fscore: 0.9532710280373832, tsm: 0.398989898989899
iter: 81, fscore: 0.9488372093023256, tsm: 0.6222222222222222
iter: 91, fscore: 0.9603960396039604, tsm: 0.7913043478260869
iter: 101, fscore: 0.995260663507109, tsm: 0.9090909090909091


Unnamed: 0,phase,iter_time,labeled_indexes,final_labels,partial_labels,fit_time,get_next_time,fscore,tsm,callback_time
0,begin,0.006529,,,,,,,,
1,initial_sampling,0.039807,"[-88110, -329520]","[1.0, 0.0]","[[1.0, 1.0], [0.0, 0.0]]",0.002033,0.037774,0.002871,0.000000,0.441543
2,exploration,0.037907,[-423290],[0.0],"[[0.0, 0.0]]",0.001032,0.036875,,,
3,exploration,0.038480,[-303830],[0.0],"[[0.0, 0.0]]",0.000692,0.037788,,,
4,exploration,0.037251,[-574760],[0.0],"[[1.0, 0.0]]",0.000689,0.036562,,,
...,...,...,...,...,...,...,...,...,...,...
97,exploration,0.011156,[-923380],[1.0],"[[1.0, 1.0]]",0.009793,0.001363,,,
98,exploration,0.150496,[-718480],[0.0],"[[0.0, 0.0]]",0.002356,0.148140,,,
99,exploration,0.160115,[-313200],[1.0],"[[1.0, 1.0]]",0.009671,0.150444,,,
100,exploration,0.156564,[-827590],[0.0],"[[0.0, 1.0]]",0.002747,0.153818,,,


# FACTORIZATION EXAMPLE

In [5]:
# EXPLORATION SETTINGS
NUMBER_OF_ITERATIONS = 100  # labeling budget: how many points the user is asked to label
SUBSAMPLING = None

# Initial sample: one random positive point plus one random negative point.
INITIAL_SAMPLER = stratified_sampler(labeled_set, pos=1, neg=1)
#INITIAL_SAMPLER = random_sampler(10)

# Callback functions, called at the end of an iteration.
CALLBACK = [classification_metrics(y, 'fscore'), three_set_metric]
CALLBACK_SKIP = 10
PRINT_CALLBACK_RESULT = True

# Two criteria are active here: the iteration budget and 'tsm' reaching 1.0.
CONVERGENCE_CRITERIA = [
    max_iter_reached(NUMBER_OF_ITERATIONS),
    #all_points_are_known,
    #metric_reached_threshold('fscore', 0.8),
    metric_reached_threshold('tsm', 1.0),
]

explore = PoolBasedExploration(
    INITIAL_SAMPLER,
    SUBSAMPLING,
    CALLBACK,
    CALLBACK_SKIP,
    PRINT_CALLBACK_RESULT,
    CONVERGENCE_CRITERIA,
)

# FACTORIZED ACTIVE LEARNERS
# One single-attribute subspace per dimension, i.e. [[0], [1]] when dim == 2
PARTITION = [[attribute] for attribute in range(dim)]

# Alternative 1 - factorized version space
#   label_function = 'AND', 'OR', 'PROD'
#   loss = 'GREEDY', 'SQUARED', 'PRODUCT'
#learner = SubspatialVersionSpace(warmup=100, thin=10, n_samples=8, rounding=True, kernel='rbf', gamma=None, partition=PARTITION, label_function='AND', loss='GREEDY')

# Alternative 2 - factorized simple margin
#learner = SubspatialSimpleMargin(C=1024, kernel='rbf', gamma=5, partition=PARTITION, label_function='AND')

# Selected: factorized Dual Space model built on Simple Margin
learner = DualSpaceModel(
    SimpleMargin(C=1024, kernel='rbf'),
    mode='positive',
    sample_unknown_proba=0.5,
    partition=PARTITION,
)


# RUN EXPLORATION
# labeled_set was built from both y and y_subspace, so the partial labels are passed along with it
metrics = explore.run(X, labeled_set, learner, repeat=1)

# Average F-score over all repeats can be plotted with:
#plot_fscore(metrics)

# Tabulate the first run: one row per recorded entry, one column per metric key
df = pd.DataFrame.from_dict(dict(enumerate(metrics[0])), orient='index')
df

iter: 1, fscore: 0.0030335861321776816, tsm: 0.0
iter: 11, fscore: 0.14641080312722105, tsm: 0.0
iter: 21, fscore: 0.2870159453302961, tsm: 0.08490566037735849
iter: 31, fscore: 0.9447236180904522, tsm: 0.7899159663865546
iter: 41, fscore: 1.0, tsm: 0.9904761904761905
iter: 51, fscore: 1.0, tsm: 1.0


Unnamed: 0,phase,iter_time,labeled_indexes,final_labels,partial_labels,fit_time,get_next_time,fscore,tsm,callback_time
0,begin,0.00419,,,,,,,,
1,initial_sampling,0.272461,"[-917820, -776370]","[1.0, 0.0]","[[1.0, 1.0], [0.0, 0.0]]",0.247258,0.025203,0.003034,0.0,0.433486
2,exploration,0.293595,[-795840],[0.0],"[[0.0, 0.0]]",0.278006,0.01559,,,
3,exploration,0.166212,[-941380],[0.0],"[[0.0, 0.0]]",0.120943,0.045269,,,
4,exploration,0.142163,[-667260],[0.0],"[[0.0, 0.0]]",0.096577,0.045587,,,
5,exploration,0.160245,[-668230],[0.0],"[[0.0, 0.0]]",0.157072,0.003173,,,
6,exploration,0.015029,[-215170],[0.0],"[[0.0, 0.0]]",0.01086,0.004169,,,
7,exploration,0.012858,[-357960],[0.0],"[[0.0, 1.0]]",0.010392,0.002466,,,
8,exploration,0.021731,[-644490],[0.0],"[[0.0, 0.0]]",0.019958,0.001773,,,
9,exploration,0.083708,[-275300],[0.0],"[[0.0, 0.0]]",0.009805,0.073902,,,
