In [1]:
# Put the parent directory (the explore_by_example src folder) on the import path.
import os
import sys

module_path = os.path.abspath(os.pardir)
# Guard against appending the same entry again when this cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)


# import usual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# our system imports
from aideme.active_learning import *
from aideme.initial_sampling import *
from aideme.explore import *
from aideme.utils.metrics import three_set_metric, classification_metrics
from aideme.utils.plotting import plot_polytope
from aideme.utils.convergence import *

%matplotlib inline

In [2]:
def plot_fscore(metrics):
    """
    Plot the f-score averaged over all exploration repeats, one point per scored iteration.

    :param metrics: one entry per repeat; each entry is a sequence of per-iteration metric
        dicts. Iterations for which no 'fscore' was recorded are ignored.
    :raises ValueError: if ``metrics`` contains no runs (previously an opaque ZeroDivisionError)
    """
    runs = list(metrics)
    if not runs:
        raise ValueError("plot_fscore: 'metrics' is empty, there is no run to average")

    # One DataFrame per run, indexed by iteration number.
    run_frames = [pd.DataFrame.from_dict(dict(enumerate(run)), orient='index') for run in runs]

    # Keep only the iterations where an f-score was actually computed.
    run_scores = [frame['fscore'].dropna() for frame in run_frames]

    # Series addition aligns on the iteration index, so an iteration that is not scored
    # in every run yields NaN in the average.
    avg = sum(run_scores) / len(run_scores)

    ax = avg.plot(ylim=[0, 1], marker='o')
    ax.set_xlabel('iteration')
    ax.set_ylabel('average f-score')
    plt.show()

In [3]:
# SYNTHETIC DATA SET
# Points are drawn uniformly from [-2, 2]^dim. A point is positive when |x_i| < limit holds
# on every axis; with this 'limit' the positive box covers roughly 0.1% of the domain.
N = int(1e5)
dim = 2
limit = 2 * 0.001 ** (1. / dim)

rng = np.random.RandomState(0)
# Do not forget to standardize real data. For this distribution, it should be fine without it.
X = rng.uniform(low=-2, high=2, size=(N, dim))

# Partial labels: one 0/1 column per subspace (here, one subspace per axis).
y_subspace = (np.abs(X) < limit).astype('float')
# Final label: positive only when every partial label is positive.
y = y_subspace.min(axis=1)
user = DummyUser(y, y_subspace)

# Report the share of positive points.
print('selectivity :', 100 * y.sum() / len(y), '%')

# Optional scatter plot of the labeled points:
#plt.figure(figsize=(10,8))
#plt.scatter(X[:, 0], X[:, 1], s=0.5, c=['b' if lb else 'r' for lb in y])
#plt.show()

selectivity : 0.105 %


# NO FACTORIZATION EXAMPLE

In [5]:
# EXPLORATION SETTINGS
NUMBER_OF_ITERATIONS = 100  # labeling budget: how many points the user is asked to label
SUBSAMPLING = np.inf  # presumably an infinite value disables subsampling -- TODO confirm
# Seed the exploration with one random positive and one random negative sample.
INITIAL_SAMPLER = StratifiedSampler(pos=1, neg=1)
# Metric callbacks evaluated during the run.
CALLBACK = [classification_metrics('fscore'), three_set_metric]
# NOTE(review): the printed log advances in steps of 10, so this looks like
# "evaluate callbacks every 10th iteration" -- confirm against PoolBasedExploration.
CALLBACK_SKIP = 10
PRINT_CALLBACK_RESULT = True
# Other available criteria: all_points_are_known, metric_reached_threshold('fscore', 0.8)
CONVERGENCE_CRITERIA = [
    max_iter_reached(NUMBER_OF_ITERATIONS),
    metric_reached_threshold('tsm', 0.9),
]

explore = PoolBasedExploration(
    INITIAL_SAMPLER,
    SUBSAMPLING,
    CALLBACK,
    CALLBACK_SKIP,
    PRINT_CALLBACK_RESULT,
    CONVERGENCE_CRITERIA,
)

# ACTIVE LEARNER
# Alternatives that can be used as the base learner instead of SimpleMargin:
#   RandomSampler(SVC(C=1e5, kernel='rbf', gamma='auto'))  # choose a random point
#   KernelQueryByCommittee(kernel='rbf', sampling='deterministic', n_samples=8, warmup=1000, thin=100, rounding=True)  # version space algorithm
learner = DualSpaceModel(
    SimpleMargin(C=1024, kernel='rbf'),  # base learner: picks the point closest to the SVM decision boundary
    sample_unknown_proba=0.5,
    mode='positive',
)

# EXPLORATION ('repeat' is the number of times the whole exploration process is run)
metrics = explore.run(X, user, learner, repeat=1)

# RESULTS
# To plot the f-score averaged over all repeats: plot_fscore(metrics)
# Per-iteration metrics of the first repeat, one row per iteration.
df = pd.DataFrame.from_dict(dict(enumerate(metrics[0])), orient='index')
df

iter: 0, fscore: 0.002468004089835349, tsm: 0.0
iter: 10, fscore: 0.01977401129943503, tsm: 0.0
iter: 20, fscore: 0.09781089892873777, tsm: 0.0
iter: 30, fscore: 0.22826086956521738, tsm: 0.0
iter: 40, fscore: 0.39473684210526316, tsm: 0.00023676577945267645
iter: 50, fscore: 0.8717948717948718, tsm: 0.002413127413127413
iter: 60, fscore: 0.7183098591549296, tsm: 0.04536082474226804
iter: 70, fscore: 0.9082969432314411, tsm: 0.3130841121495327
iter: 80, fscore: 0.9327354260089686, tsm: 0.5657894736842105
iter: 90, fscore: 0.9417040358744395, tsm: 0.7479674796747967
iter: 100, fscore: 0.9758454106280193, tsm: 0.8909090909090909


Unnamed: 0,phase,labeled_indexes,final_labels,partial_labels,fit_time,get_next_time,iter_time,fscore,tsm
0,initial_sampling,"[58089, 8720]","[1.0, 0.0]","[[1.0, 1.0], [0.0, 0.0]]",0.000766,0.056832,0.057597,0.002468,0.000000
1,exploration,[89325],[0.0],"[[0.0, 0.0]]",0.000693,0.031817,0.032510,,
2,exploration,[84752],[0.0],"[[0.0, 0.0]]",0.000440,0.033540,0.033981,,
3,exploration,[67298],[0.0],"[[0.0, 0.0]]",0.000433,0.033318,0.033751,,
4,exploration,[83743],[0.0],"[[0.0, 0.0]]",0.000441,0.034532,0.034974,,
...,...,...,...,...,...,...,...,...,...
96,exploration,[56552],[1.0],"[[1.0, 1.0]]",0.009786,0.135116,0.144902,,
97,exploration,[54902],[0.0],"[[0.0, 1.0]]",0.002707,0.001954,0.004660,,
98,exploration,[25252],[0.0],"[[1.0, 0.0]]",0.003099,0.295549,0.298648,,
99,exploration,[52303],[1.0],"[[1.0, 1.0]]",0.014457,0.287342,0.301799,,


# FACTORIZATION EXAMPLE

In [5]:
# EXPLORATION SETTINGS
NUMBER_OF_ITERATIONS = 100  # labeling budget: how many points the user is asked to label
SUBSAMPLING = np.inf  # presumably an infinite value disables subsampling -- TODO confirm
# Seed the exploration with one random positive and one random negative sample.
INITIAL_SAMPLER = StratifiedSampler(pos=1, neg=1)
# Metric callbacks evaluated during the run.
CALLBACK = [classification_metrics('fscore'), three_set_metric]
# NOTE(review): the printed log advances in steps of 10, so this looks like
# "evaluate callbacks every 10th iteration" -- confirm against PoolBasedExploration.
CALLBACK_SKIP = 10
PRINT_CALLBACK_RESULT = True
# Another available criterion: all_points_are_known
CONVERGENCE_CRITERIA = [
    max_iter_reached(NUMBER_OF_ITERATIONS),
    metric_reached_threshold('fscore', 0.99),
    metric_reached_threshold('tsm', 0.9),
]

explore = PoolBasedExploration(
    INITIAL_SAMPLER,
    SUBSAMPLING,
    CALLBACK,
    CALLBACK_SKIP,
    PRINT_CALLBACK_RESULT,
    CONVERGENCE_CRITERIA,
)

# FACTORIZED ACTIVE LEARNERS
# One subspace per attribute, i.e. [[0], [1]] for dim = 2.
PARTITION = [[axis] for axis in range(dim)]

# Factorized version space.
#   label_function: 'AND', 'OR', 'PROD'
#   loss: 'GREEDY', 'SQUARED', 'PRODUCT'
learner = SubspatialVersionSpace(
    warmup=100,
    thin=10,
    n_samples=8,
    rounding=True,
    kernel='rbf',
    gamma=None,
    partition=PARTITION,
    label_function='AND',
    loss='GREEDY',
)

# Alternative: factorized simple margin
#learner = SubspatialSimpleMargin(C=1024, kernel='rbf', gamma=5, partition=PARTITION, label_function='AND')

# Alternative: factorized Dual Space model
#learner = DualSpaceModel(SimpleMargin(C=1024, kernel='rbf'), mode='positive', sample_unknown_proba=0.5, partition=PARTITION)

# EXPLORATION
# 'user' was built from both y and y_subspace, so it carries the per-subspace labels.
metrics = explore.run(X, user, learner, repeat=1)

# RESULTS
# To plot the f-score averaged over all repeats: plot_fscore(metrics)
# Per-iteration metrics of the first repeat, one row per iteration.
df = pd.DataFrame.from_dict(dict(enumerate(metrics[0])), orient='index')
df

iter: 0, fscore: 0.0049655955167766195
iter: 10, fscore: 0.4046242774566474
iter: 20, fscore: 0.9569377990430622
iter: 30, fscore: 0.9905660377358491


Unnamed: 0,phase,labeled_indexes,final_labels,partial_labels,fit_time,get_next_time,iter_time,fscore
0,initial_sampling,"[67605, 42733]","[1.0, 0.0]","[[1.0, 1.0], [0.0, 0.0]]",0.031007,0.019459,0.050465,0.004966
1,exploration,[92736],[0.0],"[[0.0, 0.0]]",0.017532,0.018077,0.035609,
2,exploration,[31980],[0.0],"[[0.0, 0.0]]",0.019274,0.018703,0.037977,
3,exploration,[74906],[0.0],"[[0.0, 0.0]]",0.021133,0.021117,0.042251,
4,exploration,[20815],[0.0],"[[0.0, 0.0]]",0.030178,0.022749,0.052926,
5,exploration,[82944],[0.0],"[[0.0, 0.0]]",0.027026,0.025616,0.052643,
6,exploration,[26497],[0.0],"[[0.0, 0.0]]",0.033403,0.019252,0.052655,
7,exploration,[56231],[0.0],"[[0.0, 0.0]]",0.032554,0.023918,0.056472,
8,exploration,[90618],[0.0],"[[0.0, 0.0]]",0.028923,0.024838,0.053762,
9,exploration,[50851],[0.0],"[[0.0, 1.0]]",0.030705,0.026854,0.057559,
