In [1]:
# import usual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# our system imports
from aideme import *

%matplotlib inline

In [2]:
def plot_fscore(metrics):
    """Plot the f-score averaged over several exploration runs.

    Parameters
    ----------
    metrics : sequence of sequences of dict
        One inner sequence per run; each dict holds the metrics recorded at
        one iteration. Iterations without an 'fscore' value are ignored.

    The averaged curve is drawn with the y-axis fixed to [0, 1] and shown
    immediately; nothing is returned.
    """
    # One DataFrame per run, indexed by iteration number.
    frames = []
    for run in metrics:
        frames.append(pd.DataFrame.from_dict(dict(enumerate(run)), orient='index'))

    # Accumulate the non-missing f-scores run by run (index-aligned addition).
    total = 0
    for frame in frames:
        scores = frame['fscore']
        total = total + scores[scores.notna()]

    mean_scores = total / len(frames)
    mean_scores.plot(ylim=[0, 1], marker='o')
    plt.show()

In [3]:
# DUMMY DATA
# Synthetic uniform points; a point is positive when every attribute of every
# subspace lies strictly inside (-limit, limit).
N = 200_000
dim = 2
PARTITION = [[0], [1]]  # earlier alternative: [[0, 1], [2, 3], [4], [5]]
limit = 0.06  # earlier alternative: 0.63
rng = np.random.RandomState(0)

# Do not forget to standardize the data. For this distribution, it should be fine without it.
X = rng.uniform(-2, 2, size=(N, dim))

# Partial labels: one column per subspace, 1.0 where all attributes of that subspace are within the limit.
subspace_masks = [(np.abs(X[:, cols]) < limit).all(axis=1) for cols in PARTITION]
y_subspace = np.vstack(subspace_masks).T.astype(float)

# Final label is the conjunction of the partial labels (min over 0/1 values).
y = np.min(y_subspace, axis=1)

labeled_set = LabeledSet(y, y_subspace)

# Report the percentage of positive points.
print('selectivity :', 100 * y.sum() / len(y), '%')

# Optional scatter plot of the data distribution (disabled):
#plt.figure(figsize=(10,8))
#plt.scatter(X[:, 0], X[:, 1], s=0.05, c=['b' if lb else 'r' for lb in y])
#plt.show()

selectivity : 0.093 %


In [None]:
# SET-UP EXPLORATION CONFIGURATION
REPEAT = 1  # how many times to repeat the exploration process
NUMBER_OF_ITERATIONS = 100  # number of points to be labeled by the user

SUBSAMPLING = None

INITIAL_SAMPLER = stratified_sampler(labeled_set, pos=1, neg=1)  # start with one random positive sample and one random negative sample
#INITIAL_SAMPLER = random_sampler(10)

CALLBACK = [  # callback functions to be called at the end of each iteration
    classification_metrics(X, labeled_set.labels, ['fscore']),
    #three_set_metric,
]
# NOTE(review): presumably callbacks are only evaluated every CALLBACK_SKIP
# iterations, which is why not every metrics dict carries 'fscore' - confirm.
CALLBACK_SKIP = 10


CONVERGENCE_CRITERIA = [
    max_iter_reached(NUMBER_OF_ITERATIONS),
    #all_points_are_known,
    #metric_reached_threshold('fscore', 0.8),
    #metric_reached_threshold('tsm', 0.9),
]

#NOISE_INJECTOR = random_noise_injector(0)  # None, random_noise_injector, gaussian_noise_injector

SEED = list(range(REPEAT))  # one seed per repeat

explore = PoolBasedExploration(INITIAL_SAMPLER, SUBSAMPLING, CALLBACK, CALLBACK_SKIP, CONVERGENCE_CRITERIA)

# ACTIVE LEARNING ALGORITHMS
#learner = RandomSampler(SVC(C=1e5, kernel='rbf', gamma='auto'))  # choose a random point
#learner = SimpleMargin(C=1e7, kernel='rbf')  # choose point closest to SVM decision boundary
#learner = DualSpaceModel(learner, sample_unknown_proba=0.5, mode='positive')  # Dual Space model
#learner = KernelVersionSpace(n_samples=8, warmup=100, thin=10, strategy='opt', rounding_cache=True)  # version space algorithm
learner = BayesianKernelVersionSpace(n_samples=8, warmup=100, thin=10, sigma=100)  # version space algorithm

# FACTORIZED ALGORITHMS
#PARTITION = [[0], [1]]
#learner = FactorizedDualSpaceModel(SimpleMargin(C=1024, kernel='rbf'), partition=PARTITION, mode='positive', sample_unknown_proba=0.5)  # Dual Space model
#learner = SubspatialVersionSpace(n_samples=8, warmup=100, thin=100, rounding=True, rounding_cache=True, z_cut=True, use_cython=True, strategy='opt', partition=PARTITION, label_function='AND', loss='GREEDY')
#learner = SubspatialSimpleMargin(C=1024, kernel='rbf', gamma=5, partition=PARTITION, label_function='AND')


# RUN EXPLORATION
# Iterate run by run so f-scores can be printed as they are produced, while
# keeping every iteration's metrics for the summary below.
#metrics = explore.run(X, labeled_set, learner, repeat=REPEAT, seeds=SEED)  # non-generator alternative
metrics = []  # one list of per-iteration metric dicts per run
for run in explore.run(X, labeled_set, learner, repeat=REPEAT, seeds=SEED, return_generator=True):
    run_metrics = []
    for m in run:
        run_metrics.append(m)
        if 'fscore' in m:
            print(m)
    metrics.append(run_metrics)

# One DataFrame of metrics per run (rows = iterations), e.g. inspect dfs[0].
dfs = [pd.DataFrame(run_metrics) for run_metrics in metrics]

# COMPUTE AVERAGE F-SCORE OVER ALL REPEATS AND PLOT
plot_fscore(metrics)



{'phase': 'initial_sampling', 'labeled_indexes': [105981, 46502], 'final_labels': [1.0, 0.0], 'partial_labels': [[1.0, 1.0], [0.0, 0.0]], 'fit_time': 0.022930716000018947, 'min_rank': 1.4181146213210205e-05, 'get_next_time': 0.044922693999978947, 'iter_time': 0.0680006350000042, 'callback_time': 0.9206261260000019, 'fscore': 0.002354966954495961}




{'phase': 'exploration', 'labeled_indexes': [25472], 'final_labels': [0.0], 'partial_labels': [[0.0, 0.0]], 'fit_time': 0.42089871400000334, 'min_rank': 6.36832620595218e-05, 'get_next_time': 0.06637671000001433, 'iter_time': 0.4874632599999984, 'callback_time': 0.87537363200002, 'fscore': 0.17748091603053434}




{'phase': 'exploration', 'labeled_indexes': [194833], 'final_labels': [1.0], 'partial_labels': [[1.0, 1.0]], 'fit_time': 1.0799497790000032, 'min_rank': 0.00031415172890736365, 'get_next_time': 0.08026963500000761, 'iter_time': 1.1603647340000123, 'callback_time': 0.9199320569999827, 'fscore': 0.6850828729281768}




{'phase': 'exploration', 'labeled_indexes': [180164], 'final_labels': [1.0], 'partial_labels': [[1.0, 1.0]], 'fit_time': 1.3993509100000097, 'min_rank': 0.000220150275824893, 'get_next_time': 0.11497390100001326, 'iter_time': 1.5144686630000024, 'callback_time': 0.9297435630000166, 'fscore': 0.7035175879396985}




{'phase': 'exploration', 'labeled_indexes': [53711], 'final_labels': [0.0], 'partial_labels': [[1.0, 0.0]], 'fit_time': 1.8924466980000147, 'min_rank': 0.0005105926540216466, 'get_next_time': 0.15690832200002092, 'iter_time': 2.0495019939999963, 'callback_time': 0.9856910169999935, 'fscore': 0.8246445497630331}




{'phase': 'exploration', 'labeled_indexes': [140461], 'final_labels': [1.0], 'partial_labels': [[1.0, 1.0]], 'fit_time': 2.51715201799999, 'min_rank': 0.0011190082873328011, 'get_next_time': 0.1849752409999894, 'iter_time': 2.70230315500001, 'callback_time': 1.0124940050000077, 'fscore': 0.8594594594594595}


