In [1]:
# import usual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# our system imports
from aideme import *

%matplotlib inline

In [2]:
def plot_fscore(metrics):
    """
    Plot the f-score averaged over several exploration runs.

    :param metrics: iterable with one entry per run. Each entry is anything ``pd.DataFrame`` accepts as a
        table of per-iteration records: a list of metric dicts, or an already built DataFrame (such as the
        ones produced with ``pd.DataFrame(run)``). Every entry must provide an 'fscore' column.
    :raises ValueError: if ``metrics`` contains no run at all
    :raises KeyError: if a run has no 'fscore' column
    """
    runs = [pd.DataFrame(run) for run in metrics]
    if not runs:
        # the previous implementation failed here with an opaque ZeroDivisionError
        raise ValueError('plot_fscore needs the metrics of at least one run')

    # One column per run, restricted to the rows where an f-score was actually recorded
    # (rows without a score hold NaN and would otherwise break the line plot into isolated dots).
    scores = pd.concat([df['fscore'].dropna().astype(float) for df in runs], axis=1)

    # Row-wise mean skips missing values, so a row scored by only some of the runs is averaged over
    # those runs instead of turning into NaN (which is what summing index-aligned Series did).
    avg = scores.mean(axis=1).sort_index()

    ax = avg.plot(ylim=[0, 1], marker='o')
    ax.set_xlabel('iteration')
    ax.set_ylabel('average f-score')
    plt.show()

In [3]:
# SYNTHETIC DATA SET
# Points are drawn uniformly from [-2, 2) along every dimension. A point is positive in a subspace
# when all of that subspace's coordinates are strictly within `limit` of zero, and positive overall
# only when it is positive in every subspace of PARTITION.
N = 200_000
dim = 2
PARTITION = [[0], [1]]  # previously tried: [[0, 1], [2, 3], [4], [5]]
limit = 0.06  # previously tried: 0.63

# Fixed seed, so the very same points are generated on every run of this cell.
# Do not forget to standardize the data; for this distribution it should be fine without it.
rng = np.random.RandomState(seed=0)
X = rng.uniform(-2, 2, (N, dim))

# Partial labels: one 0/1 column per subspace of PARTITION.
y_subspace = np.array([
    (np.abs(X[:, columns]) < limit).all(axis=1)
    for columns in PARTITION
]).T.astype(float)

# Final label: the minimum over the 0/1 partial labels, i.e. 1 only if every partial label is 1.
y = y_subspace.min(axis=1)

labeled_set = LabeledSet(y, y_subspace)

# Percentage of positive points in the data set.
print('selectivity :', 100 * y.sum() / len(y), '%')

# Optional scatter plot of the data distribution (disabled):
#plt.figure(figsize=(10,8))
#plt.scatter(X[:, 0], X[:, 1], s=0.05, c=['b' if lb else 'r' for lb in y])
#plt.show()

selectivity : 0.093 %


In [6]:
# SET-UP EXPLORATION CONFIGURATION
# Every constant defined here is consumed either by the PoolBasedExploration object built on the last
# line of this section or by the explore.run(...) call of the run section.
REPEAT = 1  # passed as `repeat` to explore.run; also the number of seeds generated below
NUMBER_OF_ITERATIONS = 100  # number of points to be labeled by the user

SUBSAMPLING = None  # NOTE(review): presumably None disables subsampling of the pool - confirm in aideme

INITIAL_SAMPLER = stratified_sampler(labeled_set, pos=1, neg=1)  # start with one random positive sample and one random negative sample
#INITIAL_SAMPLER = random_sampler(10)

CALLBACK = [ # callback functions to be called at the end of each iteration
    classification_metrics(X, labeled_set.labels, ['fscore']), 
    #three_set_metric,
]
# NOTE(review): presumably the callbacks are only evaluated every CALLBACK_SKIP iterations, which would
# explain the empty 'fscore' entries in the run output - confirm in aideme
CALLBACK_SKIP = 10


# The exploration stops on the criteria listed here; only the iteration budget is active.
CONVERGENCE_CRITERIA = [
    max_iter_reached(NUMBER_OF_ITERATIONS),
    #all_points_are_known,
    #metric_reached_threshold('fscore', 0.8),
    #metric_reached_threshold('tsm', 0.9),
]

#NOISE_INJECTOR = random_noise_injector(0)  # None, random_noise_injector, gaussian_noise_injector

# Seeds 0 .. REPEAT-1, one per run; handed to explore.run through its `seeds` argument.
SEED = list(range(REPEAT))

# Arguments are positional: their order must match the PoolBasedExploration signature.
explore = PoolBasedExploration(INITIAL_SAMPLER, SUBSAMPLING, CALLBACK, CALLBACK_SKIP, CONVERGENCE_CRITERIA)

# ACTIVE LEARNING ALGORITHMS
# Exactly one `learner = ...` line is meant to be active; the commented lines are alternatives.
# The DualSpaceModel line wraps an existing `learner`, so it additionally needs one of the lines
# above it to be uncommented.
#learner = RandomSampler(SVC(C=1e5, kernel='rbf', gamma='auto'))  # choose a random point
#learner = SimpleMargin(C=1e7, kernel='rbf')  # choose point closest to SVM decision boundary
#learner = DualSpaceModel(learner, sample_unknown_proba=0.5, mode='positive')  # Dual Space model
learner = KernelVersionSpace(n_samples=8, warmup=100, thin=10, strategy='opt', rounding_cache=True)  # version space algorithm

# FACTORIZED ALGORITHMS
# Alternatives that also receive the PARTITION of the feature columns (same value as in the data cell).
#PARTITION = [[0], [1]]
#learner = FactorizedDualSpaceModel(SimpleMargin(C=1024, kernel='rbf'), partition=PARTITION, mode='positive', sample_unknown_proba=0.5)  # Dual Space model
#learner = SubspatialVersionSpace(n_samples=8, warmup=100, thin=100, rounding=True, rounding_cache=True, z_cut=True, use_cython=True, strategy='opt', partition=PARTITION, label_function='AND', loss='GREEDY')
#learner = SubspatialSimpleMargin(C=1024, kernel='rbf', gamma=5, partition=PARTITION, label_function='AND')


# RUN EXPLORATION
# With return_generator=True the runs are consumed one at a time; each run is converted into a
# DataFrame and stored in `dfs`. Appending inside the loop means the runs that already finished
# stay available in `dfs` if a later run fails or is interrupted.
#metrics = explore.run(X, labeled_set, learner, repeat=REPEAT, seeds=SEED)  # 'repeat' specifies how many times to repeat the exploration process
dfs = []
for run in explore.run(X, labeled_set, learner, repeat=REPEAT, seeds=SEED, return_generator=True):
    #for m in run: print(m)
    dfs.append(pd.DataFrame(run))
    

# DISPLAY THE PER-ITERATION METRICS OF THE FIRST RUN
# (no averaging over repeats and no plotting is done in this cell)
dfs[0]

Unnamed: 0,phase,iter_time,labeled_indexes,final_labels,partial_labels,rounding_iters,rounding_fit_time,hit_and_run_time,hit_and_run_steps,fit_time,min_rank,get_next_time,callback_time,fscore
0,begin,0.003743,,,,,,,,,,,,
1,initial_sampling,0.032587,"[105981, 46502]","[1.0, 0.0]","[[1.0, 1.0], [0.0, 0.0]]",[2],0.000232,0.002285,170.0,0.002933,0.0,0.029561,0.812321,0.003740
2,exploration,0.034447,[161320],[0.0],"[[0.0, 0.0]]",[2],0.000212,0.002289,170.0,0.002930,0.0,0.031386,,
3,exploration,0.030394,[115370],[0.0],"[[0.0, 0.0]]",[1],0.000135,0.002283,170.0,0.002826,0.0,0.027445,,
4,exploration,0.027474,[51536],[0.0],"[[0.0, 0.0]]",[1],0.000138,0.002438,170.0,0.002986,0.0,0.024367,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,exploration,0.318602,[136453],[0.0],"[[0.0, 0.0]]",[0],0.000111,0.003976,170.0,0.004940,0.5,0.313513,,
98,exploration,0.319805,[191019],[0.0],"[[0.0, 0.0]]",[0],0.000168,0.003895,170.0,0.004980,0.5,0.314674,,
99,exploration,0.335297,[171860],[0.0],"[[0.0, 0.0]]",[0],0.000137,0.004019,170.0,0.004909,0.5,0.330242,,
100,exploration,0.360960,[30523],[0.0],"[[0.0, 0.0]]",[0],0.000220,0.003930,170.0,0.005312,0.5,0.355169,,
