In [None]:
# Put the parent directory of this notebook on the import path so that the
# project package can be imported below (explore_by_example src folder).
import os
import sys
module_path = os.path.abspath(os.pardir)  # os.pardir is '..'
if sys.path.count(module_path) == 0:  # do not add the same entry twice on re-runs
    sys.path.append(module_path)


# import usual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# our system imports
from aideme.active_learning import *
from aideme.initial_sampling import StratifiedSampler
from aideme.explore import *
from aideme.utils import *

%matplotlib inline

In [None]:
def plot_fscore(metrics):
    """Plot the f-score averaged over several exploration runs.

    Parameters
    ----------
    metrics : iterable of runs
        Each run is a sequence of per-step metric mappings (e.g. the value
        returned by ``explore.run`` in this notebook). Every run must provide
        an ``'fscore'`` entry in at least one step; steps without it are
        ignored.

    Raises
    ------
    ValueError
        If ``metrics`` contains no run at all (previously this surfaced as an
        unexplained ``ZeroDivisionError``).
    KeyError
        If a run has no ``'fscore'`` column.
    """
    # one DataFrame per run: row i holds the metrics stored at position i of the run
    runs = [pd.DataFrame.from_dict(dict(enumerate(run)), orient='index') for run in metrics]
    if not runs:
        raise ValueError('plot_fscore() needs the metrics of at least one run')

    # keep only the steps where an f-score was actually recorded
    fscores = [df['fscore'].dropna() for df in runs]

    # Series are added index-wise: a step missing from any run becomes NaN in the average
    avg = sum(fscores) / len(runs)

    ax = avg.plot(ylim=[0, 1], marker='o')
    ax.set_xlabel('iteration')  # position in each run's metric list (presumably the iteration number)
    ax.set_ylabel('average f-score')
    plt.show()

In [None]:
# DUMMY DATA
# A dedicated, seeded generator makes the dataset identical on every
# "Restart & Run All". It is local to this cell, so it does not touch numpy's
# global random state used elsewhere.
DATA_SEED = 0
data_rng = np.random.default_rng(DATA_SEED)

# 100000 points drawn uniformly from the square [-2, 2) x [-2, 2)
X = data_rng.uniform(low=-2, high=2, size=(100000, 2))  # do not forget to standardize the data. For this distribution, it should be fine without it.

# final labels: |X[:,0]| < 0.065 AND |X[:,1]| < 0.065
y = np.logical_and(np.abs(X[:, 0]) < 0.065, np.abs(X[:, 1]) < 0.065).astype('float')
#y = np.linalg.norm(X, axis=1) < 0.075

# partial labels, one column per subspace: column j is |X[:,j]| < 0.1
# NOTE(review): the subspace threshold (0.1) differs from the one used for `y`
# (0.065), so the AND of the two columns is not equal to `y` - confirm this is intended.
y_subspace = np.vstack([np.abs(X[:, 0]) < 0.1, np.abs(X[:, 1]) < 0.1]).T.astype('float')

# percentage of points labeled positive by `y`
print('selectivity :', 100 * y.sum() / len(y), '%')

# visualize data distribution
#plt.figure(figsize=(10,8))
#plt.scatter(X[:, 0], X[:, 1], s=0.5, c=['b' if lb else 'r' for lb in y])
#plt.show()

# NO FACTORIZATION EXAMPLE

In [None]:
# EXPLORATION SETTINGS
NUMBER_OF_ITERATIONS = 150  # how many points the user is asked to label
SUBSAMPLING = float('inf')  # candidate values: 10000, 50000, float('inf')
INITIAL_SAMPLER = StratifiedSampler(pos=1, neg=1)  # initial sample: one random positive point and one random negative point
CALLBACK = [classification_metrics('fscore'), three_set_metric]  # callbacks invoked at the end of an iteration; the first one computes the current f-score
CALLBACK_SKIP = 5
PRINT_CALLBACK_RESULT = True

explore = PoolBasedExploration(
    NUMBER_OF_ITERATIONS,
    INITIAL_SAMPLER,
    SUBSAMPLING,
    CALLBACK,
    CALLBACK_SKIP,
    PRINT_CALLBACK_RESULT,
)

# ACTIVE LEARNER
# Other base learners that can be passed as first argument of DualSpaceModel:
#   RandomSampler(SVC(C=1e5, kernel='rbf'))  # choose a random point
#   KernelQueryByCommittee(kernel='rbf', sampling='deterministic', n_samples=8, warmup=100, thin=10, rounding=True)  # version space algorithm
learner = DualSpaceModel(  # Dual Space model
    SimpleMargin(C=1024, kernel='rbf'),  # choose point closest to SVM decision boundary
    sample_unknown_proba=0.5,
)


# EXPLORATION
# 'repeat' specifies how many times to repeat the exploration process
metrics = explore.run(X, y, learner, repeat=1)

# Average the f-score over all repeats and plot it
plot_fscore(metrics)

# FACTORIZATION EXAMPLE

In [None]:
# EXPLORATION SETTINGS
NUMBER_OF_ITERATIONS = 150  # how many points the user is asked to label
SUBSAMPLING = float('inf')  # candidate values: 10000, 50000, float('inf')
INITIAL_SAMPLER = StratifiedSampler(pos=1, neg=1)  # initial sample: one random positive point and one random negative point
CALLBACK = [classification_metrics('fscore'), three_set_metric]  # callbacks invoked at the end of an iteration; the first one computes the current f-score
CALLBACK_SKIP = 5
PRINT_CALLBACK_RESULT = True

explore = PoolBasedExploration(
    NUMBER_OF_ITERATIONS,
    INITIAL_SAMPLER,
    SUBSAMPLING,
    CALLBACK,
    CALLBACK_SKIP,
    PRINT_CALLBACK_RESULT,
)

# SUBSPATIAL LEARNER (disabled alternative) - one active learner per subspace
#subspatial_active_learners = [
#    KernelQueryByCommittee(kernel='rbf', n_samples=8, warmup=100, thin=10, sampling='deterministic'),
#    KernelQueryByCommittee(kernel='rbf', n_samples=8, warmup=100, thin=10, sampling='deterministic'),
#]

#learner = SubspaceLearner(
#    partition=[[0], [1]],  # partition of attributes (one subspace with the attribute 0, and another with 1)
#    learners=subspatial_active_learners,
#    #label_function='AND', probability_function='min', ranking_function='square'
#)

# FACTORIZED DSM
learner = DualSpaceModel(  # Dual Space model
    SimpleMargin(C=1024, kernel='rbf'),
    sample_unknown_proba=0.5,
    partition=[[0], [1]],  # attribute 0 and attribute 1 each form their own subspace
)


# EXPLORATION
# the per-subspace labels (y_subspace) are passed here instead of y
metrics = explore.run(X, y_subspace, learner, repeat=1)

# Average the f-score over all repeats and plot it
plot_fscore(metrics)