In [1]:
import sys
sys.path.append('..')

In [2]:
import numpy as np
import pylab as plt

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

from sklearn.preprocessing import StandardScaler



In [3]:
from skactiveml.classifier import IgnUnlabeledClassifier
### CONSTANTS
# The label that marks an unlabeled instance and the helper predicates built
# on top of it are defined here.

UNL = np.nan

# A single shared wrapper: both predicates are derived from the same
# missing-label marker, and no new wrapper object is built on every call.
_label_checker = IgnUnlabeledClassifier(None, UNL)


def is_unlabeled(y):
    """Delegate to ``IgnUnlabeledClassifier.is_unlabeled`` for the labels `y`."""
    return _label_checker.is_unlabeled(y)


is_labeled = _label_checker.is_labeled

In [4]:
### DATASETS

from sklearn.datasets import make_blobs

def create_2d_data_set(seed=42, n_samples=200):
    """Create a standardized two-dimensional toy data set with binary labels.

    Samples are drawn with ``make_blobs`` from 12 centers; the center index is
    folded to a binary label via ``% 2``. The features are standardized with
    ``StandardScaler`` before they are returned.

    Parameters
    ----------
    seed : int, default=42
        Passed to ``make_blobs`` as ``random_state``.
    n_samples : int, default=200
        Number of samples requested from ``make_blobs``.

    Returns
    -------
    X : ndarray
        Standardized feature matrix with two columns.
    y : ndarray
        Binary labels (0 or 1), one per row of `X`.
    """
    X, y = make_blobs(n_samples=n_samples, n_features=2, centers=12,
                      cluster_std=1, random_state=seed)
    y = y % 2
    X = StandardScaler().fit_transform(X)
    return X, y


In [5]:
### CLASSIFIER
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [6]:
# Classifiers selectable by name; each one is wrapped so that instances
# carrying the missing-label marker UNL are handled by IgnUnlabeledClassifier.
clf_dict = {
    'GaussianNaiveBayes': IgnUnlabeledClassifier(GaussianNB(), UNL),
    'DecisionTree': IgnUnlabeledClassifier(DecisionTreeClassifier(), UNL),
    # SVC defaults to an RBF kernel; request the linear kernel explicitly so
    # the model matches the name it is offered under.
    'LinearSVC': IgnUnlabeledClassifier(SVC(kernel='linear', probability=True), UNL),
}

In [7]:
### SAMPLING STRATEGIES
import skactiveml.pool as skacmlp

# Map every name exported by skactiveml.pool (its ``__all__``) to the object
# of that name, so a strategy can be looked up by its name.
query_strategies = {
    strategy_name: getattr(skacmlp, strategy_name)
    for strategy_name in skacmlp.__all__
}

In [8]:
### AL CYCLE

def get_labels_with_selector(X, y, y_oracle, clf, selector, budget=30):
    """Acquire up to `budget` labels by repeatedly querying `selector`.

    In every iteration the currently unlabeled instances are passed to
    ``selector.query``; the returned position(s) are mapped back to indices of
    `X` and the corresponding labels are copied from `y_oracle` into `y`.

    Parameters
    ----------
    X : ndarray
        Feature matrix of all instances.
    y : ndarray
        Current labels, with unlabeled entries marked as recognised by
        ``is_unlabeled``. This array is modified in place.
    y_oracle : ndarray
        True labels, indexed like `y`.
    clf : object
        Currently unused; kept so existing callers keep working.
    selector : object
        Query strategy providing ``query(X_candidates)``.
    budget : int, default=30
        Maximum number of query iterations.

    Returns
    -------
    y : ndarray
        The same array object that was passed in, with the acquired labels.
    """
    for _ in range(budget):
        unlabeled = np.where(is_unlabeled(y))[0]
        if unlabeled.size == 0:
            # Every instance is labeled already: stop instead of querying the
            # selector with an empty candidate pool.
            break
        unlabeled_id = selector.query(X[unlabeled])
        # The selector answers relative to the candidate pool; translate the
        # answer back to an index into the full data set.
        sample_id = unlabeled[unlabeled_id]
        y[sample_id] = y_oracle[sample_id]

    return y

In [9]:
# Minimal example: acquire 30 labels on the toy data set with the
# 'RandomSampler' strategy and show which instances ended up labeled.

X, y_oracle = create_2d_data_set()

clf = IgnUnlabeledClassifier(GaussianNB(), UNL)
selector = query_strategies['RandomSampler']()

# Start with every instance unlabeled, then let the selector pick labels.
y = get_labels_with_selector(
    X, np.full(y_oracle.shape, UNL), y_oracle, clf, selector, budget=30
)

print(y)
print(is_labeled(y))

[ 1.  0. nan nan nan nan nan nan  1. nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan  0.  1.  1. nan
 nan nan nan nan nan nan nan  1. nan nan nan nan nan nan nan nan nan nan
 nan nan nan  0. nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan  1. nan nan nan nan  0. nan nan  1.  0. nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan  1. nan nan nan nan  0. nan nan nan nan nan nan nan nan  1.  0. nan
  0. nan  1. nan nan  0. nan  0.  0. nan nan nan nan nan nan  1. nan nan
  1. nan nan  0. nan nan  0. nan nan nan nan nan  0. nan nan  1. nan  1.
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan  1.
 nan nan nan nan nan nan nan nan nan nan nan  0. nan nan nan nan nan nan
 nan nan]
[ True  True False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False False False False Fals

In [10]:
def plot_scores_2d(X, y, y_oracle, clf, selector, res=21):
    """Plot selector utilities, classifier posteriors and the data in 2D.

    The classifier is fitted on ``(X, y)`` and evaluated on a regular
    ``res x res`` mesh spanning the range of `X`. The filled contours show
    the utilities returned by ``selector.query(..., return_utilities=True)``
    on that mesh; the contour lines show the first column of
    ``clf.predict_proba`` at the levels 0.25, 0.5 and 0.75.

    Parameters
    ----------
    X : ndarray
        Feature matrix; only the first two columns are used.
    y : ndarray
        Current labels; labeled/unlabeled entries are determined with
        ``is_labeled`` / ``is_unlabeled``.
    y_oracle : ndarray
        True labels, used to pick marker and color of unlabeled instances.
    clf : object
        Classifier providing ``fit`` and ``predict_proba``. It is (re)fitted
        by this function.
    selector : object
        Query strategy providing ``query(X, return_utilities=True)``.
    res : int, default=21
        Number of mesh points per axis.

    Returns
    -------
    fig : matplotlib figure
        The figure that was drawn (it is also shown via ``plt.show()``).
    """
    x_1_min, x_1_max = X[:, 0].min(), X[:, 0].max()
    x_2_min, x_2_max = X[:, 1].min(), X[:, 1].max()

    # create mesh for plotting
    X_1_mesh, X_2_mesh = np.meshgrid(np.linspace(x_1_min, x_1_max, res),
                                     np.linspace(x_2_min, x_2_max, res))
    X_mesh = np.column_stack([X_1_mesh.ravel(), X_2_mesh.ravel()])

    # compute posteriors (first column of predict_proba) on the mesh
    clf.fit(X, y)
    posteriors = clf.predict_proba(X_mesh)[:, 0].reshape(X_1_mesh.shape)

    # compute the selector's utilities on the mesh
    _, scores = selector.query(X_mesh, return_utilities=True)
    scores = scores.reshape(X_1_mesh.shape)

    # get indices for plotting
    labeled_indices = np.where(is_labeled(y))[0]
    unlabeled_indices = np.where(is_unlabeled(y))[0]

    # setup figure
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.set_xlim(x_1_min, x_1_max)
    ax.set_ylim(x_2_min, x_2_max)
    ax.set_xlabel(r'$x_1$')
    ax.set_ylabel(r'$x_2$')

    # labeled instances get a dark-rimmed light disc behind their marker
    ax.scatter(X[labeled_indices, 0], X[labeled_indices, 1],
               c=[[.2, .2, .2]], s=90, marker='o', zorder=3.8)
    ax.scatter(X[labeled_indices, 0], X[labeled_indices, 1],
               c=[[.8, .8, .8]], s=60, marker='o', zorder=4)

    # class 0 is drawn as diamonds, class 1 as squares, colored by class
    class_style = dict(vmin=-0.2, vmax=1.2, cmap='coolwarm', s=20)
    for cl, marker in zip([0, 1], ['D', 's']):
        cl_labeled_idx = labeled_indices[y[labeled_indices] == cl]
        cl_unlabeled_idx = unlabeled_indices[y_oracle[unlabeled_indices] == cl]
        ax.scatter(X[cl_labeled_idx, 0], X[cl_labeled_idx, 1],
                   c=np.full(len(cl_labeled_idx), float(cl)),
                   marker=marker, zorder=5, **class_style)
        ax.scatter(X[cl_unlabeled_idx, 0], X[cl_unlabeled_idx, 1],
                   c=np.full(len(cl_unlabeled_idx), float(cl)),
                   marker=marker, zorder=3, **class_style)
        # Black outline behind the unlabeled markers. A literal color is used,
        # so no colormap arguments are passed (they would be ignored).
        ax.scatter(X[cl_unlabeled_idx, 0], X[cl_unlabeled_idx, 1],
                   c='k', marker=marker, s=30, zorder=2.8)

    # utilities as filled contours, posteriors as contour lines
    ax.contourf(X_1_mesh, X_2_mesh, scores, cmap='Greens', alpha=.75)
    ax.contour(X_1_mesh, X_2_mesh, posteriors, [.5], colors='k',
               linewidths=[2], zorder=1)
    ax.contour(X_1_mesh, X_2_mesh, posteriors, [.25, .75], cmap='coolwarm_r',
               linewidths=[2, 2], zorder=1, linestyles='--', alpha=.9,
               vmin=.2, vmax=.8)

    fig.tight_layout()
    plt.show()

    return fig
    

In [11]:
def plot_gain_data_set_2d(clf, al, usefulness, budget, seed=43, n_samples=250):
    """Run an active-learning cycle on the toy data set and plot the result.

    Parameters
    ----------
    clf : str
        Key into ``clf_dict`` selecting the classifier.
    al : str
        Key into ``query_strategies``; this strategy acquires the labels.
    usefulness : str
        Key into ``query_strategies``; the utilities of this strategy are
        shown in the plot.
    budget : int
        Number of labels to acquire. Values larger than the number of
        instances are reduced to that number.
    seed : int, default=43
        Seed passed to ``create_2d_data_set``.
    n_samples : int, default=250
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        The figure is displayed by ``plot_scores_2d``.
    """
    # create data set; create_2d_data_set already standardizes the features,
    # so no second scaling pass is needed here
    X, y_oracle = create_2d_data_set(seed=seed)
    y = np.full(y_oracle.shape, UNL)

    # classifier (kept under its own name instead of rebinding the `clf` key)
    classifier = clf_dict[clf]

    # AL cycle: there are only len(y_oracle) labels that can be acquired
    n_queries = min(budget, len(y_oracle))
    y = get_labels_with_selector(X, y, y_oracle, classifier,
                                 query_strategies[al](), budget=n_queries)

    # plot utilities and decision boundary
    plot_scores_2d(X, y, y_oracle, classifier, query_strategies[usefulness]())

In [12]:

# Interactive controls: the labeling budget plus drop-downs for the
# classifier, the label-acquiring strategy and the plotted strategy.
# NOTE(review): the budget maximum (250) is larger than the 200 samples that
# create_2d_data_set generates by default - confirm which number is intended.
budget_slider = widgets.IntSlider(min=1, max=250, step=1, value=15)
clf_slider = widgets.Dropdown(options=list(clf_dict))
al_slider = widgets.Dropdown(options=list(query_strategies))
usefulness_slider = widgets.Dropdown(options=list(query_strategies))

In [13]:
# Connect the controls to the plotting routine: each widget supplies the
# argument of the same name to plot_gain_data_set_2d.
interact(
    plot_gain_data_set_2d,
    clf=clf_slider,
    al=al_slider,
    usefulness=usefulness_slider,
    budget=budget_slider,
)


interactive(children=(Dropdown(description='clf', options=('GaussianNaiveBayes', 'DecisionTree', 'LinearSVC'),…

<function __main__.plot_gain_data_set_2d(clf, al, usefulness, budget, seed=43, n_samples=250)>