In [13]:


import numpy as np

from skactiveml.base import SingleAnnotatorPoolQueryStrategy
from skactiveml.utils import MISSING_LABEL, labeled_indices
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.base import ClusterMixin

In [14]:
class TypiClust(SingleAnnotatorPoolQueryStrategy):
    """ Typi Clust Selection

    This class implements various Typi Cluster query strategies [1], which considers
    both density and typicality of the samples.

    Parameters
    ----------
    missing_label: scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label
    random_state: int or np.random.RandomState
        The random state to use
    cluster_algo: class in sklearn.cluster (default=Kmeans)
            The cluster algorithm that to be used in the TypiClust
    k: int, optional (default=5)
            the number for knn by computation of typicality

    [1] G. Hacohen, A. Dekel, und D. Weinshall, „Active Learning on a Budget:
    Opposite Strategies Suit High and Low Budgets“, ICLR, 2022.
    """

    def __init__(
            self,
            missing_label=MISSING_LABEL,
            random_state=None,
            cluster_algo=KMeans,
            cluster_algo_param={},
            n_cluster_param_name="n_clusters",
            k=5
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )

        self.cluster_algo = cluster_algo
        self.cluster_algo_param = cluster_algo_param
        self.n_cluster_param_name = n_cluster_param_name
        self.k = k

    def query(
            self,
            X,
            y,
            candidates=None,
            batch_size=1,
            return_utilities=False,
    ):
        """Query the next samples to be labeled

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled and
            unlabeled samples
        y: array-like of shape (n_samples, )
            Labels of the training data set (possibly including unlabeled ones
            indicated by self.missing_label)
        candidates: None or array-like of shape (n_candidates), dtype = int or
            array-like of shape (n_candidates, n_features), optional (default=None)
            If candidates is None, the unlabeled samples from (X,y) are considered
            as candidates
            If candidates is of shape (n_candidates) and of type int,
            candidates is considered as a list of the indices of the samples in (X,y).
            If candidates is of shape (n_candidates, n_features), the candidates are
            directly given in the input candidates (not necessarily contained in X)
        batch_size: int, optional(default=1)
            The number of samples to be selects in one AL cycle.
        return_utilities: bool, optional(default=False)
            If True, also return the utilities based on the query strategy

        Returns
        ----------
        query_indices: numpy.ndarry of shape (batch_size)
            The query_indices indicate for which candidate sample a label is
            to queried, e.g., `query_indices[0]` indicates the first selected sample.
            If candidates in None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        utilities: numpy.ndarray of shape (batch_size, n_samples) or
            numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples for selecting each sample of the batch.
            Here, utilities means the typicality in the considering cluster.
            If candidates is None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        """
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )

        X_cand, mapping = self._transform_candidates(candidates, X, y)

        # Validate init parameter
        if not issubclass(self.cluster_algo, ClusterMixin):
            raise TypeError("Only clustering algorithm from super class sklearn.ClusterMixin is supported.")

        if not isinstance(self.k, int):
            raise TypeError("Only k as integer is supported.")

        if not isinstance(self.cluster_algo_param, dict):
            raise TypeError(
                "Please pass a dictionary with corresponding parameter name and value in the init function.")

        if not isinstance(self.n_cluster_param_name, str):
            raise TypeError("n_cluster_param_name supports only string.")

        selected_samples = labeled_indices(y, missing_label=self.missing_label)

        n_clusters = len(selected_samples) + batch_size
        cluster_algo_param = self.cluster_algo_param.copy()
        cluster_algo_param[self.n_cluster_param_name] = n_clusters
        cluster_obj = self.cluster_algo(**cluster_algo_param)

        if candidates is None:
            X_for_cluster = X
            selected_samples_X_c = selected_samples
        else:
            X_for_cluster = np.concatenate((X_cand, X[selected_samples]), axis=0)
            selected_samples_X_c = np.arange(len(X_cand), len(X_cand) + len(selected_samples))
        cluster_labels = cluster_obj.fit_predict(X_for_cluster)
        print(cluster_labels)

        cluster_ids, cluster_sizes = np.unique(cluster_labels, return_counts=True)
        

        covered_cluster = np.unique([cluster_labels[i] for i in selected_samples_X_c])

        if len(covered_cluster) > 0:
            cluster_sizes[covered_cluster] = 0

        if mapping is not None:
            utilities = np.full(shape=(batch_size, X.shape[0]), fill_value=np.nan)
        else:
            utilities = np.full(shape=(batch_size, X_cand.shape[0]), fill_value=np.nan)
            mapping = np.arange(len(X_cand))

        query_indices = []

        for i in range(batch_size):
            cluster_id = np.argmax(cluster_sizes)
            uncovered_samples_mapping = [idx for idx, value in enumerate(cluster_labels) if value == cluster_id]
            typicality = _typicality(X, uncovered_samples_mapping, self.k)
            idx = np.argmax(typicality)
            idx = mapping[idx]
            for index, value in enumerate(mapping):
                if value in query_indices:
                    utilities[i, value] = np.nan
                else:
                    utilities[i, value] = typicality[index]
            
            query_indices = np.append(query_indices, [idx])
            cluster_sizes[cluster_id] = 0

        if return_utilities:
            return query_indices, utilities
        else:
            return query_indices


def _typicality(X, uncovered_samples_mapping, k):
    typicality = np.zeros(shape=X.shape[0])
    dist_matrix = pairwise_distances(X[uncovered_samples_mapping])
    dist_matrix_sort_inc = np.sort(dist_matrix)
    knn = np.sum(dist_matrix_sort_inc[:, :k + 1], axis=1)
    typi = ((1/k) * knn) ** (-1)
    for idx, value in enumerate(uncovered_samples_mapping):
        typicality[value] = typi[idx]
    return typicality

In [15]:
X, y_true = make_blobs(n_samples=10, n_features=2,
                       centers=[[0, 1], [-3, .5], [-1, -1], [2, 1], [1, -.5]],
                       cluster_std=.7, random_state=42)
y_true = y_true % 2
y = np.full(shape=y_true.shape, fill_value=MISSING_LABEL)

[[ 0.45338198  2.0661209 ]
 [ 0.29101822 -0.28002687]
 [ 0.79255752  0.60639873]
 [-1.32439238 -1.32601083]
 [-1.32863207 -0.62020797]
 [ 0.36438315 -1.48861259]
 [ 0.34769991  0.90321499]
 [-1.89455103  1.03720431]
 [ 2.16937359 -0.33929617]
 [-3.16390736  0.33610413]]


In [19]:
qs = TypiClust(k=3)

In [24]:
X_cand, y_cand = make_blobs(n_samples=10, n_features=2,
                       centers=[[0, 1], [-3, .5], [-1, -1], [2, 1], [1, -.5]],
                       cluster_std=.7, random_state=2)
query_indices, utilites = qs.query(X, y, candidates=X_cand, batch_size=3, return_utilities=True)
print(utilites)

[1 2 2 1 1 1 0 2 0 0]
[[0.28277192 0.         0.         0.48137418 0.51438474 0.41927696
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.                nan 0.
  0.52063678 0.         0.39572899 0.33583409]
 [0.         0.84026544 0.80216074 0.                nan 0.
         nan 0.56890928 0.         0.        ]]


