In [None]:
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import euclidean_distances, check_pairwise_arrays
from sklearn.utils.validation import check_array, check_X_y
from sklearn.utils.extmath import safe_sparse_dot

In [None]:
def gauss_kernel(x, y=None, gamma=None):
    """Return the Gaussian (RBF) kernel matrix between the rows of x and y.

    Each entry is ``exp(-gamma * d)`` where ``d`` is the squared Euclidean
    distance returned by ``euclidean_distances(x, y, squared=True)``.

    Parameters
    ----------
    x : array-like
        First set of samples; validated by ``check_pairwise_arrays``.
    y : array-like or None
        Second set of samples, handed to ``check_pairwise_arrays`` together
        with ``x``.
    gamma : float or None
        Kernel coefficient. When None, ``1.0 / x.shape[1]`` is used.

    Returns
    -------
    The distance matrix, overwritten in place with the kernel values.
    """
    x, y = check_pairwise_arrays(x, y)
    scale = 1.0 / x.shape[1] if gamma is None else gamma

    # Reuse the distance buffer for the scaled exponent and for the result.
    sq_dists = euclidean_distances(x, y, squared=True)
    sq_dists *= -scale
    return np.exp(sq_dists, out=sq_dists)

In [None]:
class BaseSemi(object):
    """Base class for semi-supervised learning model.

    Labels are propagated over a graph built from the training samples.
    Subclasses must implement ``_build_graph`` and define a ``_variant``
    class attribute: ``fit`` uses the clamping update when ``_variant`` is
    ``'GRF'`` and the ``alpha``-weighted update otherwise.

    Samples whose label is ``-1`` are treated as unlabeled.
    """

    def __init__(self, gamma=20, alpha=1, max_iter=30, tol=1e-3):
        """Store the hyper-parameters and reset the fitted state.

        Parameters
        ----------
        gamma : float
            Forwarded to ``gauss_kernel`` as its ``gamma`` argument.
        alpha : float or None
            Weight of the propagated labels in the non-'GRF' update; the
            initial labels are weighted by ``1 - alpha``. Not read by the
            'GRF' update.
        max_iter : int
            Upper bound on the number of propagation iterations in ``fit``.
        tol : float
            ``fit`` stops once the summed absolute change of the label
            distributions between two iterations drops below this value.
        """
        self.max_iter = max_iter
        self.tol = tol
        self.gamma = gamma
        self.alpha = alpha

        # Fitted state, populated by ``fit``.
        self.X = None                    # training samples
        self.classes = None              # sorted class labels, -1 excluded
        self.label_distributions = None  # (n_samples, n_classes) class scores
        self.transduction = None         # label assigned to each training sample

    def _get_kernel(self, x, y=None):
        """Return the Gaussian kernel between x and y (x with itself if y is None)."""
        if y is None:
            return gauss_kernel(x, x, gamma=self.gamma)
        else:
            return gauss_kernel(x, y, gamma=self.gamma)

    def _build_graph(self):
        """Build the propagation matrix from ``self.X``; subclasses override this."""
        raise NotImplementedError("Graph construction must be implemented")

    def predict(self, x):
        """Return the most probable class label for every row of x."""
        probas = self.predict_proba(x)

        # Fixed: the original read the non-existent attribute ``classesx``.
        return self.classes[np.argmax(probas, axis=1)].ravel()

    def predict_proba(self, x):
        """Return class scores for every row of x, each row scaled by its sum.

        The scores are the kernel weights between x and the training samples
        multiplied by the fitted label distributions. Rows are divided by
        their sum without a zero guard.
        """
        x_2d = check_array(x, accept_sparse=['csc', 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'])

        # Transposed so that each row corresponds to one query sample.
        weight_matrices = self._get_kernel(self.X, x_2d).T

        probabilities = np.dot(weight_matrices, self.label_distributions)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        probabilities /= normalizer

        return probabilities

    def fit(self, x, y):
        """Fit a semi-supervised model.

        Parameters
        ----------
        x : array-like
            Training samples, validated together with ``y`` by ``check_X_y``.
        y : array-like
            Labels for the rows of x; ``-1`` marks an unlabeled sample.

        Returns
        -------
        self
            With ``X``, ``classes``, ``label_distributions``, ``transduction``
            and ``n_iter_`` set.
        """

        x, y = check_X_y(x, y)
        self.X = x

        # graph construction
        graph_matrix = self._build_graph()

        # label construction
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes = classes

        alpha = self.alpha
        y = np.asarray(y)
        unlabeled = y == -1

        n_samples, n_classes = len(y), len(classes)
        # initialize distributions: one-hot rows for labeled samples,
        # all-zero rows for unlabeled ones
        self.label_distributions = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions[y == label, classes == label] = 1

        # y_static holds the contribution of the initial labels that is
        # re-injected at every iteration.
        y_static = np.copy(self.label_distributions)
        if self._variant == 'GRF':
            y_static[unlabeled] = 0
        else:
            y_static *= 1 - alpha

        l_previous = np.zeros((self.X.shape[0], n_classes))

        # Column vector so it broadcasts across classes in np.where below.
        unlabeled = unlabeled[:, np.newaxis]
        if sparse.isspmatrix(graph_matrix):
            graph_matrix = graph_matrix.tocsr()

        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions - l_previous).sum() < self.tol:
                break

            l_previous = self.label_distributions
            self.label_distributions = safe_sparse_dot(graph_matrix, self.label_distributions)
            if self._variant == 'GRF':
                normalizer = np.sum(self.label_distributions, axis=1)[:, np.newaxis]
                # A row with zero total mass would yield 0/0 = NaN, which
                # then spreads to every row on the next multiplication.
                normalizer[normalizer == 0] = 1
                self.label_distributions /= normalizer
                # Labeled rows are reset to their known labels each step.
                self.label_distributions = np.where(unlabeled, self.label_distributions, y_static)
            else:
                self.label_distributions = np.multiply(alpha, self.label_distributions) + y_static
        else:
            # Loop ran to exhaustion without converging: count the last pass.
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions, axis=1)[:, np.newaxis]
        # Same guard as in the loop: leave all-zero rows at zero, not NaN.
        normalizer[normalizer == 0] = 1
        self.label_distributions /= normalizer

        # set the transduction item
        transduction = self.classes[np.argmax(self.label_distributions, axis=1)]
        self.transduction = transduction.ravel()

        return self

In [None]:
class GRFSemi(BaseSemi):
    """Label propagation with Gaussian random fields ('GRF' variant).

    References
    Zhu, Xiaojin, Zoubin Ghahramani, and John D. Lafferty.
    "Semi-supervised learning using gaussian fields and harmonic functions."
    Proceedings of the 20th International conference on Machine learning (ICML-03). 2003.
    """

    _variant = 'GRF'

    def __init__(self, gamma=20, alpha=None, max_iter=1000, tol=1e-3):
        """Forward the hyper-parameters to the base class unchanged."""
        super(GRFSemi, self).__init__(
            tol=tol,
            max_iter=max_iter,
            alpha=alpha,
            gamma=gamma,
        )

    def _build_graph(self):
        """Return the kernel matrix of ``self.X`` scaled by its column sums."""
        weights = self._get_kernel(self.X)
        column_sums = weights.sum(axis=0)

        if not sparse.isspmatrix(weights):
            # Row i is divided by the sum of column i (in place).
            weights /= column_sums[:, np.newaxis]
            return weights

        # NOTE(review): this sparse path divides the stored values by
        # np.diag(...) of the sums; the kernel built in this file looks
        # dense, so this path is presumably not exercised -- confirm.
        weights.data /= np.diag(np.array(column_sums))
        return weights

    def fit(self, x, y):
        """Fit the model by delegating to the base-class ``fit``."""
        return super(GRFSemi, self).fit(x, y)

In [None]:
class LLGCSemi(BaseSemi):
    """Label spreading with local and global consistency ('LLGC' variant).

    References
    Zhou, Dengyong, et al. "Learning with local and global consistency."
    Advances in neural information processing systems. 2004.
    """

    _variant = 'LLGC'

    def __init__(self, gamma=20, alpha=0.2, max_iter=30, tol=1e-3):
        """Forward the hyper-parameters to the base class unchanged."""
        super(LLGCSemi, self).__init__(
            tol=tol,
            max_iter=max_iter,
            alpha=alpha,
            gamma=gamma,
        )

    def _build_graph(self):
        """Return the negated normalized graph Laplacian with a zero diagonal."""
        size = self.X.shape[0]
        kernel = self._get_kernel(self.X)

        graph = -sparse.csgraph.laplacian(kernel, normed=True)

        if sparse.isspmatrix(graph):
            # Zero the stored entries that sit on the diagonal.
            on_diagonal = (graph.row == graph.col)
            graph.data[on_diagonal] = 0.0
            return graph

        # Dense case: a stride of size + 1 over the flat view walks the diagonal.
        graph.flat[::size + 1] = 0.0
        return graph

    def fit(self, x, y):
        """Fit the model by delegating to the base-class ``fit``."""
        return super(LLGCSemi, self).fit(x, y)