In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
PROBABILITY_THRESHOLD = .5  # cut-off applied to the rescaled probability in predict()


class PositiveUnlabeledClassifier:
    """Classifier trained on "labelled vs. unlabelled" targets ``s`` instead of true labels.

    A class-weighted SVM is trained to separate labelled (``s > 0``) from
    unlabelled samples, its decision values are mapped to probabilities with
    a logistic regression (Platt scaling), and the result is divided by a
    constant ``c`` estimated on held-out labelled samples. This appears to
    follow the Elkan & Noto (2008) positive-unlabelled estimator.

    Parameters
    ----------
    calibration_share : float, default .4
        Fraction of the training data withheld from the SVM and used to fit
        the Platt scaler.
    svm_reg : float, default .1
        Regularisation strength; the SVM is built with ``C = 1 / svm_reg``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split`` for the SVM/calibration split.
        ``None`` keeps the previous behaviour (no explicit seed).
    """

    def __init__(self, calibration_share=.4, svm_reg=.1, random_state=None):
        # Fitted components; all stay None until fit() is called.
        self.svm = None
        self.platt_scaler = None
        self.c = None
        self.calibration_share = calibration_share
        self.svm_reg = svm_reg
        self.random_state = random_state

    def fit(self, x_train, s_train, x_val, s_val):
        """Fit the SVM and the Platt scaler, then estimate ``c`` on the validation set.

        Parameters
        ----------
        x_train, s_train
            Training features and their labelled/unlabelled indicator.
        x_val, s_val
            Validation features and indicator; only rows with ``s_val > 0``
            contribute to the estimate of ``c``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``s_val`` contains no labelled sample, because ``c`` would
            otherwise become NaN and silently poison every prediction.
        """
        x_train_svm, x_train_calibration, s_train_svm, s_train_calibration = train_test_split(
            x_train,
            s_train,
            train_size=1 - self.calibration_share,
            random_state=self.random_state,
        )
        # Weight each class by the share of samples that do NOT belong to it,
        # so the rarer class receives the larger weight.
        class_weight = {i: np.mean(s_train_svm != i) for i in np.unique(s_train_svm)}
        self.svm = SVC(C=1 / self.svm_reg, class_weight=class_weight).fit(x_train_svm, s_train_svm)

        # Platt scaling: 1-D logistic regression on the SVM decision values,
        # fitted on data the SVM has not seen.
        svm_df_calibration = self.svm.decision_function(x_train_calibration).reshape(-1, 1)
        self.platt_scaler = LogisticRegression().fit(svm_df_calibration, s_train_calibration)

        labeled_val = np.asarray(s_val) > 0
        if not labeled_val.any():
            raise ValueError('s_val contains no labeled samples; cannot estimate c')
        # c is the mean predicted P(s=1 | x) over the labelled validation samples.
        self.c = self._predict_s_proba(x_val)[labeled_val].mean()
        return self

    def _predict_s_proba(self, x):
        """Return the Platt-scaled probability of ``s = 1`` for each row of ``x``."""
        svm_df = self.svm.decision_function(x).reshape(-1, 1)
        return self.platt_scaler.predict_proba(svm_df)[:, 1]

    def predict_proba(self, x):
        """Return ``P(s=1 | x) / c`` for each row of ``x``, limited to ``[0, 1]``.

        The raw ratio can exceed 1 whenever the scaler outputs more than
        ``c``, so it is clipped to remain a valid probability.
        """
        return np.clip(self._predict_s_proba(x) / self.c, 0.0, 1.0)

    def predict(self, x):
        """Return a boolean array: True where ``predict_proba`` exceeds ``PROBABILITY_THRESHOLD``."""
        return self.predict_proba(x) > PROBABILITY_THRESHOLD

In [3]:
# Fetch the 'mnist_784' dataset from OpenML as a (features, labels) pair.
mnist_x, mnist_y = fetch_openml(name='mnist_784', return_X_y=True)
# Cast the labels to 32-bit integers so they can be compared and multiplied numerically later on.
mnist_y = np.array(mnist_y, dtype=np.int32)

In [4]:
# Keep only the samples whose label is below 2 (the digits 0 and 1 in MNIST),
# which turns the task into a binary problem where the label itself is the target.
bin_minst_mask = np.less(mnist_y, 2)
bin_mnist_x = mnist_x[bin_minst_mask]
bin_mnist_y = mnist_y[bin_minst_mask]

In [5]:
REAL_C = .25  # probability with which each sample is selected for labelling
LABEL_SEED = 0  # fixed seed so the labelled subset is identical on every run


# A dedicated seeded generator keeps the labelling reproducible without
# depending on (or disturbing) NumPy's global RNG state.
label_rng = np.random.RandomState(LABEL_SEED)
# One Bernoulli(REAL_C) draw per sample.
s_mask = label_rng.binomial(1, REAL_C, bin_mnist_y.shape)
# Multiplying by the 0/1 label means only positive samples can end up with s = 1;
# negatives and unselected positives both become 0 ("unlabelled").
bin_mnist_s = bin_mnist_y * s_mask

In [7]:
N_FOLDS = 5
CV_SEED = 0  # fixed so that re-running this cell reproduces the same folds and scores


# Seed NumPy's global RNG: it drives the fold assignment below and, by default,
# the train/calibration split performed inside PositiveUnlabeledClassifier.fit.
np.random.seed(CV_SEED)
# Each sample is assigned independently to one of N_FOLDS folds (fold sizes are only roughly equal).
folds = np.random.randint(N_FOLDS, size=bin_mnist_y.shape)
cs, f1s = [], []
for index_test in range(N_FOLDS):
    # The fold following the test fold (wrapping around) is used to estimate c;
    # every remaining fold is used for training.
    index_val = (index_test + 1) % N_FOLDS
    val_mask = folds == index_val
    test_mask = folds == index_test
    train_mask = ~(val_mask | test_mask)

    x_train = bin_mnist_x[train_mask]
    x_val = bin_mnist_x[val_mask]
    x_test = bin_mnist_x[test_mask]

    # Training and validation only ever see the labelled/unlabelled indicator s ...
    s_train = bin_mnist_s[train_mask]
    s_val = bin_mnist_s[val_mask]

    # ... while the true labels y are used exclusively for scoring on the test fold.
    y_test = bin_mnist_y[test_mask]

    pul = PositiveUnlabeledClassifier()
    pul.fit(x_train, s_train, x_val, s_val)

    cs.append(pul.c)
    f1 = f1_score(y_test, pul.predict(x_test))
    f1s.append(f1)
    print(f'Fold {index_test + 1}: c={pul.c} F1={f1}')

print(f'{N_FOLDS}-fold average c: {np.mean(cs)}, average F1: {np.mean(f1s)}')

Fold 1: c=0.2673770716327467 F1=0.9726071543667418
Fold 2: c=0.25218167150465876 F1=0.9753886010362693
Fold 3: c=0.24541346798156993 F1=0.9774011299435028
Fold 4: c=0.24521687262780292 F1=0.975
Fold 5: c=0.23959627773903044 F1=0.9640780020526856
5-fold average c: 0.24995707229716174, average F1: 0.97289497747984
