In [None]:
# default_exp transforms

In [None]:
# export
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# export
def prepFeatures(X):
    """
    Z-score normalize the columns of an nxd feature matrix.

    Returns the normalized matrix together with the fitted StandardScaler.
    """
    scaler = StandardScaler(with_mean=True, with_std=True)
    return scaler.fit_transform(X), scaler

In [None]:
# export
def trainOOBClassifier(X,y, modelFactory=lambda: DecisionTreeClassifier(),n_estimators=100):
    """
    Train a bagged ensemble of <n_estimators> models predicting the probability that each
    instance came from the labeled positive, rather than the unlabeled mixture, set.

    Required Arguments:
        - X : ndarray shape (n,d) : feature matrix
        - y : ndarray shape (n,)  : positive v. unlabeled component assignments for each instance
    Optional Arguments:
        - modelFactory : zero-argument callable returning a fresh sklearn-style model instance (has fit, predict_proba, ... functions) : default DecisionTreeClassifier
        - n_estimators : size of the ensemble : default 100

    Returns
        - transform_scores : ndarray (n,) : probability that each instance came from labeled positive set, calculated using out-of-bag scores (column 1 of the ensemble's oob_decision_function_)
        - auc_pu : float : the AUROC of this non-traditional classifier

    Note
        Per the scikit-learn documentation, an instance that is never left out of a bootstrap
        sample (possible for very small ensembles) gets NaN in oob_decision_function_.
    """
    # z-score normalization is applied to the whole dataset prior to training
    X,_ = prepFeatures(X)
    # The base model is passed positionally on purpose: its keyword was `base_estimator`
    # before scikit-learn 1.2 and is `estimator` afterwards (the old keyword was removed
    # in 1.4), whereas the first positional slot is the same in every release.
    clf = BaggingClassifier(modelFactory(), n_estimators=n_estimators, n_jobs=-1,
                            max_samples=X.shape[0],max_features=X.shape[1], bootstrap=True,
                            bootstrap_features=False, oob_score=True).fit(X,y)
    # every bootstrap sample has n rows and uses all d features; scores come only from
    # the estimators that did not see the instance during training
    transform_scores = clf.oob_decision_function_[:,1]
    auc_pu = roc_auc_score(y, transform_scores)
    return transform_scores, auc_pu

In [None]:
# export
def trainKFoldClassifier(X,y, modelFactory=lambda: SVC(probability=True, degree=1),KFoldValue=10):
    """
    Train model using K-fold cross-validation; every instance is scored by the model
    trained on the folds that do not contain it.

    Required Arguments:
        - X : array-like shape (n,d) : feature matrix
        - y : array-like shape (n,)  : positive v. unlabeled component assignments for each instance
    Optional Arguments:
        - modelFactory : zero-argument callable returning a fresh sklearn-style model instance (has fit, predict_proba, ... functions) : default SVC(probability=True, degree=1). Note that SVC's default kernel is 'rbf', for which scikit-learn ignores `degree`.
        - KFoldValue : number of folds to use in k-fold cross-validation : default 10

    Returns
        - transform_scores : ndarray (n,) : probability that each instance came from labeled positive set (column 1 of predict_proba)
        - auc_pu : float : the AUROC of this non-traditional classifier

    """
    # Coerce labels to an ndarray: a plain list has no .shape, and indexing a pandas
    # Series with an integer array may be label-based rather than positional.
    y = np.asarray(y)
    transform_scores = np.zeros(y.shape, dtype=float)
    # z-score normalization applied globally rather than within each k-fold iteration
    X,_ = prepFeatures(X)
    # folds are contiguous, unshuffled index ranges
    kf = KFold(n_splits=KFoldValue, shuffle=False)
    for train_indices, val_indices in kf.split(X):
        # a fresh model per fold so nothing learned on one split leaks into another
        clf = modelFactory()
        clf.fit(X[train_indices], y[train_indices])
        transform_scores[val_indices] = clf.predict_proba(X[val_indices])[:,1]
    auc_pu = roc_auc_score(y, transform_scores)
    return transform_scores, auc_pu

Test the k-fold and out-of-bag (OOB) transform functions on the scikit-learn wine dataset.

In [None]:
# hide
from sklearn.datasets import load_wine
X,y = load_wine(return_X_y=True)
# binary labels for the test: wine class 1 versus every other class
y = y == 1

# Out-of-bag transform: one score per instance and a valid AUROC
transform_scores, auc_pu = trainOOBClassifier(X,y,modelFactory=lambda: DecisionTreeClassifier())
assert transform_scores.shape == y.shape
assert 0 <= auc_pu <= 1
auc_oob = auc_pu

# K-fold transform with the default model factory
transform_scores, auc_pu = trainKFoldClassifier(X,y)
assert transform_scores.shape == y.shape
assert 0 <= auc_pu <= 1

# show both AUROCs (OOB first, k-fold second); only the last expression of a cell is displayed
auc_oob, auc_pu

In [None]:
# export
def getOptimalTransform(X,y):
    """
    Train the 6 univariate transforms from (Zeiberg 2020) and return the transform scores and auc_pu for the best transform

    The bagged models (nn_*, rt) are scored out-of-bag with trainOOBClassifier; the SVMs
    are scored with k-fold cross-validation via trainKFoldClassifier.

    Required Arguments:
        - X : ndarray shape (n,d) : feature matrix
        - y : ndarray shape (n,)  : positive v. unlabeled component assignments for each instance
    Returns:
        - transform_scores : ndarray (n,) : probability that each instance came from labeled positive set
        - auc_pu : float : the AUROC of this non-traditional classifier

    The transform with the highest auc_pu wins (the earliest one on ties); if no transform
    scores above 0.5 the "rt" transform is returned.
    """
    transform_scores, auc_pu = {},{}
    # (name, model factory, n) where n is the ensemble size for bagged models and the
    # number of folds for the SVMs. The number in each nn_*/svm_* name is the hidden-layer
    # width / polynomial degree; previously all three networks were built with width 1 and
    # both SVMs with degree 1, so only three distinct transforms were actually trained.
    # NOTE(review): the two-hidden-layer shape (k,k) follows the existing nn_1 entry —
    # confirm the intended depth against the paper.
    models = [("nn_1",lambda: MLPClassifier(hidden_layer_sizes=(1,1)), 100),
              ("nn_5",lambda: MLPClassifier(hidden_layer_sizes=(5,5)), 100),
              ("nn_25",lambda: MLPClassifier(hidden_layer_sizes=(25,25)), 100),
              ("rt",lambda: DecisionTreeClassifier(), 1000),
              ("svm_1",lambda: SVC(kernel="poly", degree=1, probability=True),10),
              ("svm_2",lambda: SVC(kernel="poly", degree=2, probability=True),10)]
    for model_name, model_factory, n in tqdm(models,total=len(models),desc="Training univariate transforms"):
        if "svm" in model_name:
            scores, auc = trainKFoldClassifier(X,y,modelFactory=model_factory,KFoldValue=n)
        else:
            scores, auc = trainOOBClassifier(X,y,modelFactory=model_factory, n_estimators=n)
        transform_scores[model_name] = scores
        auc_pu[model_name] = auc
    # Find the best transform; "rt" is the fallback when nothing beats chance level
    best_transform, best_auc = "rt", .5
    for model_name, auc in auc_pu.items():
        if auc > best_auc:
            best_transform, best_auc = model_name, auc
    return transform_scores[best_transform], auc_pu[best_transform]

In [None]:
# hide
best_scores, best_auc = getOptimalTransform(X,y)
assert best_scores.shape == y.shape
# preview only the first few scores instead of dumping the full score vector into the output
best_scores[:5], best_auc

HBox(children=(FloatProgress(value=0.0, description='Training univariate transforms', max=6.0, style=ProgressS…




(array([1.07515643e-03, 3.60473235e-02, 6.55373493e-03, 3.83785950e-06,
        6.81979083e-02, 4.70577594e-05, 1.19391382e-04, 1.58549760e-04,
        1.32628179e-03, 2.12317286e-03, 1.04722689e-03, 2.00573312e-03,
        1.68003911e-03, 1.31197355e-03, 2.51206510e-05, 2.58612120e-04,
        1.08824953e-04, 2.55565139e-03, 1.02574625e-04, 7.49275296e-03,
        7.27393431e-02, 1.44502935e-01, 2.40942095e-01, 4.77789304e-01,
        5.75741165e-01, 4.47679771e-01, 1.06029002e-02, 9.60032412e-02,
        1.35980309e-01, 6.18680163e-02, 3.53915213e-02, 2.94314925e-03,
        4.16966501e-01, 1.76887103e-02, 6.16097460e-02, 2.39477871e-01,
        8.50807299e-03, 5.20566248e-02, 6.77669266e-01, 4.63120650e-04,
        8.98054684e-02, 6.54562839e-02, 6.33869719e-04, 1.29062791e-01,
        4.72159644e-01, 2.15235928e-04, 3.48517071e-03, 2.66736987e-02,
        5.75185414e-03, 7.15878198e-04, 7.19519092e-02, 2.60933711e-03,
        9.66662216e-04, 1.56537969e-04, 1.86453116e-03, 5.844158