PIG method

In [1]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# "PIG" experiment: for several labeled-positive percentages, estimate a
# "class prior" via cross-validated precision, rescale the positive labels by
# it, train a logistic regression and report the F1-score on a held-out split.

# Hoisted out of the loop body, where it used to be re-imported on every pass.
from sklearn.preprocessing import LabelEncoder

# --- Configuration -----------------------------------------------------------
# NOTE(review): machine-specific absolute path; point it at your local copy.
DATA_PATH = '/home/kofi/Downloads/Matlab/diabetes.mat'
SEED = 42            # seeds the positive-subset sampling so reruns reproduce
PRIOR_FLOOR = 1e-12  # lower bound on the divisor used to rescale labels

# Load the data from the MAT-file
data = sio.loadmat(DATA_PATH)
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and "unlabeled" sets.
# The labels are flattened first so the row indices are right whether the
# MAT-file stored them as a column (n, 1) or as a row (1, n).
# NOTE(review): the "unlabeled" set is exactly the label-0 rows; positives that
# are not sampled below are dropped rather than hidden among the unlabeled
# rows. Confirm this is the intended PU setup.
labels_flat = np.ravel(labels)
positive_indices = np.flatnonzero(labels_flat == 1)
unlabeled_indices = np.flatnonzero(labels_flat == 0)
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Set the percentage of positive examples to select
percentages = [20, 30, 40, 50]

# Initialize the F1-score array
f1_scores = []

# Dedicated, seeded generator: the sampled subsets are identical on every run.
rng = np.random.default_rng(SEED)
n_positive = X_positive.shape[0]
unlabeled_targets = np.zeros(X_unlabeled.shape[0])

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(n_positive * percent / 100)

    # Select a random subset of positive examples (without replacement)
    selected_indices = rng.choice(n_positive, size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Combine the selected positive examples with the unlabeled examples
    X_combined = np.concatenate([X_selected, X_unlabeled])
    y_combined = np.concatenate([y_selected, unlabeled_targets])

    # "Class prior" as used by this notebook: the mean cross-validated
    # precision of a logistic regression separating selected positives from
    # the unlabeled rows.
    model = LogisticRegression()
    class_prior_scores = cross_val_score(model, X_combined, y_combined, cv=5, scoring="precision")
    class_prior = np.mean(class_prior_scores)

    print(f"Class prior for {percent}% of positive examples: {class_prior:.4f}")

    # Rescale the positive labels by the class prior. The precision can be
    # exactly 0 (no positive predictions in any fold), which used to trigger a
    # divide-by-zero warning and produce inf labels; the floor avoids that.
    # NOTE(review): LabelEncoder below maps the two distinct target values back
    # to 0/1, so the magnitude of the rescaled label does not reach the
    # classifier.
    y_pig = y_selected / max(class_prior, PRIOR_FLOOR)

    # Same feature matrix as above; only the targets change.
    y_combined = np.concatenate([y_pig, unlabeled_targets])

    # Split the combined data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

    # Encode the target variable y as integer classes (0 = unlabeled, 1 = positive)
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.transform(y_val)

    # Train a logistic regression model on the training data
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Make predictions on the validation data
    y_pred = model.predict(X_val)

    # Calculate the F1-score and store it in the array
    f1 = f1_score(y_val, y_pred)
    f1_scores.append(f1)

# Print the F1-scores for each percentage
for percent, f1 in zip(percentages, f1_scores):
    print(f"F1-score for {percent}% of positive examples: {f1:.4f}")

Class prior for 20% of positive examples: 0.0000
Class prior for 30% of positive examples: 0.0000
Class prior for 40% of positive examples: 0.1667
Class prior for 50% of positive examples: 0.7100
F1-score for 20% of positive examples: 0.0000
F1-score for 30% of positive examples: 0.0000
F1-score for 40% of positive examples: 0.0769
F1-score for 50% of positive examples: 0.2222


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  y_pig = y_selected / class_prior
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  y_pig = y_selected / class_prior
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


To perform the CAL (Class prior Adjusted Labeling) method for PU learning with the calculated class priors, we need to adjust the probability threshold of the logistic regression classifier based on the class prior. Here's how we can modify the existing code to incorporate the CAL method:

In [2]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# "CAL" experiment: for several labeled-positive percentages, estimate a
# "class prior" via cross-validated precision, fit a logistic regression on
# selected positives vs. unlabeled rows, derive a prior-based threshold for the
# unlabeled rows, and report the F1-score on the selected positives.

# --- Configuration -----------------------------------------------------------
# NOTE(review): machine-specific absolute path; point it at your local copy.
DATA_PATH = '/home/kofi/Downloads/Matlab/diabetes.mat'
SEED = 42  # seeds the positive-subset sampling so reruns reproduce

# Load the data from the MAT-file
data = sio.loadmat(DATA_PATH)
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and "unlabeled" sets.
# The labels are flattened first so the row indices are right whether the
# MAT-file stored them as a column (n, 1) or as a row (1, n).
labels_flat = np.ravel(labels)
positive_indices = np.flatnonzero(labels_flat == 1)
unlabeled_indices = np.flatnonzero(labels_flat == 0)
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Set the percentage of positive examples to select
percentages = [20, 30, 40, 50]

# Initialize the F1-score array
f1_scores = []

# Dedicated, seeded generator: the sampled subsets are identical on every run.
rng = np.random.default_rng(SEED)
n_positive = X_positive.shape[0]

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(n_positive * percent / 100)

    # Select a random subset of positive examples (without replacement)
    selected_indices = rng.choice(n_positive, size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Combine the selected positive examples with the unlabeled examples
    X_combined = np.concatenate([X_selected, X_unlabeled])
    y_combined = np.concatenate([y_selected, np.zeros(X_unlabeled.shape[0])])

    # "Class prior" as used by this notebook: the mean cross-validated
    # precision of a logistic regression on the combined set.
    model = LogisticRegression()
    class_prior_scores = cross_val_score(model, X_combined, y_combined, cv=5, scoring="precision")
    class_prior = np.mean(class_prior_scores)

    print(f"Class prior for {percent}% of positive examples: {class_prior:.4f}")

    # Train a logistic regression classifier on the combined dataset
    model.fit(X_combined, y_combined)

    # Calculate the predicted probabilities for the unlabeled examples
    proba_unlabeled = model.predict_proba(X_unlabeled)[:, 1]

    # Adjust the probability threshold based on the class prior. A prior of 1
    # would divide by zero, so it is mapped to an unreachable threshold
    # (no unlabeled row gets labeled positive), matching the limit behaviour.
    # NOTE(review): this ratio exceeds 1 whenever class_prior > 0.5, in which
    # case no probability can reach it. Confirm the intended formula.
    threshold = class_prior / (1 - class_prior) if class_prior < 1 else np.inf

    # Label the unlabeled examples based on the adjusted probability threshold.
    # Kept under its own name: it previously shared `y_pred` with the F1 input
    # and was overwritten before use. It still does not feed into the F1 below.
    y_unlabeled_pred = (proba_unlabeled >= threshold).astype(int)

    # Predict the labels for all positive examples (default 0.5 decision rule)
    # and keep the predictions for the selected ones.
    y_pred_positive = model.predict(X_positive)
    y_pred = y_pred_positive[selected_indices]

    # Every selected example is a known positive, so the reference labels are
    # all ones. The F1 is therefore measured on training positives only.
    y_true = np.ones(num_examples)

    # Calculate the F1-score for the labeled examples
    f1 = f1_score(y_true, y_pred, pos_label=1)

    # Add the F1-score to the array
    f1_scores.append(f1)

    print(f"F1-score for {percent}% of positive examples: {f1:.4f}")



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Class prior for 20% of positive examples: 0.0000
F1-score for 20% of positive examples: 0.0000
Class prior for 30% of positive examples: 0.5000
F1-score for 30% of positive examples: 0.0488
Class prior for 40% of positive examples: 0.2500
F1-score for 40% of positive examples: 0.0893
Class prior for 50% of positive examples: 0.6944
F1-score for 50% of positive examples: 0.2368


  _warn_prf(average, modifier, msg_start, len(result))


The modification suggested by Bekker & Davis for PU learning adjusts the class prior to account for the uncertainty about the true class labels of the unlabeled examples. Specifically, the modified class prior is given by:

p_c = (1 - alpha) * p(y=c | labeled) + alpha * p(y=c)

where p(y=c | labeled) is the empirical class frequency in the labeled examples, p(y=c) is the class prior estimated from the unlabeled and labeled examples, and alpha is a parameter that controls the amount of uncertainty incorporated from the unlabeled examples.

In [3]:
# Modified-class-prior experiment: blend the labeled-set class frequency with
# the cross-validated estimate, use the blend as logistic-regression class
# weights, and report the F1-score on the selected positives.
# Relies on X_positive, X_unlabeled and percentages from the previous cell.

# Initialize the alpha value (weight given to the cross-validated estimate)
alpha = 0.5

# Seed for the positive-subset sampling so reruns reproduce
SEED = 42
rng = np.random.default_rng(SEED)

# Initialize the F1-score array. Without this reset the scores were appended to
# the list left over from the previous cell and accumulated on every rerun.
f1_scores = []

n_positive = X_positive.shape[0]

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(n_positive * percent / 100)

    # Select a random subset of positive examples (without replacement)
    selected_indices = rng.choice(n_positive, size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Combine the selected positive examples with the unlabeled examples
    X_combined = np.concatenate([X_selected, X_unlabeled])
    y_combined = np.concatenate([y_selected, np.zeros(X_unlabeled.shape[0])])

    # Calculate the empirical class frequency in the labeled examples.
    # y_selected is all ones, so this is 1.0 whenever num_examples > 0.
    p_labeled = np.mean(y_selected)

    # "Class prior" as used by this notebook: the mean cross-validated
    # precision of a logistic regression on the combined set.
    model = LogisticRegression()
    class_prior_scores = cross_val_score(model, X_combined, y_combined, cv=5, scoring="precision")
    p_unlabeled = np.mean(class_prior_scores)

    # Calculate the modified class prior
    p_c = (1 - alpha) * p_labeled + alpha * p_unlabeled

    print(f"Modified class prior for {percent}% of positive examples: {p_c:.4f}")

    # Train a logistic regression model with the modified class prior used as
    # per-class sample weights.
    model = LogisticRegression(class_weight={1: p_c, 0: 1 - p_c})
    model.fit(X_combined, y_combined)

    # Predict the labels for all positive examples, then keep the predictions
    # for the selected ones.
    y_pred_positive = model.predict(X_positive)
    y_pred = y_pred_positive[selected_indices]

    # Every selected example is a known positive, so the reference labels are
    # all ones. The F1 is therefore measured on training positives only.
    y_true = np.ones(num_examples)

    # Calculate the F1-score for the labeled examples
    f1 = f1_score(y_true, y_pred, pos_label=1)

    # Add the F1-score to the array
    f1_scores.append(f1)

    print(f"F1-score for {percent}% of positive examples: {f1:.4f}")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Modified class prior for 20% of positive examples: 0.5000
F1-score for 20% of positive examples: 0.0000
Modified class prior for 30% of positive examples: 0.6000
F1-score for 30% of positive examples: 0.0952


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Modified class prior for 40% of positive examples: 0.6500
F1-score for 40% of positive examples: 0.2459
Modified class prior for 50% of positive examples: 0.8381
F1-score for 50% of positive examples: 0.9150


In this code, we first set the alpha parameter to 0.5, which gives equal weight to the empirical class frequency in the labeled examples and to the cross-validated class prior estimate. We then calculate the modified class prior p_c using this value of alpha, and train a logistic regression model that uses p_c and 1 - p_c as its class weights. Finally, we predict the labels for all positive examples, select the labels and predictions for the selected positive examples, and calculate the F1-score for these labeled examples.

The code below is to be ignored.


In [4]:
# Iterative labeling experiment: start from all positives as the labeled set,
# fit a class-weighted logistic regression, then move the most confidently
# positive unlabeled rows into the labeled set.
# Relies on X_positive and X_unlabeled from the earlier cells.

# Initialize the alpha value
alpha = 0.5

X_all = np.concatenate((X_positive, X_unlabeled), axis=0)


# Find all examples that are not in the positive set.
# Membership is tested per row via hashable tuples. The earlier
# `x not in X_positive` on a 2-D array is an element-wise "any value matches"
# test, so a row sharing a single coordinate with any positive row was treated
# as positive; it was also a Python-level O(n*m) scan.
positive_rows = set(map(tuple, X_positive.tolist()))
is_negative = np.fromiter(
    (tuple(row) not in positive_rows for row in X_all.tolist()),
    dtype=bool,
    count=X_all.shape[0],
)
X_negative = X_all[is_negative]
y_negative = np.zeros(X_negative.shape[0])


# Initialize the labeled set with all positive examples
X_labeled = X_positive.copy()
y_labeled = np.ones(X_positive.shape[0])

# Initialize the unlabeled set with all negative examples.
# NOTE(review): this rebinds the X_unlabeled created by an earlier cell and the
# loop below shrinks it further; re-run that earlier cell before re-running
# anything that depends on the original X_unlabeled.
X_unlabeled = X_negative.copy()
y_unlabeled = np.zeros(X_negative.shape[0])

# Initialize the F1-scores array
f1_scores = []

# Initialize the stopping criterion
stop = False

while not stop:
    # Combine the labeled and unlabeled sets
    X_combined = np.concatenate([X_labeled, X_unlabeled])
    y_combined = np.concatenate([y_labeled, y_unlabeled])

    # Calculate the empirical class frequency in the labeled examples
    # (y_labeled only ever contains ones, so this is 1.0).
    p_labeled = np.mean(y_labeled)

    # "Class prior" as used by this notebook: the mean cross-validated
    # precision of a logistic regression on the combined set.
    model = LogisticRegression()
    class_prior_scores = cross_val_score(model, X_combined, y_combined, cv=5, scoring="precision")

    # Calculate the average class prior across all folds
    p_unlabeled = np.mean(class_prior_scores)

    # Calculate the modified class prior
    p_c = (1 - alpha) * p_labeled + alpha * p_unlabeled

    # Train a logistic regression model with the modified class prior
    model = LogisticRegression(class_weight={1: p_c, 0: 1 - p_c})
    model.fit(X_combined, y_combined)

    # Predict the labels for all examples
    y_pred_all = model.predict(X_combined)

    # Calculate the F1-score for the labeled examples (they come first in
    # X_combined, hence the leading slice).
    f1 = f1_score(y_labeled, y_pred_all[:y_labeled.shape[0]], pos_label=1)

    # Add the F1-score to the array
    f1_scores.append(f1)

    print(f"F1-score for {X_labeled.shape[0]} labeled examples: {f1:.4f}")

    # Stop if the F1-score has plateaued or if the labeled set contains all positive examples.
    # NOTE(review): X_labeled starts as a copy of X_positive, so the second
    # condition already holds on the first pass and the loop runs exactly once.
    # Setting `stop` does not skip the rest of this pass: the selection below
    # still executes once before the loop exits. Confirm the intended criterion.
    if len(f1_scores) > 1 and f1_scores[-1] == f1_scores[-2]:
        stop = True
    elif X_labeled.shape[0] == X_positive.shape[0]:
        stop = True

    # Rank the unlabeled examples by their predicted probability of being positive
    proba_all = model.predict_proba(X_unlabeled)[:, 1]
    indices = np.argsort(proba_all)[::-1]

    # Select the most confident examples to add to the labeled set
    num_examples = int(alpha * (X_labeled.shape[0] + X_unlabeled.shape[0]))
    selected_indices = indices[:num_examples]
    X_selected = X_unlabeled[selected_indices]
    # Size the labels from what was actually selected: the slice above yields
    # fewer than num_examples rows when the unlabeled pool is smaller, and the
    # labeled features and targets must stay the same length.
    y_selected = np.ones(selected_indices.shape[0])

    # Remove the selected examples from the unlabeled set
    X_unlabeled = np.delete(X_unlabeled, selected_indices, axis=0)
    y_unlabeled = np.delete(y_unlabeled, selected_indices)

    # Add the selected examples to the labeled set
    X_labeled = np.concatenate([X_labeled, X_selected])
    y_labeled = np.concatenate([y_labeled, y_selected])


F1-score for 268 labeled examples: 0.9771
