In [9]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

import numpy as np
from sklearn.model_selection import train_test_split

# Set the percentage of positive examples to select
percentages = [20, 30, 40, 50]

# Initialize the F1-score array
f1_scores = []

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(X_positive.shape[0] * percent / 100)

    # Select a random subset of positive examples
    selected_indices = np.random.choice(X_positive.shape[0], size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Calculate the class prior for the selected positive examples
    class_prior = np.mean(y_selected)

    # Perform PIG learning using the class prior
    y_pig = y_selected / class_prior
    print(f"Class prior for {percent}% of positive examples: {class_prior:.4f}")
    
    # Combine the selected positive examples with the unlabeled examples
    X_combined = np.concatenate([X_selected, X_unlabeled])
    y_combined = np.concatenate([y_pig, np.zeros(X_unlabeled.shape[0])])

    # Split the combined data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

    # Train a logistic regression model on the training data
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Make predictions on the validation data
    y_pred = model.predict(X_val)

    # Calculate the F1-score and store it in the array
    f1 = f1_score(y_val, y_pred)
    f1_scores.append(f1)

# Print the F1-scores for each percentage
for i, percent in enumerate(percentages):
    print(f"F1-score for {percent}% of positive examples: {f1_scores[i]:.4f}")

 


Class prior for 20% of positive examples: 1.0000
Class prior for 30% of positive examples: 1.0000
Class prior for 40% of positive examples: 1.0000
Class prior for 50% of positive examples: 1.0000
F1-score for 20% of positive examples: 0.0000
F1-score for 30% of positive examples: 0.1111
F1-score for 40% of positive examples: 0.0000
F1-score for 50% of positive examples: 0.1667


This code modifies the previous code to perform PIG learning using the calculated class prior. It divides the selected positive examples by the class prior to obtain the PIG labels. It then combines the PIG labels with the unlabeled examples, splits the combined data into training and validation sets, trains a logistic regression model on the training data, makes predictions on the validation data, and calculates the F1-score.

The code stores the F1-scores in an array and prints them for each percentage. The F1-score is a measure of the model's accuracy that takes into account both precision and recall, and it is commonly used in binary classification problems.

Note that the code assumes that the positive examples are labeled with 1 and the unlabeled examples are labeled with 0. If this is not the case, you may need to modify the code

To perform post-processing using CAL, we need to first train a classifier on the training set using PU learning with PIG. We will then use the trained classifier to predict the probabilities of each sample in the validation set being positive. We will then apply the class prior to these probabilities to obtain the corrected probabilities. Finally, we will use a threshold to convert these probabilities into binary labels and evaluate the F1-score.

Here's the code to perform post-processing using CAL:

In [10]:
# Set the threshold for converting probabilities to binary labels
threshold = 0.5

# Initialize empty lists to store the F1-scores and class priors for each percentage of positive examples
f1_scores_cal = []
class_priors = []

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(X_positive.shape[0] * percent / 100)

    # Select a random subset of positive examples
    selected_indices = np.random.choice(X_positive.shape[0], size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Calculate the class prior for the selected positive examples
    class_prior = np.mean(y_selected)
    class_priors.append(class_prior)

    # Perform PIG learning on the selected positive examples and the unlabeled examples
    y_combined_pig_new = np.zeros(X_combined.shape[0])
    y_combined_pig_new[:num_examples] = 1
    pig_classifier = LogisticRegression(random_state=42)
    pig_classifier.fit(X_combined, y_combined_pig_new)

    # Use the trained PIG classifier to predict the probabilities of each sample in the validation set being positive
    y_val_prob = pig_classifier.predict_proba(X_val)[:, 1]

    # Apply the class prior to the probabilities to obtain the corrected probabilities
    y_val_prob_cal = y_val_prob / class_prior

    # Convert the probabilities to binary labels using the threshold
    y_val_pred = (y_val_prob_cal >= threshold).astype(int)

    # Calculate the F1-score
    f1_score_cal = f1_score(y_val, y_val_pred)

    # Print the F1-score
    print(f"F1-score for {percent}% of positive examples using CAL: {f1_score_cal:.4f}, Class prior: {class_prior:.4f}")

    # Append the F1-score to the list of F1-scores
    f1_scores_cal.append(f1_score_cal)


F1-score for 20% of positive examples using CAL: 0.0000, Class prior: 1.0000
F1-score for 30% of positive examples using CAL: 0.0667, Class prior: 1.0000
F1-score for 40% of positive examples using CAL: 0.1250, Class prior: 1.0000
F1-score for 50% of positive examples using CAL: 0.2632, Class prior: 1.0000


This code sets a threshold for converting probabilities to binary labels, initializes an empty list to store F1-scores, and iterates over selected percentages of positive examples. For each percentage, it selects a random subset of positive examples, calculates the class prior, performs PIG learning on the selected positive examples and the unlabeled examples, uses the trained PIG classifier to predict the probabilities of each sample in the validation set being positive, applies the class prior to the probabilities to obtain the corrected probabilities, converts the probabilities to binary labels using the threshold, calculates the F1-score, prints the F1-score, and appends the F1-score to the list of F1-scores.

This code implements a method called "Confidence-Aware Learning" (CAL), which is a variant of Pseudo-Labeling that incorporates a correction factor based on the estimated class prior. CAL is designed to address the issue of class imbalance that can occur when training a classifier on a dataset with a small number of positive examples.

Overall, this code evaluates the performance of the PIG classifier with the CAL correction factor on a validation set for different percentages of positive examples in the training set.


This code will print the F1-score for each percentage of positive examples using CAL and store the F1-scores in the f1_scores_cal list.

To perform method modification using the Bekker and Davis (BD) method, we need to first train a classifier on the training set using PU learning with PIG. We will then use the trained classifier to predict the probabilities of each sample in the validation set being positive. We will then apply the class prior to these probabilities to obtain the corrected probabilities. We will then modify these probabilities using the BD method, which involves scaling the probabilities by a factor that depends on the class prior and a tuning parameter alpha. Finally, we will use a threshold to convert these probabilities into binary labels and evaluate the F1-score.

Here's the code to perform method modification using the BD method:

In [11]:
# Set the threshold for converting probabilities to binary labels
threshold = 0.5

# Set the tuning parameter alpha
alpha = 1

# Initialize an empty list to store the F1-scores for each percentage of positive examples
f1_scores_bd = []

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(X_positive.shape[0] * percent / 100)

    # Select a random subset of positive examples
    selected_indices = np.random.choice(X_positive.shape[0], size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Calculate the class prior for the selected positive examples
    class_prior = np.mean(y_selected)

    # Perform PIG learning on the selected positive examples and the unlabeled examples
    y_combined_pig = np.concatenate([y_selected, np.zeros(X_unlabeled.shape[0])])
    X_combined_pig = np.concatenate([X_selected, X_unlabeled])
    pig_classifier = LogisticRegression(random_state=42)
    pig_classifier.fit(X_combined_pig, y_combined_pig)


    # Use the trained PIG classifier to predict the probabilities of each sample in the validation set being positive
    y_val_prob = pig_classifier.predict_proba(X_val)[:, 1]

    # Apply the class prior to the probabilities to obtain the corrected probabilities
    y_val_prob_cal = y_val_prob / class_prior

    # Modify the probabilities using the BD method
    y_val_prob_bd = y_val_prob_cal / (y_val_prob_cal + (1 - class_prior) / class_prior * (1 - y_val_prob_cal) ** alpha)

    # Convert the probabilities to binary labels using the threshold
    y_val_pred = (y_val_prob_bd >= threshold).astype(int)

    # Calculate the F1-score
    f1_score_bd = f1_score(y_val, y_val_pred)

    # Print the F1-score and class prior
    print(f"F1-score for {percent}% of positive examples using BD: {f1_score_bd:.4f}, class prior: {class_prior:.4f}")

    # Append the F1-score to the list of F1-scores
    f1_scores_bd.append(f1_score_bd)

F1-score for 20% of positive examples using BD: 0.3718, class prior: 1.0000
F1-score for 30% of positive examples using BD: 0.3718, class prior: 1.0000
F1-score for 40% of positive examples using BD: 0.3718, class prior: 1.0000
F1-score for 50% of positive examples using BD: 0.3718, class prior: 1.0000


This code calculates the F1-scores for each percentage of positive examples using the BD method for PIG. It uses the same data and parameters as the previous code, and applies the BD method to modify the probabilities obtained from the PIG classifier. It also prints the F1-score for each percentage of positive examples and stores them in a list.