Preprocessing (PIG)

In [2]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Set the percentage of positive examples to select
percentages = [20, 30, 40, 50]

# Initialize the F1-score array
f1_scores = []

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(X_positive.shape[0] * percent / 100)

    # Select a random subset of positive examples
    selected_indices = np.random.choice(X_positive.shape[0], size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Combine the selected positive examples with the unlabeled examples
    X_combined = np.concatenate([X_selected, X_unlabeled])
    y_combined = np.concatenate([y_selected, np.zeros(X_unlabeled.shape[0])])

    # Split the combined data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

    from sklearn.preprocessing import LabelEncoder

    # Encode the target variable y
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.transform(y_val)

    # Train a logistic regression model on the training data
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict the class probabilities for the validation data
    y_prob = model.predict_proba(X_val)

    # Calculate the class prior using the model-based approach
    class_prior = y_prob[:, 1].mean()

    print(f"Class prior for {percent}% of positive examples: {class_prior:.4f}")

    # Perform PIG learning using the class prior
    y_pig = y_selected / class_prior

    # Combine the selected positive examples with the unlabeled examples
    X_combined = np.concatenate([X_selected, X_unlabeled])
    y_combined = np.concatenate([y_pig, np.zeros(X_unlabeled.shape[0])])

    # Train a logistic regression model on the training data
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Make predictions on the validation data
    y_pred = model.predict(X_val)

    # Calculate the F1-score and store it in the array
    f1 = f1_score(y_val, y_pred)
    f1_scores.append(f1)

# Print the F1-scores for each percentage
for i, percent in enumerate(percentages):
    print(f"F1-score for {percent}% of positive examples: {f1_scores[i]:.4f}")


Class prior for 20% of positive examples: 0.0979
Class prior for 30% of positive examples: 0.1457
Class prior for 40% of positive examples: 0.1868
Class prior for 50% of positive examples: 0.2279
F1-score for 20% of positive examples: 0.0000
F1-score for 30% of positive examples: 0.0000
F1-score for 40% of positive examples: 0.3333
F1-score for 50% of positive examples: 0.3077


The process described in the code below is consistent with the Confidence-Aware Learning (CAL) approach in Positive-Unlabeled (PU) learning, where the class prior estimation method used is the model-based estimation method. In this approach, a logistic regression classifier is trained on a combined dataset of selected positive examples and unlabeled examples. The predicted probabilities for the unlabeled examples are used to estimate the class prior, and an adjusted probability threshold is calculated based on this estimate. The unlabeled examples are then labeled based on this adjusted threshold, and the F1-score is calculated for the labeled examples. The process is repeated for different percentages of positive examples to select, and the results (class prior and F1-score) are printed to the console.

In [3]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Set the percentage of positive examples to select
percentages = [20, 30, 40, 50]

# Initialize the F1-score array
f1_scores = []

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(X_positive.shape[0] * percent / 100)

    # Select a random subset of positive examples
    selected_indices = np.random.choice(X_positive.shape[0], size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Combine the selected positive examples with the unlabeled examples
    X_combined = np.concatenate([X_selected, X_unlabeled])
    y_combined = np.concatenate([y_selected, np.zeros(X_unlabeled.shape[0])])

    # Train a logistic regression classifier on the combined dataset
    model = LogisticRegression()
    model.fit(X_combined, y_combined)

    # Calculate the predicted probabilities for the unlabeled examples
    proba_unlabeled = model.predict_proba(X_unlabeled)[:, 1]

    # Calculate the class prior using the model-based approach
    class_prior = model.intercept_[0]

    print(f"Class prior for {percent}% of positive examples: {class_prior:.4f}")

    # Adjust the probability threshold based on the class prior
    threshold = class_prior / (1 - class_prior)

    # Label the unlabeled examples based on the adjusted probability threshold
    y_pred = (proba_unlabeled >= threshold).astype(int)

    # Create an array with labels for all positive examples
    y_true_all = np.concatenate([np.ones(X_positive.shape[0]), np.zeros(X_unlabeled.shape[0])])

    # Select the labels for the selected positive examples
    y_true = y_true_all[selected_indices]

    # Predict the labels for all positive examples
    y_pred_all = np.concatenate([model.predict(X_positive), np.zeros(X_unlabeled.shape[0])])

    # Select the predictions for the selected positive examples
    y_pred = y_pred_all[selected_indices]

    # Calculate the F1-score for the labeled examples
    f1 = f1_score(y_true, y_pred, pos_label=1)

    # Add the F1-score to the array
    f1_scores.append(f1)

    print(f"F1-score for {percent}% of positive examples: {f1:.4f}")


Class prior for 20% of positive examples: -2.2524
F1-score for 20% of positive examples: 0.0000
Class prior for 30% of positive examples: -1.8687
F1-score for 30% of positive examples: 0.0488
Class prior for 40% of positive examples: -1.6212
F1-score for 40% of positive examples: 0.1709
Class prior for 50% of positive examples: -1.3526
F1-score for 50% of positive examples: 0.1389


In [5]:
#Initialize the alpha value
alpha = 0.5

#Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(X_positive.shape[0] * percent / 100)

# Select a random subset of positive examples
selected_indices = np.random.choice(X_positive.shape[0], size=num_examples, replace=False)
X_selected = X_positive[selected_indices]
y_selected = np.ones(num_examples)

# Combine the selected positive examples with the unlabeled examples
X_combined = np.concatenate([X_selected, X_unlabeled])
y_combined = np.concatenate([y_selected, np.zeros(X_unlabeled.shape[0])])

# Calculate the empirical class frequency in the labeled examples
p_labeled = np.mean(y_selected)

# Calculate the class prior using the model-based approach
model = LogisticRegression()
model.fit(X_combined, y_combined)
p_unlabeled = model.predict_proba(X_unlabeled)[:,1].mean()

# Calculate the modified class prior
p_c = (1 - alpha) * p_labeled + alpha * p_unlabeled

print(f"Modified class prior for {percent}% of positive examples: {p_c:.4f}")

# Train a logistic regression model with the modified class prior
model = LogisticRegression(class_weight={1: p_c, 0: 1 - p_c})
model.fit(X_combined, y_combined)

# Predict the labels for all positive examples
y_pred_all = np.concatenate([model.predict(X_positive), np.zeros(X_unlabeled.shape[0])])

# Select the predictions for the selected positive examples
y_pred = y_pred_all[selected_indices]

# Create an array with labels for all positive examples
y_true_all = np.concatenate([np.ones(X_positive.shape[0]), np.zeros(X_unlabeled.shape[0])])

# Select the labels for the selected positive examples
y_true = y_true_all[selected_indices]

# Calculate the F1-score for the labeled examples
f1 = f1_score(y_true, y_pred, pos_label=1)

# Add the F1-score to the array
f1_scores.append(f1)

print(f"F1-score for {percent}% of positive examples: {f1:.4f}")


Modified class prior for 50% of positive examples: 0.5898
F1-score for 50% of positive examples: 0.3952


Method Modification

In [6]:
# Initialize the alpha value
alpha = 0.5

# Iterate over the selected percentages
for percent in percentages:
    # Calculate the number of positive examples to select
    num_examples = int(X_positive.shape[0] * percent / 100)

    # Select a random subset of positive examples
    selected_indices = np.random.choice(X_positive.shape[0], size=num_examples, replace=False)
    X_selected = X_positive[selected_indices]
    y_selected = np.ones(num_examples)

    # Combine the selected positive examples with the unlabeled examples
    X_combined = np.concatenate([X_selected, X_unlabeled])
    y_combined = np.concatenate([y_selected, np.zeros(X_unlabeled.shape[0])])

    # Calculate the empirical class frequency in the labeled examples
    p_labeled = np.mean(y_selected)

    # Calculate the class prior using cross-validation
    model = LogisticRegression()
    class_prior_scores = cross_val_score(model, X_combined, y_combined, cv=5, scoring="precision")

    # Calculate the average class prior across all folds
    p_unlabeled = np.mean(class_prior_scores)

    # Calculate the modified class prior
    p_c = (1 - alpha) * p_labeled + alpha * p_unlabeled

    print(f"Modified class prior for {percent}% of positive examples: {p_c:.4f}")

    # Train a logistic regression model with the modified class prior
    model = LogisticRegression(class_weight={1: p_c, 0: 1 - p_c})
    model.fit(X_combined, y_combined)

    # Predict the labels for all positive examples
    y_pred_all = np.concatenate([model.predict(X_positive), np.zeros(X_unlabeled.shape[0])])

    # Select the predictions for the selected positive examples
    y_pred = y_pred_all[selected_indices]

    # Create an array with labels for all positive examples
    y_true_all = np.concatenate([np.ones(X_positive.shape[0]), np.zeros(X_unlabeled.shape[0])])

    # Select the labels for the selected positive examples
    y_true = y_true_all[selected_indices]

    # Calculate the F1-score for the labeled examples
    f1 = f1_score(y_true, y_pred, pos_label=1)

    # Add the F1-score to the array
    f1_scores.append(f1)

    print(f"F1-score for {percent}% of positive examples: {f1:.4f}\n")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Modified class prior for 20% of positive examples: 0.5000
F1-score for 20% of positive examples: 0.0000

Modified class prior for 30% of positive examples: 0.6500
F1-score for 30% of positive examples: 0.2418

Modified class prior for 40% of positive examples: 0.8167
F1-score for 40% of positive examples: 0.7910

Modified class prior for 50% of positive examples: 0.8000
F1-score for 50% of positive examples: 0.8498

