In [1]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Split the positive examples and some of the unlabeled examples into training and validation sets
X_train_positive, X_val_positive, y_train_positive, y_val_positive = train_test_split(X_positive, np.ones(X_positive.shape[0]), test_size=0.2, random_state=42)
X_train_unlabeled, X_val_unlabeled, y_train_unlabeled, y_val_unlabeled = train_test_split(X_unlabeled, np.zeros(X_unlabeled.shape[0]), test_size=0.2, random_state=42)

# Set the number of folds for cross-validation
k = 5

# Split the positive examples into k folds
positive_folds = np.array_split(X_train_positive, k)

# Estimate the class prior using cross-validation
class_prior_cv = 0
for fold in range(k):
    # Split the remaining (K-1) folds into training and validation sets
    train_indices = [i for i in range(k) if i != fold]
    X_train = np.concatenate([positive_folds[i] for i in train_indices])
    y_train = np.ones(X_train.shape[0])
    
    # Estimate the class prior using the current training set
    current_class_prior = np.mean(y_train)
    
    # Add the current estimate to the running total
    class_prior_cv += current_class_prior

# Average the K estimates of the class prior
class_prior_cv /= k


# Perform PIG preprocessing on the unlabeled set
y_train_unlabeled_pig = (1 - class_prior_cv) / class_prior_cv * y_train_unlabeled

# Combine the positive and preprocessed unlabeled sets in the training set
X_train = np.concatenate((X_train_positive, X_train_unlabeled))
y_train = np.concatenate((y_train_positive, y_train_unlabeled_pig))

# Fit a logistic regression model on the training set
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Use the validation set to evaluate the performance of the classifier
y_pred = clf.predict(np.concatenate((X_val_positive, X_val_unlabeled)))
y_true = np.concatenate((y_val_positive, np.zeros(X_val_unlabeled.shape[0])))
f1 = f1_score(y_true, y_pred)

print("Class prior: {:.2f}".format(class_prior_cv))
print("F1-score on validation set: {:.2f}".format(f1))


Class prior: 1.00
F1-score on validation set: 0.44


Explanation to the above code
The main changes to the original code are:

    1. Estimate the class prior from the positive examples.(Cross Validation Method)
    2. Perform PIG preprocessing on the unlabeled set by applying a correction factor to the labels. The formula for the correction factor is (1 - class_prior) / class_prior, which takes into account the estimated class prior.
    3. Combine the positive and preprocessed unlabeled sets in the training set.
    4. Calculate the F1-score instead of the accuracy on the validation set.

The resulting output is the estimated class prior and the F1-score of the classifier on the validation set, which are printed to the console. Note that the F1-score is a better metric than accuracy for imbalanced datasets like this one, since it takes into account both precision and recall.

In [6]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Split the positive examples and some of the unlabeled examples into training and validation sets
X_train_positive, X_val_positive, y_train_positive, y_val_positive = train_test_split(X_positive, np.ones(X_positive.shape[0]), test_size=0.2, random_state=42)
X_train_unlabeled, X_val_unlabeled, y_train_unlabeled, y_val_unlabeled = train_test_split(X_unlabeled, np.zeros(X_unlabeled.shape[0]), test_size=0.2, random_state=42)

# Fit a logistic regression model on the positive and some of the unlabeled examples
clf = LogisticRegression()
clf.fit(np.concatenate((X_train_positive, X_train_unlabeled)), np.concatenate((y_train_positive, y_train_unlabeled)))

# Calculate the probability of the positive class for each example
y_prob = clf.predict_proba(X)[:, 1]

# Estimate the class prior using the model-based method
class_prior_model = np.mean(y_prob)

# Perform PIG preprocessing on the unlabeled set
y_train_unlabeled_pig = (1 - class_prior_model) / class_prior_model * y_train_unlabeled

# Combine the positive and preprocessed unlabeled sets in the training set
X_train = np.concatenate((X_train_positive, X_train_unlabeled))
y_train = np.concatenate((y_train_positive, y_train_unlabeled_pig))

# Fit a logistic regression model on the training set
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Use the validation set to evaluate the performance of the classifier
y_pred = clf.predict(np.concatenate((X_val_positive, X_val_unlabeled)))
y_true = np.concatenate((y_val_positive, np.zeros(X_val_unlabeled.shape[0])))
f1 = f1_score(y_true, y_pred)

print("Class prior (model-based): {:.2f}".format(class_prior_model))
print("F1-score on validation set: {:.2f}".format(f1))


Class prior (model-based): 0.35
F1-score on validation set: 0.44


Explanation to the above code
The main changes to the original code are:

    1. Estimate the class prior from the positive examples. (USing the Model -based Estimation method)
    2. Perform PIG preprocessing on the unlabeled set by applying a correction factor to the labels. The formula for the correction factor is (1 - class_prior) / class_prior, which takes into account the estimated class prior.
    3. Combine the positive and preprocessed unlabeled sets in the training set.
    4. Calculate the F1-score instead of the accuracy on the validation set.

The resulting output is the estimated class prior and the F1-score of the classifier on the validation set, which are printed to the console. Note that the F1-score is a better metric than accuracy for imbalanced datasets like this one, since it takes into account both precision and recall.

In [7]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Split the positive examples and some of the unlabeled examples into training and validation sets
X_train_positive, X_val_positive, y_train_positive, y_val_positive = train_test_split(X_positive, np.ones(X_positive.shape[0]), test_size=0.2, random_state=42)
X_train_unlabeled, X_val_unlabeled, y_train_unlabeled, y_val_unlabeled = train_test_split(X_unlabeled, np.zeros(X_unlabeled.shape[0]), test_size=0.5, random_state=42)

# Fit a logistic regression model on the positive training set and unlabeled negative set to obtain a probability estimate
clf = LogisticRegression()
X_train_negative = X_train_unlabeled[:len(X_train_positive), :]
y_train_negative = np.zeros(len(X_train_negative))
X_train = np.concatenate((X_train_positive, X_train_negative))
y_train = np.concatenate((y_train_positive, y_train_negative))
clf.fit(X_train, y_train)

# Calculate the probability of the positive class for each example
y_prob = clf.predict_proba(np.concatenate((X_train_positive, X_train_unlabeled)))[:, 1]

# Load the external dataset
external_data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
external_labels = external_data["labels"]


# Estimate the class prior using the MLE method
class_prior_mle = np.mean(y_prob)

# Perform PIG preprocessing on the unlabeled set
y_train_unlabeled_pig = (1 - class_prior_mle) / class_prior_mle * y_train_unlabeled

# Combine the positive and preprocessed unlabeled sets in the training set
X_train = np.concatenate((X_train_positive, X_train_unlabeled))
y_train = np.concatenate((y_train_positive, y_train_unlabeled_pig))

# Fit a logistic regression model on the training set
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Use the validation set to evaluate the performance of the classifier
y_pred = clf.predict(np.concatenate((X_val_positive, X_val_unlabeled)))
y_true = np.concatenate((y_val_positive, np.zeros(X_val_unlabeled.shape[0])))
f1 = f1_score(y_true, y_pred)

print("Class prior (MLE): {:.2f}".format(class_prior_mle))
print("F1-score on validation set: {:.2f}".format(f1))


Class prior (MLE): 0.49
F1-score on validation set: 0.46


Explanation to the above code
The main changes to the original code are:

    1. Estimate the class prior from the positive examples. (USing the MLE method)
    2. Perform PIG preprocessing on the unlabeled set by applying a correction factor to the labels. The formula for the correction factor is (1 - class_prior) / class_prior, which takes into account the estimated class prior.
    3. Combine the positive and preprocessed unlabeled sets in the training set.
    4. Calculate the F1-score instead of the accuracy on the validation set.

The resulting output is the estimated class prior and the F1-score of the classifier on the validation set, which are printed to the console. Note that the F1-score is a better metric than accuracy for imbalanced datasets like this one, since it takes into account both precision and recall.

In [8]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Split the positive examples and some of the unlabeled examples into training and validation sets
X_train_positive, X_val_positive, y_train_positive, y_val_positive = train_test_split(X_positive, np.ones(X_positive.shape[0]), test_size=0.2, random_state=42)
X_train_unlabeled, X_val_unlabeled, y_train_unlabeled, y_val_unlabeled = train_test_split(X_unlabeled, np.zeros(X_unlabeled.shape[0]), test_size=0.2, random_state=42)

# Calculate the probability of the positive class for each example
y_prob = clf.predict_proba(X)[:, 1]

# Load the external dataset
external_data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
external_labels = external_data["labels"]

# Calculate the class prior from the external dataset
class_prior_external = np.mean(external_labels)




# Perform PIG preprocessing on the unlabeled set
y_train_unlabeled_pig = (1 - class_prior_external) / class_prior_external * y_train_unlabeled

# Combine the positive and preprocessed unlabeled sets in the training set
X_train = np.concatenate((X_train_positive, X_train_unlabeled))
y_train = np.concatenate((y_train_positive, y_train_unlabeled_pig))

# Fit a logistic regression model on the training set
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Use the validation set to evaluate the performance of the classifier
y_pred = clf.predict(np.concatenate((X_val_positive, X_val_unlabeled)))
y_true = np.concatenate((y_val_positive, np.zeros(X_val_unlabeled.shape[0])))
f1 = f1_score(y_true, y_pred)

print("Class prior (external): {:.2f}".format(class_prior_external))
print("F1-score on validation set: {:.2f}".format(f1))

Class prior (external): 0.35
F1-score on validation set: 0.44
