In [1]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Split the positive examples and some of the unlabeled examples into training and validation sets
X_train_positive, X_val_positive, y_train_positive, y_val_positive = train_test_split(X_positive, np.ones(X_positive.shape[0]), test_size=0.2, random_state=42)
X_train_unlabeled, X_val_unlabeled, y_train_unlabeled, y_val_unlabeled = train_test_split(X_unlabeled, np.zeros(X_unlabeled.shape[0]), test_size=0.2, random_state=42)

# Set the number of folds for cross-validation
k = 5

# Split the positive examples into k folds
positive_folds = np.array_split(X_train_positive, k)

# Estimate the class prior using cross-validation
class_prior_cv = 0
for fold in range(k):
    # Split the remaining (K-1) folds into training and validation sets
    train_indices = [i for i in range(k) if i != fold]
    X_train = np.concatenate([positive_folds[i] for i in train_indices])
    y_train = np.ones(X_train.shape[0])
    
    # Estimate the class prior using the current training set
    current_class_prior = np.mean(y_train)
    
    # Add a small constant to the class prior estimate to avoid division by zero
    current_class_prior = max(min(current_class_prior, 1 - 1e-10), 1e-10)
    
    # Add the current estimate to the running total
    class_prior_cv += current_class_prior

# Average the K estimates of the class prior
class_prior_cv /= k

# Calculate the class ratio
class_ratio = class_prior_cv / (1 - class_prior_cv)

# Train the logistic regression classifier on the positive and unlabeled training examples
X_train = np.concatenate([X_train_positive, X_train_unlabeled])
y_train = np.concatenate([np.ones(X_train_positive.shape[0]), np.zeros(X_train_unlabeled.shape[0])])
classifier = LogisticRegression().fit(X_train, y_train)

# Predict the labels for the validation set
X_val = np.concatenate([X_val_positive, X_val_unlabeled])
y_val_pred = classifier.predict(X_val)

# Binarize the predicted scores
y_val_bin = np.where(y_val_pred >= 0.5, 1, 0)


# Calculate the F1 score for the validation set
f1 = f1_score(np.concatenate([y_val_positive, y_val_unlabeled]), y_val_bin)

# Print the F1 score and Class Prior
print("Class prior: {:.2f}".format(class_prior_cv))
print("F1-score: {:.3f}".format(f1))

Class prior: 1.00
F1-score: 0.436


The code trains a logistic regression classifier on a dataset of diabetes patients and non-diabetes patients. The dataset has two classes: positive and unlabeled. The positive class corresponds to patients with diabetes, while the unlabeled class corresponds to patients whose diabetes status is unknown. The goal is to estimate the F1 score of the classifier on the validation set.

To achieve this, the code first performs PCA on the data to reduce its dimensionality. It then splits the data into positive and unlabeled sets and further splits the positive set and some of the unlabeled set into training and validation sets. Next, the code estimates the class prior by cross-validation. The class prior is the proportion of positive examples in the training set. The code then calculates the class ratio, which is the ratio of the class prior to the complement of the class prior. Finally, the code trains a logistic regression classifier on the positive and unlabeled training examples and predicts the labels for the validation set. It then binarizes the predicted scores, calculates the F1 score for the validation set, and prints the class prior and F1 score.

The F1 score is a measure of the classifier's accuracy, combining the precision and recall. The class prior is the proportion of positive examples in the training set.

In [2]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Split the positive examples and some of the unlabeled examples into training and validation sets
X_train_positive, X_val_positive, y_train_positive, y_val_positive = train_test_split(X_positive, np.ones(X_positive.shape[0]), test_size=0.2, random_state=42)
X_train_unlabeled, X_val_unlabeled, y_train_unlabeled, y_val_unlabeled = train_test_split(X_unlabeled, np.zeros(X_unlabeled.shape[0]), test_size=0.2, random_state=42)


# Fit a logistic regression model on the training set
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Calculate the probability of the positive class for each example
y_prob = clf.predict_proba(X)[:, 1]

# Estimate the class prior using the model-based method
class_prior_model = np.mean(y_prob)

# Calculate the class ratio
class_ratio = class_prior_model / (1 - class_prior_model)

# Train the logistic regression classifier on the positive and unlabeled training examples
X_train = np.concatenate([X_train_positive, X_train_unlabeled])
y_train = np.concatenate([np.ones(X_train_positive.shape[0]), np.zeros(X_train_unlabeled.shape[0])])
classifier = LogisticRegression().fit(X_train, y_train)

# Predict the labels for the validation set
X_val = np.concatenate([X_val_positive, X_val_unlabeled])
y_val_pred = classifier.predict(X_val)

# Binarize the predicted scores
y_val_bin = np.where(y_val_pred >= 0.5, 1, 0)


# Calculate the F1 score for the validation set
f1 = f1_score(np.concatenate([y_val_positive, y_val_unlabeled]), y_val_bin)

# Print the F1 score and Class Prior
print("Class prior: {:.2f}".format(class_prior_model))
print("F1-score: {:.3f}".format(f1))

Class prior: 0.35
F1-score: 0.436


In [3]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

# Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

# Split the positive examples and some of the unlabeled examples into training and validation sets
X_train_positive, X_val_positive, y_train_positive, y_val_positive = train_test_split(X_positive, np.ones(X_positive.shape[0]), test_size=0.2, random_state=42)
X_train_unlabeled, X_val_unlabeled, y_train_unlabeled, y_val_unlabeled = train_test_split(X_unlabeled, np.zeros(X_unlabeled.shape[0]), test_size=0.2, random_state=42)

# Fit a logistic regression model on the training set
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Calculate the probability of the positive class for each example
y_prob = clf.predict_proba(X)[:, 1]


# Estimate the class prior using the MLE method
class_prior_mle = np.mean(y_train)


# Calculate the class ratio
class_ratio = class_prior_mle / (1 - class_prior_mle)

# Train the logistic regression classifier on the positive and unlabeled training examples
X_train = np.concatenate([X_train_positive, X_train_unlabeled])
y_train = np.concatenate([np.ones(X_train_positive.shape[0]), np.zeros(X_train_unlabeled.shape[0])])
classifier = LogisticRegression().fit(X_train, y_train)

# Predict the labels for the validation set
X_val = np.concatenate([X_val_positive, X_val_unlabeled])
y_val_pred = classifier.predict(X_val)

# Binarize the predicted scores
y_val_bin = np.where(y_val_pred >= 0.5, 1, 0)


# Calculate the F1 score for the validation set
f1 = f1_score(np.concatenate([y_val_positive, y_val_unlabeled]), y_val_bin)

# Print the F1 score and Class Prior
print("Class prior: {:.2f}".format(class_prior_mle))
print("F1-score: {:.3f}".format(f1))

Class prior: 0.35
F1-score: 0.436


In [4]:
import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data from the MAT-file
data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
labels = data["labels"]
X = data["X"]

#Perform PCA on the data
pca = PCA(n_components=2)
X = pca.fit_transform(X)

#Split the data into positive and unlabeled sets
positive_indices = np.where(labels == 1)[0]
unlabeled_indices = np.where(labels == 0)[0]
X_positive = X[positive_indices, :]
X_unlabeled = X[unlabeled_indices, :]

#Split the positive examples and some of the unlabeled examples into training and validation sets
X_train_positive, X_val_positive, y_train_positive, y_val_positive = train_test_split(X_positive, np.ones(X_positive.shape[0]), test_size=0.2, random_state=42)
X_train_unlabeled, X_val_unlabeled, y_train_unlabeled, y_val_unlabeled = train_test_split(X_unlabeled, np.zeros(X_unlabeled.shape[0]), test_size=0.2, random_state=42)

# Train the logistic regression classifier on the positive and unlabeled training examples
X_train = np.concatenate([X_train_positive, X_train_unlabeled])
y_train = np.concatenate([np.ones(X_train_positive.shape[0]), np.zeros(X_train_unlabeled.shape[0])])
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Calculate the probability of the positive class for each example
y_prob = clf.predict_proba(X)[:, 1]


#Load the external dataset
external_data = sio.loadmat('/home/kofi/Downloads/Matlab/diabetes.mat')
external_labels = external_data["labels"]

#Calculate the class prior from the external dataset
class_prior_external = np.mean(external_labels)

#Calculate the class ratio
class_ratio = class_prior_external / (1 - class_prior_external)

# Train the logistic regression classifier on the positive and unlabeled training examples
X_train = np.concatenate([X_train_positive, X_train_unlabeled])
y_train = np.concatenate([np.ones(X_train_positive.shape[0]), np.zeros(X_train_unlabeled.shape[0])])
classifier = LogisticRegression().fit(X_train, y_train)

#Predict the labels for the validation set
X_val = np.concatenate([X_val_positive, X_val_unlabeled])
y_val_pred = classifier.predict(X_val)

#Binarize the predicted scores
y_val_bin = np.where(y_val_pred >= 0.5, 1, 0)

#Calculate the F1 score for the validation set
f1 = f1_score(np.concatenate([y_val_positive, y_val_unlabeled]), y_val_bin)

#Print the F1 score and Class Prior
print("Class prior: {:.2f}".format(class_prior_external))
print("F1-score: {:.3f}".format(f1))

Class prior: 0.35
F1-score: 0.436
