In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

In [3]:
# Step 1: Generate a synthetic dataset
# X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, flip_y=0)
X, y = make_classification(n_samples=1000, n_features=2, n_classes=2, flip_y=0, n_redundant=0, n_informative=2, random_state=1)
# X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=5, n_clusters_per_class=2, class_sep=0.5, flip_y=0, random_state=1)
y = np.where(y == 0, -1, 1)  # Convert labels to {-1, +1}

In [4]:
# Different way to implement noise -> increasing noise rate variability across classes
def introduce_label_noise(y, noise_rate_positive, noise_rate_negative):
    y_noisy = y.copy()
    # Indices where class is 1
    indices_class_1 = np.where(y == 1)[0]
    # Indices where class is -1
    indices_class_minus_1 = np.where(y == -1)[0]

    # Determining which indices to flip based on the noise rate for class 1
    n_flip_class_1 = int(noise_rate_positive * len(indices_class_1))
    flip_indices_class_1 = np.random.choice(indices_class_1, size=n_flip_class_1, replace=False)

    # Determining which indices to flip based on the noise rate for class -1
    n_flip_class_minus_1 = int(noise_rate_negative * len(indices_class_minus_1))
    flip_indices_class_minus_1 = np.random.choice(indices_class_minus_1, size=n_flip_class_minus_1, replace=False)

    # Flipping the selected indices
    y_noisy[flip_indices_class_1] = -y_noisy[flip_indices_class_1]
    y_noisy[flip_indices_class_minus_1] = -y_noisy[flip_indices_class_minus_1]

    return y_noisy

# Introduce variable noise rates
noise_rate_positive = 0.1  # 10% noise for class 1
noise_rate_negative = 0.3  # 30% noise for class -1
y_noisy = introduce_label_noise(y, noise_rate_positive, noise_rate_negative)

# **Method of Unbiased Estimators**

In [5]:
# Step 3: Calculate the unbiased estimator of the loss (example with logistic loss)
def unbiased_log_loss(y_true, y_pred, rho_plus, rho_minus):
    # Logistic loss
    log_loss = np.log(1 + np.exp(-y_true * y_pred))
    log_loss_neg = np.log(1 + np.exp(y_true * y_pred))
    # Unbiased estimator
    unbiased_loss = ((1 - rho_minus) * log_loss - rho_plus * log_loss_neg) / (1 - rho_plus - rho_minus)
    return np.mean(unbiased_loss)

In [6]:
# Custom logistic regression model to use unbiased loss
class CustomLogisticRegression(LogisticRegression):
    def fit(self, X, y, rho_plus, rho_minus):
        super().fit(X, y)
        initial_pred = self.decision_function(X) # computes the decision scores for each input
        custom_loss = unbiased_log_loss(y, initial_pred, rho_plus, rho_minus) # Computes the unbiased logistic loss, taking the true labels y (which could be noisy), the raw predictions, and the noise rates rho_plus and rho_minus as inputs.
        print(f"Custom Loss: {custom_loss}")
        return self

In [7]:
# Step 4: Modify the training process
# Assuming rho_+1 = rho_-1 for simplicity
model = CustomLogisticRegression()
# model.fit(X, y_noisy, rho_plus=noise_rate, rho_minus=noise_rate)
model.fit(X, y_noisy, rho_plus=noise_rate_positive, rho_minus=noise_rate_negative)

Custom Loss: 0.5044865145354687


In [8]:
# Step 5: Train and evaluate the model
X_train, X_test, y_train_clean, y_test_clean = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train_noisy, y_test_noisy = train_test_split(X, y_noisy, test_size=0.3, random_state=42)

# model.fit(X_train, y_train_noisy, rho_plus=noise_rate, rho_minus=noise_rate)
model.fit(X_train, y_train_noisy, rho_plus=noise_rate_positive, rho_minus=noise_rate_negative)
y_pred = model.predict(X_test)
print(f"Accuracy on clean test set with unbiased estimator loss: {accuracy_score(y_test_clean, y_pred)}") # How accurate the model is in classifing correctly despite the initial noise in the training labels.

print("MSE of the model:", mean_squared_error(y_test_clean, y_pred))

Custom Loss: 0.4775489526384133
Accuracy on clean test set with unbiased estimator loss: 0.8233333333333334
MSE of the model: 0.7066666666666667


In [10]:
# Train a logistic regression model on noisy labels with standard loss
standard_model = LogisticRegression()
standard_model.fit(X_train, y_train_noisy)

# Evaluate the model on the clean test set
y_pred_standard = standard_model.predict(X_test)
print(f"Accuracy on clean test set with standard loss: {accuracy_score(y_test_clean, y_pred_standard)}")

print("MSE of the logistic regression model:", mean_squared_error(y_test_clean, y_pred_standard))

Accuracy on clean test set with standard loss: 0.8233333333333334
MSE of the logistic regression model: 0.7066666666666667


# **Method of Label-dependent Costs**

In [12]:
# Calculate the label-dependent costs based on noise rates

# alpha_star = (1 - noise_rate) / (1 - 2 * noise_rate) if noise_rate != 0.5 else 1
# alpha_star = max(min((1 - noise_rate) / (1 - 2 * noise_rate), 0.99), 0.01)
alpha_star = max(min(((1 - noise_rate_positive + noise_rate_negative) / 2), 0.99), 0.01)
costs = {1: alpha_star, -1: 1 - alpha_star}


In [13]:
# Instantiate the StandardScaler
scaler = StandardScaler()
# Scale the features to normalize the data to have zero mean and unit variance
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_test, y_train_noisy, y_test_noisy = train_test_split(X_scaled, y_noisy, test_size=0.3, random_state=42)

# y_train_noisy = introduce_label_noise(y_train, noise_rate=noise_rate)
# y_train_noisy = introduce_label_noise(y_train, noise_rate_positive, noise_rate_negative)

In [14]:
# Instantiate the SVC with a specific 'C' value
svc_model = SVC(kernel='linear', class_weight=costs, C=1.0)

# Fit the model
svc_model.fit(X_train, y_train_noisy)

# Evaluate the model on the clean test set
y_pred = svc_model.predict(X_test)
print(f"Accuracy on clean test set: {accuracy_score(y_test, y_pred)}")

print("MSE of the model:", mean_squared_error(y_test, y_pred))

Accuracy on clean test set: 0.74
MSE of the model: 1.04


In [15]:
# Instantiate and train the Logistic Regression model with label-dependent costs
logistic_model = LogisticRegression(class_weight=costs)
logistic_model.fit(X_train, y_train_noisy)

# Evaluate the model on the clean test set
y_pred = logistic_model.predict(X_test)
print(f"Accuracy on clean test set: {accuracy_score(y_test, y_pred)}")

print("MSE of the logistic regression model:", mean_squared_error(y_test, y_pred))

Accuracy on clean test set: 0.74
MSE of the logistic regression model: 1.04
