In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

##### Generate the synthetic data

In [2]:
# Generate synthetic data
X, y = make_classification(n_samples=1000, n_features=2, n_classes=2, flip_y=0, n_redundant=0, n_informative=2, random_state=1)
y = np.where(y == 0, -1, 1)

# Function to introduce label noise
def introduce_label_noise(y, noise_rate_positive, noise_rate_negative):
    y_noisy = y.copy()
    indices_class_1 = np.where(y == 1)[0]
    indices_class_minus_1 = np.where(y == -1)[0]

    n_flip_class_1 = int(noise_rate_positive * len(indices_class_1))
    flip_indices_class_1 = np.random.choice(indices_class_1, size=n_flip_class_1, replace=False)

    n_flip_class_minus_1 = int(noise_rate_negative * len(indices_class_minus_1))
    flip_indices_class_minus_1 = np.random.choice(indices_class_minus_1, size=n_flip_class_minus_1, replace=False)

    y_noisy[flip_indices_class_1] = -y_noisy[flip_indices_class_1]
    y_noisy[flip_indices_class_minus_1] = -y_noisy[flip_indices_class_minus_1]

    return y_noisy

# Introduce variable noise rates
noise_rate_positive = 0.1 # 10% noise for class 1
noise_rate_negative = 0.3 # 30% noise for class -1
y_noisy = introduce_label_noise(y, noise_rate_positive, noise_rate_negative)

## **Method of Unbiased Estimators**

*This method modifies the loss function to create an unbiased estimator of the true loss despite noisy labels.*

In [3]:
# Unbiased logistic loss function
def unbiased_log_loss(y_true, y_pred, rho_plus, rho_minus):
    log_loss = np.log(1 + np.exp(-y_true * y_pred))
    log_loss_neg = np.log(1 + np.exp(y_true * y_pred))
    unbiased_loss = ((1 - rho_minus) * log_loss - rho_plus * log_loss_neg) / (1 - rho_plus - rho_minus)
    return np.mean(unbiased_loss)

# Logistic loss function and its gradient
def logistic_loss_and_grad(X, y, w):
    z = np.dot(X, w)
    loss = np.log(1 + np.exp(-y * z))
    grad = -y[:, np.newaxis] * X / (1 + np.exp(y * z))[:, np.newaxis]
    return np.mean(loss), np.mean(grad, axis=0)

# Custom training loop that minimizes the unbiased loss using gradient descent.
def train_custom_logistic_regression(X, y, rho_plus, rho_minus, learning_rate=0.01, epochs=1000):
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    for epoch in range(epochs):
        z = np.dot(X, w)
        unbiased_loss_value = unbiased_log_loss(y, z, rho_plus, rho_minus)
        _, grad = logistic_loss_and_grad(X, y, w)
        w -= learning_rate * grad
        if epoch % 100 == 0:
            print(f'Epoch {epoch}, Unbiased Loss: {unbiased_loss_value}')
    return w

# Prepare data for training and testing
X_train, X_test, y_train_clean, y_test_clean = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train_noisy, y_test_noisy = train_test_split(X, y_noisy, test_size=0.3, random_state=42)

# Train the model
weights = train_custom_logistic_regression(X_train, y_train_noisy, rho_plus=noise_rate_positive, rho_minus=noise_rate_negative)

# Predict using the trained weights
y_pred = np.sign(np.dot(X_test, weights))
print(f"Accuracy on clean test set with unbiased estimator loss: {accuracy_score(y_test_clean, y_pred)}")
print("MSE of the model:", mean_squared_error(y_test_clean, y_pred))


Epoch 0, Unbiased Loss: 0.6931471805599452
Epoch 100, Unbiased Loss: 0.6124540599287328
Epoch 200, Unbiased Loss: 0.5747898423591798
Epoch 300, Unbiased Loss: 0.5550735886831
Epoch 400, Unbiased Loss: 0.543753776439849
Epoch 500, Unbiased Loss: 0.5367972077466316
Epoch 600, Unbiased Loss: 0.5323040341710654
Epoch 700, Unbiased Loss: 0.5292936373654412
Epoch 800, Unbiased Loss: 0.5272210776479991
Epoch 900, Unbiased Loss: 0.5257649039989336
Accuracy on clean test set with unbiased estimator loss: 0.8566666666666667
MSE of the model: 0.5733333333333334


## **Method of Label-dependent Costs**

*This method adjusts the cost (or weight) of different types of errors in the loss function to account for label noise.*

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Calculate the label-dependent costs based on noise rates using the correct formula
alpha_star = (0.5 - noise_rate_negative) / (1 - noise_rate_positive - noise_rate_negative)
costs = {1: alpha_star, -1: 1 - alpha_star}

# Instantiate the StandardScaler
scaler = StandardScaler()
# Scale the features to normalize the data to have zero mean and unit variance
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train_clean, y_test_clean = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_test, y_train_noisy, y_test_noisy = train_test_split(X_scaled, y_noisy, test_size=0.3, random_state=42)

# Instantiate and train the Logistic Regression model with label-dependent costs
logistic_model = LogisticRegression(class_weight=costs)
logistic_model.fit(X_train, y_train_noisy)

# Evaluate the model on the clean test set
y_pred = logistic_model.predict(X_test)
print(f"Accuracy on clean test set: {accuracy_score(y_test_clean, y_pred)}")

print("MSE of the logistic regression model:", mean_squared_error(y_test_clean, y_pred))


Accuracy on clean test set: 0.8333333333333334
MSE of the logistic regression model: 0.6666666666666666
