In [2]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Generate synthetic data
np.random.seed(0)
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, size=1000)  # Binary classification

# Introduce noise
noise_rate = 0.2
y_noisy = y.copy()
n_noisy = int(noise_rate * y.shape[0])
noise_indices = np.random.choice(y.shape[0], n_noisy, replace=False)
y_noisy[noise_indices] = 1 - y_noisy[noise_indices]  # Flip the labels

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
_, _, y_train_noisy, _ = train_test_split(X, y_noisy, test_size=0.2, random_state=42)

# Estimate noise transition matrix T
T = np.array([[0.8, 0.2], [0.2, 0.8]])  # Example of a known noise transition matrix

# Compute importance weights
def compute_importance_weights(y_true, y_noisy, T):
    weights = np.zeros_like(y_noisy, dtype=float)
    for i in range(len(y_noisy)):
        weights[i] = T[y_true[i], y_noisy[i]] / T[y_noisy[i], y_noisy[i]]
    return weights

weights = compute_importance_weights(y_train, y_train_noisy, T)

# Train a classifier with weighted samples
clf = LogisticRegression()
clf.fit(X_train, y_train_noisy, sample_weight=weights)

# Evaluate the classifier
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", accuracy)
print("MSE of the model:", mean_squared_error(y_test, y_pred))

Accuracy on test set: 0.46
MSE of the model: 0.54
