### Example of a known noise transition matrix T

In [2]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Generate synthetic data
# NOTE(review): y is drawn independently of X, so the features carry no
# information about the labels; accuracy on the clean test labels is therefore
# expected to hover around chance regardless of how label noise is handled.
np.random.seed(0)
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, size=1000)  # Binary classification

# Introduce symmetric label noise: flip an exact fraction `noise_rate` of all
# labels, chosen without replacement.
noise_rate = 0.2
y_noisy = y.copy()
n_noisy = int(noise_rate * y.shape[0])
noise_indices = np.random.choice(y.shape[0], n_noisy, replace=False)
y_noisy[noise_indices] = 1 - y_noisy[noise_indices]  # Flip the labels

# Split features, clean labels and noisy labels in a single call so the three
# arrays are guaranteed to share the same train/test partition (instead of
# relying on two separate calls using an identical random_state).
X_train, X_test, y_train, y_test, y_train_noisy, _y_test_noisy = train_test_split(
    X, y, y_noisy, test_size=0.2, random_state=42
)

# Known (not estimated) symmetric noise transition matrix, tied to noise_rate.
# Rows are indexed by the true class and columns by the noisy class, which is
# how compute_importance_weights reads it. For noise_rate = 0.2 this is the
# matrix [[0.8, 0.2], [0.2, 0.8]].
T = np.array([[1 - noise_rate, noise_rate],
              [noise_rate, 1 - noise_rate]])

# Compute importance weights
def compute_importance_weights(y_true, y_noisy, T):
    """Compute one sample weight per example from the transition matrix ``T``.

    For sample ``i`` the weight is
    ``T[y_true[i], y_noisy[i]] / T[y_noisy[i], y_noisy[i]]``. A sample whose
    noisy label equals its true label therefore gets weight 1 whenever the
    corresponding diagonal entry of ``T`` is non-zero.

    Parameters
    ----------
    y_true : array-like of int
        Class indices used as the row index into ``T``.
    y_noisy : array-like of int
        Class indices used as the column index; must have the same shape as
        ``y_true``.
    T : array-like of shape (n_classes, n_classes)
        Transition matrix.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y_noisy``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_noisy`` have different shapes.

    Notes
    -----
    A zero diagonal entry of ``T`` yields ``inf``/``nan`` weights (NumPy emits a
    RuntimeWarning), exactly as plain division would.

    NOTE(review): the row index comes from ``y_true``, i.e. the clean labels;
    confirm those are really meant to be available at training time.
    """
    # Cast to integer indices so lists, empty inputs and non-int label dtypes
    # all index T positionally.
    true_idx = np.asarray(y_true).astype(np.intp)
    noisy_idx = np.asarray(y_noisy).astype(np.intp)
    if true_idx.shape != noisy_idx.shape:
        raise ValueError(
            "y_true and y_noisy must have the same shape, got "
            f"{true_idx.shape} and {noisy_idx.shape}"
        )

    transition = np.asarray(T, dtype=float)
    # Vectorized lookup replaces the per-sample Python loop.
    return transition[true_idx, noisy_idx] / transition[noisy_idx, noisy_idx]

# Per-sample weights derived from the (clean, noisy) label pairs and the known T.
weights = compute_importance_weights(y_true=y_train, y_noisy=y_train_noisy, T=T)

# Fit on the noisy labels, passing the weights through as sample_weight.
clf = LogisticRegression().fit(X_train, y_train_noisy, sample_weight=weights)

# Score the predictions against the clean test labels.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", accuracy)
# For 0/1 labels the squared error of a prediction is 1 when it is wrong and 0
# when it is right, so this figure is the misclassification rate.
print("MSE of the model:", mean_squared_error(y_test, y_pred))

Accuracy on test set: 0.46
MSE of the model: 0.54


### Estimate T using KDE

In [12]:
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Generate synthetic data
# NOTE(review): y is drawn independently of X, so the features carry no
# information about the labels; accuracy on the clean test labels is therefore
# expected to hover around chance for every model trained below.
np.random.seed(0)
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, size=1000)

# Introduce symmetric label noise: flip an exact fraction `noise_rate` of all
# labels, chosen without replacement.
noise_rate = 0.2
y_noisy = y.copy()
n_noisy = int(noise_rate * y.shape[0])
noise_indices = np.random.choice(y.shape[0], n_noisy, replace=False)
y_noisy[noise_indices] = 1 - y_noisy[noise_indices]

# Split features, clean labels and noisy labels in a single call so the three
# arrays are guaranteed to share the same train/test partition (instead of
# relying on two separate calls using an identical random_state).
X_train, X_test, y_train, y_test, y_train_noisy, _y_test_noisy = train_test_split(
    X, y, y_noisy, test_size=0.2, random_state=42
)

# Kernel Density Estimation (KDE)
def estimate_noise_transition_matrix(X, y_noisy, noise_rate):
    """Build a 2x2 matrix from a Gaussian KDE fitted on ``X``.

    A Gaussian kernel density estimate (bandwidth 0.5) is fitted on ``X`` and
    evaluated at those same points. The ratio ``r`` of the mean density over
    samples with ``y_noisy == 1`` to the mean density over all samples is then
    arranged as ``[[1 - (1 - r), 1 - r], [r, 1 - r]]``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points used both to fit and to evaluate the density.
    y_noisy : numpy.ndarray
        Labels; only the mask ``y_noisy == 1`` is used.
    noise_rate : any
        Accepted but not used by the current implementation.

    Returns
    -------
    numpy.ndarray of shape (2, 2)

    NOTE(review): both rows of the result are equal (up to floating-point
    rounding), so ``T[a, b] / T[b, b]`` is ~1 for every label pair; confirm
    this is the intended estimator.
    NOTE(review): ``r`` is a ratio of mean densities and is not bounded by 1,
    so entries may fall outside [0, 1]; with no sample labelled 1 the mean of
    an empty selection is NaN.
    """
    density_model = KernelDensity(kernel='gaussian', bandwidth=0.5)
    density_model.fit(X)
    # score_samples returns log-densities; exponentiate to get densities.
    point_density = np.exp(density_model.score_samples(X))

    ratio = point_density[y_noisy == 1].mean() / point_density.mean()
    complement = 1 - ratio

    first_row = [1 - complement, complement]
    second_row = [ratio, 1 - ratio]
    return np.array([first_row, second_row])

# Estimate T from the noisy training split only.
T = estimate_noise_transition_matrix(
    X=X_train, y_noisy=y_train_noisy, noise_rate=noise_rate
)

# Compute importance weights
def compute_importance_weights(y_true, y_noisy, T):
    """Compute one sample weight per example from the transition matrix ``T``.

    For sample ``i`` the weight is
    ``T[y_true[i], y_noisy[i]] / T[y_noisy[i], y_noisy[i]]``. A sample whose
    noisy label equals its true label therefore gets weight 1 whenever the
    corresponding diagonal entry of ``T`` is non-zero.

    Parameters
    ----------
    y_true : array-like of int
        Class indices used as the row index into ``T``.
    y_noisy : array-like of int
        Class indices used as the column index; must have the same shape as
        ``y_true``.
    T : array-like of shape (n_classes, n_classes)
        Transition matrix.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y_noisy``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_noisy`` have different shapes.

    Notes
    -----
    A zero diagonal entry of ``T`` yields ``inf``/``nan`` weights (NumPy emits a
    RuntimeWarning), exactly as plain division would.

    NOTE(review): the row index comes from ``y_true``, i.e. the clean labels;
    confirm those are really meant to be available at training time.
    """
    # Cast to integer indices so lists, empty inputs and non-int label dtypes
    # all index T positionally.
    true_idx = np.asarray(y_true).astype(np.intp)
    noisy_idx = np.asarray(y_noisy).astype(np.intp)
    if true_idx.shape != noisy_idx.shape:
        raise ValueError(
            "y_true and y_noisy must have the same shape, got "
            f"{true_idx.shape} and {noisy_idx.shape}"
        )

    transition = np.asarray(T, dtype=float)
    # Vectorized lookup replaces the per-sample Python loop.
    return transition[true_idx, noisy_idx] / transition[noisy_idx, noisy_idx]

def _fit_and_score(model, **fit_kwargs):
    """Fit ``model`` on the noisy training labels and score it on the clean test labels.

    Extra keyword arguments are forwarded to ``model.fit``. Returns the fitted
    model, its test-set predictions and the accuracy of those predictions.
    """
    model.fit(X_train, y_train_noisy, **fit_kwargs)
    predictions = model.predict(X_test)
    return model, predictions, accuracy_score(y_test, predictions)


weights = compute_importance_weights(y_train, y_train_noisy, T)

# Logistic regression on the noisy labels, reweighted per sample.
clf, y_pred, accuracy = _fit_and_score(LogisticRegression(), sample_weight=weights)
print("Logistic Regression with Importance Reweighting Accuracy on test set:", accuracy)

# Baseline: logistic regression on the noisy labels without weights.
clf_logistic, y_pred_logistic, accuracy_logistic = _fit_and_score(LogisticRegression())
print("Logistic Regression Accuracy on clean test set:", accuracy_logistic)

# Baseline: SVM with default settings on the noisy labels.
clf_svm, y_pred_svm, accuracy_svm = _fit_and_score(SVC())
print("SVM Accuracy on clean test set:", accuracy_svm)



Logistic Regression with Importance Reweighting Accuracy on test set: 0.48
Logistic Regression Accuracy on clean test set: 0.48
SVM Accuracy on clean test set: 0.545


### Example of KDE and asymmetric noise

In [13]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Generate synthetic data
# NOTE(review): y is drawn independently of X, so the features carry no
# information about the labels; accuracy on the clean test labels is therefore
# expected to hover around chance for every model trained below.
np.random.seed(0)
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, size=1000)  # Binary classification

# Introduce asymmetric noise
asym_noise_rate_0_to_1 = 0.1  # Fraction of class-0 samples relabelled as 1
asym_noise_rate_1_to_0 = 0.3  # Fraction of class-1 samples relabelled as 0
y_noisy = y.copy()

# Relabel an exact fraction (not a per-sample probability) of the class-0
# samples as 1, chosen without replacement.
indices_0_to_1 = np.where(y == 0)[0]
flip_indices_0_to_1 = np.random.choice(
    indices_0_to_1,
    int(asym_noise_rate_0_to_1 * len(indices_0_to_1)),
    replace=False,
)
y_noisy[flip_indices_0_to_1] = 1

# Relabel an exact fraction of the class-1 samples as 0. Candidates come from
# the clean labels `y`, so samples flipped above are never flipped back.
indices_1_to_0 = np.where(y == 1)[0]
flip_indices_1_to_0 = np.random.choice(
    indices_1_to_0,
    int(asym_noise_rate_1_to_0 * len(indices_1_to_0)),
    replace=False,
)
y_noisy[flip_indices_1_to_0] = 0

# Split features, clean labels and noisy labels in a single call so the three
# arrays are guaranteed to share the same train/test partition (instead of
# relying on two separate calls using an identical random_state).
X_train, X_test, y_train, y_test, y_train_noisy, _y_test_noisy = train_test_split(
    X, y, y_noisy, test_size=0.2, random_state=42
)

# Kernel Density Estimation (KDE)
def estimate_noise_transition_matrix(X, y_noisy, noise_rate):
    """Build a 2x2 matrix from a Gaussian KDE fitted on ``X``.

    A Gaussian kernel density estimate (bandwidth 0.5) is fitted on ``X`` and
    evaluated at those same points. The ratio ``r`` of the mean density over
    samples with ``y_noisy == 1`` to the mean density over all samples is then
    arranged as ``[[1 - (1 - r), 1 - r], [r, 1 - r]]``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points used both to fit and to evaluate the density.
    y_noisy : numpy.ndarray
        Labels; only the mask ``y_noisy == 1`` is used.
    noise_rate : any
        Accepted but not used by the current implementation.

    Returns
    -------
    numpy.ndarray of shape (2, 2)

    NOTE(review): both rows of the result are equal (up to floating-point
    rounding), so ``T[a, b] / T[b, b]`` is ~1 for every label pair; confirm
    this is the intended estimator.
    NOTE(review): ``r`` is a ratio of mean densities and is not bounded by 1,
    so entries may fall outside [0, 1]; with no sample labelled 1 the mean of
    an empty selection is NaN.
    """
    # This cell's import list does not include KernelDensity, so import it here
    # rather than depending on a name left behind by an earlier cell.
    from sklearn.neighbors import KernelDensity

    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
    log_density = kde.score_samples(X)  # log-density at each training point
    density = np.exp(log_density)

    # Ratio of mean densities (see the review notes in the docstring).
    p_y_given_x = np.mean(density[y_noisy == 1]) / np.mean(density)
    p_not_y_given_x = 1 - p_y_given_x

    # 2x2 matrix for binary classification
    T = np.array([[1 - p_not_y_given_x, p_not_y_given_x],
                  [p_y_given_x, 1 - p_y_given_x]])
    return T

# `noise_rate` is never defined in this cell: the only value available would be
# one left over from an earlier cell, and a single rate cannot describe the two
# asymmetric flip rates used here. The estimator does not read the argument,
# so pass None explicitly instead of depending on leaked state.
T = estimate_noise_transition_matrix(X_train, y_train_noisy, noise_rate=None)

# Compute importance weights
def compute_importance_weights(y_true, y_noisy, T):
    """Compute one sample weight per example from the transition matrix ``T``.

    For sample ``i`` the weight is
    ``T[y_true[i], y_noisy[i]] / T[y_noisy[i], y_noisy[i]]``. A sample whose
    noisy label equals its true label therefore gets weight 1 whenever the
    corresponding diagonal entry of ``T`` is non-zero.

    Parameters
    ----------
    y_true : array-like of int
        Class indices used as the row index into ``T``.
    y_noisy : array-like of int
        Class indices used as the column index; must have the same shape as
        ``y_true``.
    T : array-like of shape (n_classes, n_classes)
        Transition matrix.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y_noisy``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_noisy`` have different shapes.

    Notes
    -----
    A zero diagonal entry of ``T`` yields ``inf``/``nan`` weights (NumPy emits a
    RuntimeWarning), exactly as plain division would.

    NOTE(review): the row index comes from ``y_true``, i.e. the clean labels;
    confirm those are really meant to be available at training time.
    """
    # Cast to integer indices so lists, empty inputs and non-int label dtypes
    # all index T positionally.
    true_idx = np.asarray(y_true).astype(np.intp)
    noisy_idx = np.asarray(y_noisy).astype(np.intp)
    if true_idx.shape != noisy_idx.shape:
        raise ValueError(
            "y_true and y_noisy must have the same shape, got "
            f"{true_idx.shape} and {noisy_idx.shape}"
        )

    transition = np.asarray(T, dtype=float)
    # Vectorized lookup replaces the per-sample Python loop.
    return transition[true_idx, noisy_idx] / transition[noisy_idx, noisy_idx]

# Per-sample weights from the (clean, noisy) label pairs and the estimated T.
weights = compute_importance_weights(y_true=y_train, y_noisy=y_train_noisy, T=T)

# Reweighted logistic regression: fit on noisy labels, score on clean test labels.
clf = LogisticRegression().fit(X_train, y_train_noisy, sample_weight=weights)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression with Importance Reweighting Accuracy on test set:", accuracy)

# Baseline: the same model fitted on the noisy labels without any weights.
clf_logistic = LogisticRegression().fit(X_train, y_train_noisy)
y_pred_logistic = clf_logistic.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print("Logistic Regression Accuracy on clean test set with asymmetric noise:", accuracy_logistic)

# Baseline: SVM with default settings fitted on the noisy labels.
clf_svm = SVC().fit(X_train, y_train_noisy)
y_pred_svm = clf_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy on clean test set with asymmetric noise:", accuracy_svm)



Logistic Regression with Importance Reweighting Accuracy on test set: 0.51
Logistic Regression Accuracy on clean test set with asymmetric noise: 0.51
SVM Accuracy on clean test set with asymmetric noise: 0.51
