In [1]:
import numpy as np
import numpy.linalg as lng
from sklearn.linear_model import LogisticRegression
#Load dataset
dataset = np.load("./mnist_dataset.npz")
#training set
Xtr = dataset["Xtr"]
#training noisy label
Str = dataset["Str"]
#test set
Xts = dataset["Xts"]
#test noisy label
Yts = dataset["Yts"]

In [2]:
#estimate the weight of importance reweighting
#prob is probibilty, rho0 and rho1 are flip rate
def estimateBeta(S,prob,rho0,rho1):
    n = len(S)
    beta = np.zeros((n,1))
    for i in range(n):
        if S[i]==1:
            beta[i] = (prob[i][1]-rho0)/((1-rho0-rho1)*prob[i][1]+1e-5)
        else:
            beta[i] = (prob[i][0]-rho1)/((1-rho0-rho1)*(prob[i][0])+1e-5)
    
    return beta

In [4]:
#compute the accuracy
def computeAccuracy(Y,pred_Y):
    acc = 0.0
    for i in range(len(Y)):
        if Y[i] == pred_Y[i]:
            acc += 1.0
    return acc/len(Y)

In [6]:
from sklearn.model_selection import train_test_split
#randomly split the data and get 80% of trainning samples
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  

In [7]:
# implement LogisticRegression without label noise methods
clf = LogisticRegression().fit(X_train, y_train.flatten())
w = clf.coef_[0]
b = clf.intercept_
#predict the result
pred_Y = clf.predict(Xts)
accTrue = computeAccuracy(Yts,pred_Y)
print('The accuracy of LR model trained on clean data is %.4f'%accTrue)

The accuracy of LR model trained on clean data is 0.7620


In [26]:
from sklearn.model_selection import train_test_split
#split the train data, select 80& as samples
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
#tuning c parameters in LR
accuarcy_c=[]
C = [0.001,0.01,0.1,1, 10, 100,1000]
for i in range(10):
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    clf = LogisticRegression().fit(X_train, y_train.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_c.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,i)

The accuracy of IRLR model trained on noisy data is 0.7620 0
The accuracy of IRLR model trained on noisy data is 0.7620 1
The accuracy of IRLR model trained on noisy data is 0.7620 2
The accuracy of IRLR model trained on noisy data is 0.7620 3
The accuracy of IRLR model trained on noisy data is 0.7620 4
The accuracy of IRLR model trained on noisy data is 0.7620 5
The accuracy of IRLR model trained on noisy data is 0.7620 6
The accuracy of IRLR model trained on noisy data is 0.7620 7
The accuracy of IRLR model trained on noisy data is 0.7620 8
The accuracy of IRLR model trained on noisy data is 0.7620 9


###### classify image without using label noise method
from sklearn.model_selection import train_test_split
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
#tuning c in LR
accuarcy_c=[]
C = [0.001,0.01,0.1,1, 10, 100,1000]
for c in C:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    clf = LogisticRegression(C=c).fit(X_train, y_train.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_c.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,c)

In [25]:
#unbaised estimator
# max_iter : int, default: 100
#Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge.
from sklearn.model_selection import train_test_split 
#tuning c in LR
accuarcy_max_iter =[]
max_iter  = [10,50,80,100,110,120,150]
for i in max_iter:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)      
    clf = LogisticRegression(max_iter =i).fit(X_train, y_train.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_max_iter.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,i)

The accuracy of IRLR model trained on noisy data is 0.7880 10
The accuracy of IRLR model trained on noisy data is 0.7670 50
The accuracy of IRLR model trained on noisy data is 0.7635 80
The accuracy of IRLR model trained on noisy data is 0.7620 100
The accuracy of IRLR model trained on noisy data is 0.7620 110
The accuracy of IRLR model trained on noisy data is 0.7620 120
The accuracy of IRLR model trained on noisy data is 0.7620 150


In [9]:
#the flip rate 
rho1 = 0.4
rho0 = 0.2
probS = clf.predict_proba(X_train)
#get the weight of importance reweighting
weights = estimateBeta(y_train, probS, rho0, rho1)

In [18]:
#use importance reweighting to classifiy the results
clf = LogisticRegression().fit(X_train, y_train.flatten(), weights.flatten())
w = clf.coef_[0]
b = clf.intercept_

pred_Y = clf.predict(Xts)
acc = computeAccuracy(Yts,pred_Y)
print('The accuracy of IRLR model trained on noisy data is %.4f'%acc)

The accuracy of IRLR model trained on noisy data is 0.9080


In [21]:
#parameter tuning
rf = LogisticRegression() 
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
print(rf.get_params())

Parameters currently in use:

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': 1, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [20]:
from sklearn.model_selection import train_test_split
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
#tuning c in LR
accuarcy_c=[]
C = [0.001,0.01,0.1,1, 10, 100,1000]
for c in C:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    #X_train,y_train= RandomData(Xtr,Str)
    clf = LogisticRegression(C=c).fit(X_train, y_train, weights.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_c.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,c)

  y = column_or_1d(y, warn=True)


The accuracy of IRLR model trained on noisy data is 0.9090 0.001
The accuracy of IRLR model trained on noisy data is 0.9065 0.01
The accuracy of IRLR model trained on noisy data is 0.9095 0.1
The accuracy of IRLR model trained on noisy data is 0.9080 1
The accuracy of IRLR model trained on noisy data is 0.9040 10
The accuracy of IRLR model trained on noisy data is 0.9105 100
The accuracy of IRLR model trained on noisy data is 0.9070 1000


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
#tuning c in LR
accuarcy_penalty=[]
penalty = ["l1","l2"]
for i in penalty:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    #X_train,y_train= RandomData(Xtr,Str)
    clf = LogisticRegression(penalty=i).fit(X_train, y_train, weights.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_c.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,c)

  y = column_or_1d(y, warn=True)


In [11]:
# max_iter : int, default: 100
#Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge.
from sklearn.model_selection import train_test_split
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
#tuning c in LR
accuarcy_max_iter =[]
max_iter  = [10,50,80,100,110,120,150]
for i in max_iter:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    #X_train,y_train= RandomData(Xtr,Str)
    clf = LogisticRegression(max_iter =i).fit(X_train, y_train, weights.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_max_iter.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,i)

  y = column_or_1d(y, warn=True)


The accuracy of IRLR model trained on noisy data is 0.8915 10
The accuracy of IRLR model trained on noisy data is 0.9080 50
The accuracy of IRLR model trained on noisy data is 0.8985 80
The accuracy of IRLR model trained on noisy data is 0.9080 100
The accuracy of IRLR model trained on noisy data is 0.9085 120
The accuracy of IRLR model trained on noisy data is 0.9030 150


In [10]:
# max_iter : int, default: 100
#Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge.
from sklearn.model_selection import train_test_split
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
#tuning c in LR
accuarcy_max_iter =[]
max_iter  = [160,170,180,190,200]
for i in max_iter:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    weights = estimateBeta(y_train, probS, rho0, rho1)
    #X_train,y_train= RandomData(Xtr,Str)
    clf = LogisticRegression(max_iter =i).fit(X_train, y_train, weights.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_max_iter.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,i)

  y = column_or_1d(y, warn=True)


The accuracy of IRLR model trained on noisy data is 0.9080 160
The accuracy of IRLR model trained on noisy data is 0.9080 170
The accuracy of IRLR model trained on noisy data is 0.9065 180
The accuracy of IRLR model trained on noisy data is 0.9085 190
The accuracy of IRLR model trained on noisy data is 0.9090 200


In [12]:
# max_iter : int, default: 100
#Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge.
from sklearn.model_selection import train_test_split
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
#tuning c in LR
accuarcy_max_iter =[]
max_iter  = [110,105,110,120,130,140,150,160]
for i in max_iter:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    #X_train,y_train= RandomData(Xtr,Str)
    clf = LogisticRegression(max_iter =i).fit(X_train, y_train, weights.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_max_iter.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,i)


  y = column_or_1d(y, warn=True)


The accuracy of IRLR model trained on noisy data is 0.9090 110
The accuracy of IRLR model trained on noisy data is 0.9070 105
The accuracy of IRLR model trained on noisy data is 0.9090 110
The accuracy of IRLR model trained on noisy data is 0.9085 120
The accuracy of IRLR model trained on noisy data is 0.9070 130
The accuracy of IRLR model trained on noisy data is 0.9070 140
The accuracy of IRLR model trained on noisy data is 0.9030 150
The accuracy of IRLR model trained on noisy data is 0.9080 160


In [20]:
# ten-folds for the best combination
accuarcy_Best=[]
for i in range(10):
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    #X_train,y_train= RandomData(Xtr,Str)
    clf = LogisticRegression(C=100,max_iter =100).fit(X_train, y_train, weights.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_Best.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,i)

  y = column_or_1d(y, warn=True)


The accuracy of IRLR model trained on noisy data is 0.9105 0
The accuracy of IRLR model trained on noisy data is 0.9105 1
The accuracy of IRLR model trained on noisy data is 0.9105 2
The accuracy of IRLR model trained on noisy data is 0.9105 3
The accuracy of IRLR model trained on noisy data is 0.9105 4
The accuracy of IRLR model trained on noisy data is 0.9105 5
The accuracy of IRLR model trained on noisy data is 0.9105 6
The accuracy of IRLR model trained on noisy data is 0.9105 7
The accuracy of IRLR model trained on noisy data is 0.9105 8
The accuracy of IRLR model trained on noisy data is 0.9105 9


In [24]:
#estimate the flip rate
probS = clf.predict_proba(X_v)
x = 0
#the correct number of labels 0
correct_0 = 0
#the correct number of labels 1
correct_1 =0
#the wrong number of labels 0
wrong_0 = 0
#the wrong number of labels 1
wrong_1 = 0
for L0,L1 in probS:
    #print(L0,L1)
    label = y_v[x][0]
    #if label is 0, compare the problilty of estimated as 0
    if label == 0:
        if L1 >= 0.5:
            wrong_0+=1
        else:
            correct_0+=1
    #if label is 1, compare the problilty of estimated as 1
    if label == 1:
        if L0 >= 0.5:
            wrong_1+=1
        else:
            correct_1+=1
    x +=1 

  np.exp(prob, prob)


In [26]:
# p0(S = 1|Y = 0) = 0.2  p1(S=0|Y =1)=0.4
print(correct_0,correct_1,wrong_0,wrong_1)
print("p0:",wrong_1/(correct_0+wrong_1))
print("p1:",wrong_0/(correct_1+wrong_0))
#print(wrong_0/(correct_1+wrong_0),wrong_1/(correct_0+wrong_1))

771 585 437 207
p0: 0.2116564417177914
p1: 0.42759295499021527


In [12]:
#estimate the weight of unbiased estimator
def eval_weight(y, rho0, rho1):   
    rho_y = y * rho1 + (1 - y) * rho0 # \rho_{y}
    rho_y_flip = y * rho0 + (1 - y) * rho1 # \rho_{-y}
    w = (1 - rho_y_flip) / (1 - rho1 - rho0)
    w_flip = - rho_y / (1 - rho1 - rho0)
    return w, w_flip

In [27]:
rho1 = 0.4
rho0 = 0.2  
#split data
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)
#get the matrix of train and label
Xtr_hat = np.concatenate((X_train, X_train), axis=0) # [data; data].
Str_hat = np.concatenate((y_train, 1-y_train), axis=0) # [label; flipped-label].
print(Xtr_hat.shape, Str_hat.shape) 

W, W_flip = eval_weight(y_train, rho0, rho1)
W_hat = np.concatenate((W, W_flip), axis=0)
print(W_hat.shape)

# train LR classifier with unbiased estimator.
clf = LogisticRegression().fit(Xtr_hat, Str_hat.flatten(), W_hat.flatten())
w = clf.coef_[0]
b = clf.intercept_
pred_Y = clf.predict(Xts)
acc = computeAccuracy(Yts,pred_Y)


(16000, 784) (16000, 1)
(16000, 1)


In [23]:
#unbaised estimator methods with tuning the c parameter from 10 to 150
# max_iter : int, default: 100 
#Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge.
from sklearn.model_selection import train_test_split
#tuning c in LR
accuarcy_max_iter =[]
max_iter  = [10,50,80,100,110,120,150]
for i in max_iter:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    Xtr_hat = np.concatenate((X_train, X_train), axis=0) # [data; data].
    Str_hat = np.concatenate((y_train, 1-y_train), axis=0) # [label; flipped-label].
    W, W_flip = eval_weight(y_train, rho0, rho1)
    W_hat = np.concatenate((W, W_flip), axis=0)
    
    clf = LogisticRegression(max_iter =i).fit(Xtr_hat, Str_hat.flatten(), W_hat.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_max_iter.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,i)

The accuracy of IRLR model trained on noisy data is 0.8855 10
The accuracy of IRLR model trained on noisy data is 0.8995 50
The accuracy of IRLR model trained on noisy data is 0.9025 80
The accuracy of IRLR model trained on noisy data is 0.9005 100
The accuracy of IRLR model trained on noisy data is 0.9015 110
The accuracy of IRLR model trained on noisy data is 0.8965 120
The accuracy of IRLR model trained on noisy data is 0.9015 150


In [13]:
#unbaised estimator methods with tuning the c parameter from 160 to 200
from sklearn.model_selection import train_test_split
X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
#tuning c in LR
accuarcy_c=[]
C = [160,170,180,190,200]
for c in C:
    X_train, X_v, y_train, y_v = train_test_split(Xtr, Str, test_size=0.2, random_state=0,shuffle=True)  
    Xtr_hat = np.concatenate((X_train, X_train), axis=0) # [data; data].
    Str_hat = np.concatenate((y_train, 1-y_train), axis=0) # [label; flipped-label].
    W, W_flip = eval_weight(y_train, rho0, rho1)
    W_hat = np.concatenate((W, W_flip), axis=0)
    
    clf = LogisticRegression(C=c).fit(Xtr_hat, Str_hat.flatten(), W_hat.flatten())
    pred_Y = clf.predict(Xts)
    acc = computeAccuracy(Yts,pred_Y)
    accuarcy_c.append(acc)
    print('The accuracy of IRLR model trained on noisy data is %.4f'%acc,c)

The accuracy of IRLR model trained on noisy data is 0.9060 160
The accuracy of IRLR model trained on noisy data is 0.9090 170
The accuracy of IRLR model trained on noisy data is 0.9045 180
The accuracy of IRLR model trained on noisy data is 0.9025 190
The accuracy of IRLR model trained on noisy data is 0.9045 200
