Code for Reproducing Table 3 of the paper:

  **"Optimal Labeler Assignment and Sampling for Active Learning in the Presence of Imperfect Labels"**

---------------------------------

**Importing Packages:**

In [4]:
# Pinned to the commit this notebook last resolved to, so a re-run installs the same modAL code.
# %pip (rather than a bare/shell pip) installs into the environment of the running kernel.
%pip install git+https://github.com/modAL-python/modAL.git@bba6f6fd00dbb862b1e09259b78caf6cffa2e755

Collecting git+https://github.com/modAL-python/modAL.git
  Cloning https://github.com/modAL-python/modAL.git to /tmp/pip-req-build-_vd141g2
  Running command git clone --filter=blob:none --quiet https://github.com/modAL-python/modAL.git /tmp/pip-req-build-_vd141g2
  Resolved https://github.com/modAL-python/modAL.git to commit bba6f6fd00dbb862b1e09259b78caf6cffa2e755
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting skorch==0.9.0 (from modAL-python==0.4.2)
  Downloading skorch-0.9.0-py3-none-any.whl.metadata (8.3 kB)
Downloading skorch-0.9.0-py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.8/125.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: modAL-python
  Building wheel for modAL-python (setup.py) ... [?25l[?25hdone
  Created wheel for modAL-python: filename=modAL_python-0.4.2-py3-none-any.whl size=32647 sha256=5fd0b930ddb6207b1cde51f0057f76e0ee1b8de51279daaf1812fafbe6894cde
  Stor

In [5]:
import copy
import random
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import entropy
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling
from sklearn.model_selection import train_test_split

**Defining Required Functions:**

In [6]:
def data_read(data):
    """Load one of the supported data sets from files in the working directory.

    Parameters
    ----------
    data : str
        Name of the data set: 'Heart Statlog', 'Ionosphere', 'Sonar' or
        'Spambase'.

    Returns
    -------
    X : numpy.ndarray
        The feature columns of the file.
    y : numpy.ndarray
        The label column. For 'Ionosphere' ('g' -> 0, 'b' -> 1) and 'Sonar'
        ('M' -> 0, 'R' -> 1) it is recoded to integers; for 'Heart Statlog'
        it is shifted down by one; for 'Spambase' it is returned as stored.

    Raises
    ------
    ValueError
        If `data` is not one of the supported names (previously this printed
        a message and then failed with an UnboundLocalError).
    """
    if data == 'Heart Statlog':
        # Statlog (Heart), UCI: space-delimited, 13 feature columns + label
        table = np.asarray(pd.read_csv('heart.dat', header=None, delimiter=' '))
        X = table[:, :13]
        # presumably the file stores the classes as 1/2 -- TODO confirm
        y = table[:, 13] - 1
    elif data == 'Ionosphere':
        # Ionosphere, UCI: 34 feature columns + 'g'/'b' label
        table = np.asarray(pd.read_csv('ionosphere.data', header=None))
        X = table[:, 0:34]
        y = table[:, 34]
        y[y == 'g'] = 0
        y[y == 'b'] = 1
        y = y.astype('int')
    elif data == 'Sonar':
        # Sonar, UCI: 60 feature columns + 'M'/'R' label
        table = np.asarray(pd.read_csv('sonar.all-data', header=None))
        X = table[:, 0:60]
        y = table[:, 60]
        y[y == 'M'] = 0
        y[y == 'R'] = 1
        y = y.astype('int')
    elif data == 'Spambase':
        # Spambase, UCI: 57 feature columns + label
        table = np.asarray(pd.read_csv('spambase.data', header=None))
        X = table[:, :57]
        y = table[:, 57]
    else:
        # Fail loudly with the offending name instead of falling through to
        # a `return` of variables that were never assigned.
        raise ValueError("Data set not found: {!r}".format(data))
    return X, y

def noisy_RLA(y_true, model_entropy_query):
    """Label a query batch using randomly assigned labelers.

    Each sample is given a labeler drawn uniformly (without replacement) from
    a copy of the module-level ``A_boosted`` list. The labeler's noise on the
    sample is ``epsilon(accuracy, entropy)``; when it exceeds the module-level
    ``alpha`` the true label is replaced by a randomly chosen other class.

    Parameters
    ----------
    y_true : sequence
        True labels of the queried samples.
    model_entropy_query : sequence
        Model entropy of each queried sample, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(y_noisy, max_noise, flip_ratio, average_noise, noise_list)``.
        ``flip_ratio`` and ``average_noise`` are normalised by the
        module-level ``budget``, not by ``len(y_true)``.
    """
    available_labelers = copy.copy(A_boosted)
    candidate_classes = list(range(c))

    y_noisy = []
    noise_list = []
    max_noise = 0
    noise_sum = 0
    flip_count = 0

    for position, true_label in enumerate(y_true):
        # Draw one of the labelers that has not been used in this batch yet.
        pick = np.random.randint(0, len(available_labelers))
        labeler_accuracy = available_labelers.pop(pick)

        noise = epsilon(labeler_accuracy, model_entropy_query[position])
        max_noise = max(max_noise, noise)
        noise_sum = noise_sum + noise
        noise_list.append(noise)

        # Noise within the alpha tolerance: the labeler answers correctly.
        if alpha >= noise:
            y_noisy.append(true_label)
            continue

        # Otherwise the label is flipped to some other class.
        wrong_labels = [label for label in candidate_classes if label != true_label]
        y_noisy.append(np.random.choice(wrong_labels))
        flip_count += 1

    return y_noisy, max_noise, flip_count / budget, noise_sum / budget, noise_list
######################################################################################
def noisy_OLA(y_true, model_entropy_query):
    """Label a query batch, matching labelers to samples by entropy rank.

    The batch entropies are sorted in ascending order. A sample whose entropy
    has rank ``k`` among the samples not yet labeled is given the labeler at
    position ``k`` within the last ``len(y_true)`` remaining entries of a copy
    of the module-level ``A_boosted`` list (presumably sorted by ascending
    accuracy, so more uncertain samples get more accurate labelers -- verify
    against where ``A_boosted`` is built). The noise/flip rule is the same as
    in ``noisy_RLA``: the label is flipped when ``epsilon(accuracy, entropy)``
    exceeds the module-level ``alpha``.

    Parameters
    ----------
    y_true : sequence
        True labels of the queried samples.
    model_entropy_query : array-like
        Model entropy of each queried sample, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(y_noisy, max_noise, flip_ratio, average_noise, noise_list)``.
        ``flip_ratio`` and ``average_noise`` are normalised by the
        module-level ``budget``, not by ``len(y_true)``.
    """
    entropy_ranking = np.sort(model_entropy_query).tolist()
    labeler_pool = copy.copy(A_boosted)
    # Only the tail of the labeler list is used when it is longer than the batch.
    offset = len(labeler_pool) - len(y_true)
    candidate_classes = list(range(c))

    y_noisy = []
    noise_list = []
    max_noise = 0
    noise_sum = 0
    flip_count = 0

    for position, true_label in enumerate(y_true):
        sample_entropy = model_entropy_query[position]

        # Rank of this sample among the entropies still waiting for a labeler;
        # both lists shrink together so ranks and labeler positions stay aligned.
        rank = entropy_ranking.index(sample_entropy)
        del entropy_ranking[rank]
        labeler_accuracy = labeler_pool.pop(rank + offset)

        noise = epsilon(labeler_accuracy, sample_entropy)
        max_noise = max(max_noise, noise)
        noise_sum = noise_sum + noise
        noise_list.append(noise)

        # Noise within the alpha tolerance: the labeler answers correctly.
        if alpha >= noise:
            y_noisy.append(true_label)
            continue

        # Otherwise the label is flipped to some other class.
        wrong_labels = [label for label in candidate_classes if label != true_label]
        y_noisy.append(np.random.choice(wrong_labels))
        flip_count += 1

    return y_noisy, max_noise, flip_count / budget, noise_sum / budget, noise_list
######################################################################################
def OLAS(U, U_entropy, y, beta):
    """Jointly pick the samples to query and the labeler for each of them.

    The pool is ordered from highest to lowest entropy. Labelers are visited
    from most to least accurate; each one scans forward from the first row
    not yet handed out, stops at the first row whose noise
    ``epsilon(accuracy, entropy)`` is at most ``beta``, and labels that row
    plus the rows that follow it, up to ``Cap`` rows in total. Rows skipped
    during the scan are not revisited in this call.

    Besides its arguments the function reads the module-level names ``c``,
    ``A``, ``M``, ``Cap``, ``alpha`` and ``epsilon``.

    Parameters
    ----------
    U : sequence
        Indices of the unlabeled pool; they are used to index ``y``.
    U_entropy : sequence
        Model entropy of each pool sample, aligned with ``U``.
    y : array-like
        True labels, indexed by the values stored in ``U``.
    beta : float
        Largest noise a labeler may have on the first sample of its batch.

    Returns
    -------
    tuple
        ``(query_indices, y_noisy, flip_ratio, average_noise)`` where
        ``query_indices`` holds values taken from ``U`` and the two ratios
        are normalised by ``len(query_indices)``.

    NOTE(review): when no row satisfies the ``beta`` test for any labeler,
    ``query_indices`` stays empty and the two divisions at the end raise
    ZeroDivisionError.
    """
    # Pair each pool index with its entropy and sort so the most uncertain samples come first.
    df = pd.DataFrame(list(zip(U, U_entropy)), columns =['index', 'entropy'])
    df = df.sort_values(by=['entropy'], ascending=False)
    # Position (in the sorted frame) of the last row already given to a labeler.
    last_query = -1
    class_list=list(range(c))
    # Labeler accuracies, best first.
    A_list = sorted(A, reverse = True)
    query_indices = []
    y_noisy = []
    total_noise = 0
    total_flip = 0
    m = -1
    # flag is set to 0 once the end of the pool has been assigned; that ends the outer loop.
    flag = 1
    while m < M-1 and flag == 1:
        m = m + 1
        acc = A_list[m]
        # Scan the not-yet-assigned rows for the first one this labeler is accurate enough for.
        for i in np.arange(last_query + 1, len(U)):
            if epsilon(acc, df.iloc[i,1]) <= beta:
                # The batch ends Cap rows later, or at the end of the pool if that comes first.
                last_query = min(i + Cap -1, len(U)-1)
                if last_query == len(U)-1 :
                    flag = 0
                for idx in np.arange(i, min(i + Cap, len(U))):
                    query_indices.append(df.iloc[idx,0])
                    noise = epsilon(acc, df.iloc[idx,1])
                    total_noise = total_noise + noise
                    if alpha >= noise: # noise within the alpha tolerance: keep the true label
                        y_noisy.append(y[df.iloc[idx,0]])
                    else: # too noisy: replace the true label with a randomly chosen other class
                        new_list = [a for a in class_list if a!=y[df.iloc[idx,0]]]
                        y_noisy.append(np.random.choice(new_list))
                        total_flip = total_flip + 1
                break # this labeler's batch is done; continue with the next labeler
    average_noise = total_noise/len(query_indices)
    flip_ratio = total_flip/len(query_indices)
    return query_indices, y_noisy, flip_ratio, average_noise
######################################################################################
def _fresh_learner(L_init):
    """Build a new ActiveLearner trained on the initial labeled samples."""
    return ActiveLearner(estimator=RandomForestClassifier(random_state=0),
                         query_strategy=entropy_sampling,
                         X_training=X_train[L_init],
                         y_training=y_train[L_init])


def _remaining_pool(I, L):
    """Return the members of I that are not in L, keeping the order of I."""
    labeled = set(L)  # set lookup instead of a list scan per candidate
    return [each for each in I if each not in labeled]


def _test_metrics(learner, flip_ratios):
    """Score the learner on the test split: (F1, precision, recall, accuracy, mean flip ratio)."""
    pred_test = learner.predict(X_test)
    return (f1_score(y_test, pred_test, average=avg_method),
            precision_score(y_test, pred_test, average=avg_method),
            recall_score(y_test, pred_test, average=avg_method),
            accuracy_score(y_test, pred_test),
            np.mean(flip_ratios))


def _run_sampling_strategy(I, L_init, U_init, use_entropy_sampling, assign_labels):
    """Run n_cycles of query -> noisy labeling -> teach for one sampler/labeler pairing.

    `use_entropy_sampling` selects the learner's entropy query (True) or
    uniform random sampling (False); `assign_labels` is noisy_RLA or noisy_OLA.
    """
    learner = _fresh_learner(L_init)
    L = L_init
    U = U_init
    flip_ratios = []
    for _ in range(n_cycles):
        X_pool = X_train[U]
        if use_entropy_sampling:
            query_indices, _query_samples = learner.query(X_pool, n_instances=int(budget))
        else:
            # replace=False: a pool sample must not be queried twice in one batch
            # (np.random.choice samples WITH replacement by default). This needs
            # at least int(budget) samples left in the pool.
            query_indices = np.random.choice(len(U), size=int(budget), replace=False)
        # query_indices are positions inside the pool U, so they index X_pool,
        # never X_train directly.
        X_query = X_pool[query_indices]
        model_entropy_query = entropy(learner.predict_proba(X_query).T)
        y_true = y_train[U][query_indices]
        y_noisy, _max_noise, flip_ratio, _average_noise, _noise_list = assign_labels(y_true, model_entropy_query)
        flip_ratios.append(flip_ratio)
        # teach the learner with the (possibly wrong) labels
        learner.teach(X=X_query, y=y_noisy)
        # move the queried samples from the pool to the labeled set
        L = [*L, *np.array(U)[query_indices]]
        U = _remaining_pool(I, L)
    return _test_metrics(learner, flip_ratios)


def _run_olas_strategy(I, L_init, U_init):
    """Run n_cycles in which OLAS chooses both the queries and their labelers."""
    learner = _fresh_learner(L_init)
    L = L_init
    U = U_init
    flip_ratios = []
    for _ in range(n_cycles):
        U_entropy = entropy(learner.predict_proba(X_train[U]).T)
        # OLAS returns indices into X_train (values of U), not positions in U.
        query_indices, y_noisy, flip_ratio, _average_noise = OLAS(U, U_entropy, y_train, beta)
        flip_ratios.append(flip_ratio)
        learner.teach(X=X_train[query_indices], y=y_noisy)
        L = [*L, *query_indices]
        U = _remaining_pool(I, L)
    return _test_metrics(learner, flip_ratios)


def repeat():
    """Run one replication of all five strategies from a shared initial labeled set.

    The initial set holds `init_number` randomly chosen samples with label 1
    and `init_number` with label 0. From it, five learners are trained
    independently: RS+RLA, RS+OLA, ES+RLA, ES+OLA (random / entropy sampling
    combined with noisy_RLA / noisy_OLA) and OLAS.

    Reads the module-level names X_train, y_train, X_test, y_test, n_train,
    init_number, n_cycles, budget, beta and avg_method.

    Changes with respect to the previous implementation:
      * ES+OLA computed the query entropies on X_train[query_indices] although
        query_indices are positions in the pool U; it now uses the queried
        pool samples, like the other strategies.
      * Random sampling drew the batch with replacement, so one sample could
        be queried several times per cycle; it now samples without replacement.

    Returns
    -------
    tuple
        25 values, grouped by metric and ordered by strategy within each
        group (RS_RLA, RS_OLA, ES_RLA, ES_OLA, OA): five F1 scores, five
        precisions, five recalls, five accuracies and five mean flip ratios.
    """
    ## initialization: start with init_number pos samples and init_number negative samples
    I = list(np.arange(n_train))
    I_pos = [i for i in I if y_train[i] == 1]
    I_neg = [i for i in I if y_train[i] == 0]
    L_init_pos = random.sample(I_pos, init_number)
    L_init_neg = random.sample(I_neg, init_number)
    L_init = [*L_init_pos, *L_init_neg]
    U_init = _remaining_pool(I, L_init)

    # One (F1, precision, recall, accuracy, ANR) tuple per strategy, in output order.
    per_strategy = [
        _run_sampling_strategy(I, L_init, U_init, False, noisy_RLA),  # RS + RLA
        _run_sampling_strategy(I, L_init, U_init, False, noisy_OLA),  # RS + OLA
        _run_sampling_strategy(I, L_init, U_init, True, noisy_RLA),   # ES + RLA
        _run_sampling_strategy(I, L_init, U_init, True, noisy_OLA),   # ES + OLA
        _run_olas_strategy(I, L_init, U_init),                        # OLAS
    ]

    # Transpose to metric-major order: all F1 scores, then all precisions, ...
    return tuple(value for metric_values in zip(*per_strategy) for value in metric_values)



**Running AL:**

In [None]:
## Data sets can be downloaded from: https://archive.ics.uci.edu/
Data_Sets = ['Spambase', 'Heart Statlog' ,'Ionosphere','Sonar']
Noise_Models = ['NM1']

## parameters
replication_num = 100
init_number = 3 # pick 3 from each class to initialize classifier
n_cycles = 10
alpha = 0.2
beta = 0.2
# Seed for NumPy's and Python's global generators, applied once per data set so
# a run can be reproduced. Set to None to seed from OS entropy instead.
SEED = 42


def _clamp_entropy(e):
    """Clamp an entropy value to the interval [0, 1]."""
    if e <= 0:
        return 0
    if e >= 1:
        return 1
    return e


def epsilon_nm1(a, e):
    """Noise model 1: clamped entropy e scaled by the labeler's error rate (1 - a)."""
    return _clamp_entropy(e)*(1-a)


def epsilon_nm2(a, e):
    """Noise model 2: curve in the accuracy a whose exponent depends on the clamped entropy e."""
    e = _clamp_entropy(e)
    h = 0.4*e + 0.3
    if 0 <= e <= 0.5:
        p = 2*h
        return (1-a**p)**(1/p)
    p = 2*(1-h)
    return 1-(1-(1-a)**p)**(1/p)


NOISE_FUNCTIONS = {'NM1': epsilon_nm1, 'NM2': epsilon_nm2}

# Reject unknown noise models before any experiment runs, instead of silently
# reusing a stale `epsilon` (or failing with a NameError) deep inside a run.
unknown_models = [name for name in Noise_Models if name not in NOISE_FUNCTIONS]
if unknown_models:
    raise ValueError("Unknown noise model(s): {}".format(unknown_models))

# Column names of the result table, in the order repeat() returns its values:
# grouped by metric, ordered by strategy within each group.
RESULT_COLUMNS = ['{} {}'.format(metric, strategy)
                  for metric in ('F1', 'precision', 'recall', 'accuracy', 'anr')
                  for strategy in ('RS_RLA', 'RS_OLA', 'ES_RLA', 'ES_OLA', 'OA')]


for data in Data_Sets:
    ### reading data
    X, y = data_read(data)
    ### data split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state=42)
    c = len(set(y_train))
    if c == 2:
        avg_method = 'binary'
    else:
        avg_method = 'macro'
    n_train = X_train.shape[0]
    ################ define dynamic parameters
    # Per-cycle budget: 70% of the initially unlabeled pool, split over the cycles.
    budget = np.ceil(0.7*(n_train - 2*init_number) / n_cycles)
    Cap = int(np.floor(np.sqrt(budget)))   # samples one labeler can label per cycle
    M = int(np.ceil(budget / Cap))         # number of labelers
    ### accuracy list
    np.random.seed(SEED)
    random.seed(SEED)
    A = np.random.uniform(low=0.5, high=0.95, size=M)
    A = np.sort(A)
    # Each labeler's accuracy repeated Cap times, kept in ascending order.
    A_boosted = np.array([[a]*Cap for a in A])
    A_boosted = np.reshape(A_boosted, (M*Cap))
    A_boosted = A_boosted.tolist()
    ############################################
    for nm in Noise_Models:
        ### the noise function used by noisy_RLA, noisy_OLA and OLAS
        epsilon = NOISE_FUNCTIONS[nm]

        # Collect one row per replication and build the DataFrame once at the end
        results_list = []

        for r in range(replication_num):
            res = repeat()
            row = {'replication': r + 1}
            row.update(zip(RESULT_COLUMNS, res))
            results_list.append(row)

        # Convert results_list to a DataFrame
        results = pd.DataFrame(results_list)
        results.to_excel('{}_{}.xlsx'.format(data, nm), index=False)

        # Record the settings this result file was produced with
        settings = pd.DataFrame([{
            'alpha': alpha, 'beta': beta, 'budget': budget, 'M': M, 'capacity': Cap
        }])
        settings.to_excel('{}_{}_Settings.xlsx'.format(data, nm), index=False)


        print("Data set {} with noise model {} is done!".format(data,nm))

**Outputting Results:**

In [None]:
Data_Sets = ['Heart Statlog' ,'Ionosphere','Sonar','Spambase'] #'Heart Statlog' ,'Ionosphere','Sonar','Spambase'
Noise_Models = ['NM1']

# (label printed in the table row, column of the results file holding its F1 scores)
F1_ROWS = [
    ('RS_RLA', 'F1 RS_RLA'),
    ('RS_OLA', 'F1 RS_OLA'),
    ('ES_RLA', 'F1 ES_RLA'),
    ('ES_OLA', 'F1 ES_OLA'),
    ('OLAS', 'F1 OA'),
]

# Print "mean \pm std" of the F1 score over the replications, as LaTeX math, per strategy.
for nm in Noise_Models:
    print("________________ noise model:",nm)
    for data in Data_Sets:
        print("*********data set: ",data)
        df = pd.read_excel('{}_{}.xlsx'.format(data, nm))
        for label, column in F1_ROWS:
            # Raw string: "\p" is not a valid escape sequence in a normal string literal.
            print(r"{}:   ${} \pm {}$ ".format(label, round(df[column].mean(),3), round(df[column].std(),3)))
