# HLML: Project
Dennis Agafonov (12528269)

This code is the combined work of Dennis Agafonov and Jelke Matthijsse.

Import all necessary packages and libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations

# sklearn imports for classification
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# For query-by-committee train different models
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

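Load scikit-learn's built-in handwritten-digits dataset (a small, MNIST-like set of 8×8 digit images) and split it into training, validation and test sets.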

In [None]:
# scikit-learn's bundled handwritten-digits dataset (small, MNIST-like)
digits = load_digits()

# 60-20-20 split: hold out 40% of the data first ...
X_train, X_temp, y_train, y_temp = train_test_split(
    digits.data,
    digits.target,
    test_size=0.40,
    random_state=42,
)
# ... then cut the held-out part in half: validation and test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
def compute_acc(x, y, x_eval=None, y_eval=None):
    '''
    Function that fits a logistic regression model on the given data and
    computes its accuracy on held-out evaluation data

    :param x: features of the datapoints to train on
    :param y: corresponding labels of the training datapoints
    :param x_eval: features to evaluate on; when omitted, the global validation
                   features X_val are used
    :param y_eval: labels to evaluate on; when omitted, the global validation
                   labels y_val are used

    :return: accuracy of the trained model on the evaluation data, and the trained model.
    :raises ValueError: if only one of x_eval / y_eval is given
    '''
    # features and labels of the evaluation set must come from the same source
    if (x_eval is None) != (y_eval is None):
        raise ValueError('x_eval and y_eval must be given together')
    if x_eval is None:
        x_eval, y_eval = X_val, y_val

    # initialize and fit logistic regression on data
    target_clf = LogisticRegression(solver='saga', max_iter=10000, random_state=42)
    target_clf.fit(x, y)

    # predict labels of the evaluation data and score them
    y_pred = target_clf.predict(x_eval)
    acc = accuracy_score(y_eval, y_pred)
    return acc, target_clf

Computing the acquisition score for each point:

In [None]:
def get_entropy_score(point, model):
    '''
    Function that computes the entropy score of a datapoint

    :param point: (features, label) tuple; only the features (point[0]) are used
    :param model: fitted classifier that exposes predict_proba

    :return: entropy (natural log) of the class distribution predicted for the point
    '''
    # the model expects a 2D input, so turn the single point into one row
    x = np.asarray(point[0]).reshape(1, -1)
    prob = np.asarray(model.predict_proba(x)[0])

    # Classes with probability exactly 0 contribute nothing to the entropy
    # (p * log(p) -> 0 as p -> 0). Multiplying 0 by log(0) = -inf instead
    # yields NaN, which would make the ranking of points meaningless.
    nonzero = prob > 0
    entropy = -1 * np.sum(prob[nonzero] * np.log(prob[nonzero]))

    return entropy

def get_qbc_score(point, model1, model2, model3, model4):
    '''
    Function that computes the query-by-committee score of a datapoint

    :param point: (features, label) tuple; only the features (point[0]) are used
    :param model1: first fitted committee member
    :param model2: second fitted committee member
    :param model3: third fitted committee member
    :param model4: fourth fitted committee member

    :return: number of pairs of committee members that predict a different label
    '''
    # single point -> one-row 2D input
    features = point[0].reshape(1, -1)

    # let every committee member vote on the label of the point
    committee = (model1, model2, model3, model4)
    votes = [member.predict(features) for member in committee]

    # count the pairs of members whose votes differ
    return sum(1 for vote_a, vote_b in combinations(votes, 2) if vote_a != vote_b)

The following function trains the committee of models that is necessary to compute the Query-by-Committee acquisition score.

In [None]:
def fit_models(dataset):
    '''
    Function that (re)trains the committee of models on the current dataset

    :param dataset: list of (features, label) tuples to train on

    :return: the four fitted models, in the order SVM, naive Bayes,
             decision tree, logistic regression
    '''
    # split the (features, label) pairs into separate sequences
    features, labels = zip(*dataset)

    committee = (
        # support-vector-machine on standardized features
        make_pipeline(StandardScaler(), SVC(gamma='auto', kernel='rbf', random_state=42)),
        # naive bayes classifier
        GaussianNB(),
        # decision-tree model
        DecisionTreeClassifier(random_state=42),
        # logistic regression model
        LogisticRegression(solver='saga', max_iter=10000, random_state=42),
    )

    # every member is trained on exactly the same data
    for member in committee:
        member.fit(features, labels)

    return committee

In [None]:
def get_datapoints(pool, curr_model, acquisition, batch_size, model1, model2, model3, model4):
    '''
    Function that returns the datapoints with the highest uncertainty

    :param pool: current pool set (list of (features, label) tuples) from which datapoints
                 can be chosen; the chosen points are removed from this list in place
    :param curr_model: current model trained on dataset from previous iteration
                       (used to score candidates when acquisition is 'entropy')
    :param acquisition: str that says which acquisition function to use:
                        'entropy' (uncertainty sampling) or 'qbc' (query-by-committee)
    :param batch_size: number of datapoints that are extracted
    :param model1: first committee member (only used when acquisition is 'qbc')
    :param model2: second committee member (only used when acquisition is 'qbc')
    :param model3: third committee member (only used when acquisition is 'qbc')
    :param model4: fourth committee member (only used when acquisition is 'qbc')

    :return: batch_size number of datapoints (fewer if the pool is smaller),
             and the updated poolset
    :raises ValueError: if acquisition is not 'entropy' or 'qbc'
    '''
    # pick the scoring function once, and fail early on an unknown name
    # instead of hitting an unbound variable inside the loop
    if acquisition == 'entropy':
        def score_point(point):
            return get_entropy_score(point, curr_model)
    elif acquisition == 'qbc':
        def score_point(point):
            return get_qbc_score(point, model1, model2, model3, model4)
    else:
        raise ValueError(
            f"Unknown acquisition function {acquisition!r}; expected 'entropy' or 'qbc'"
        )

    # uncertainty score of every point in the pool, indexed like the pool
    scores = [score_point(point) for point in pool]

    # indices of the batch_size highest-scoring points; the sort is stable,
    # so points with equal scores keep their pool order
    ranked = sorted(range(len(pool)), key=lambda idx: scores[idx], reverse=True)
    max_point_idx = ranked[:batch_size]

    # corresponding datapoints of the selected indices
    datapoints = [pool[i] for i in max_point_idx]

    # remove from the back so that the remaining indices stay valid
    for index in sorted(max_point_idx, reverse=True):
        pool.pop(index)

    return datapoints, pool

In [None]:
def active_learning(acquisition, batch_size, nr_labeled, retrain_it,
                    moving_avg_window=7, moving_avg_threshold=0.0001):
    '''
    Function that runs the active-learning loop on the global training set

    Starting from the first nr_labeled training points, the loop repeatedly
    trains a classifier on the current labeled set, records its validation
    accuracy and moves the batch_size most uncertain pool points into the
    labeled set. It stops when the moving average of the validation accuracy
    plateaus or when the pool is empty.

    :param acquisition: str that says which acquisition function to use ('entropy' or 'qbc')
    :param batch_size: number of datapoints moved from the pool to the labeled set per iteration
    :param nr_labeled: number of training points that form the initial labeled set
    :param retrain_it: the committee is retrained every retrain_it iterations (only for 'qbc')
    :param moving_avg_window: number of most recent validation accuracies that are averaged
    :param moving_avg_threshold: stop when the moving average changes by less than this

    :return: list of validation accuracies (one per trained classifier), and the
             classifier trained on the final labeled set
    '''
    train = list(zip(X_train, y_train))
    current_dataset = train[:nr_labeled]
    pool_set = train[nr_labeled:]

    # the committee is only consulted by the QBC acquisition function,
    # so do not spend time fitting it for uncertainty sampling
    use_committee = acquisition == 'qbc'
    if use_committee:
        model1, model2, model3, model4 = fit_models(current_dataset)
    else:
        model1 = model2 = model3 = model4 = None

    val_accuracies = []
    prev_moving_avg = 0  # initialize previous moving average
    iteration = 0
    while True:
        # Train and evaluate on everything labeled so far. Doing this before
        # the stopping checks guarantees that the returned classifier always
        # exists and has seen the last batch that was added.
        current_X, current_y = zip(*current_dataset)
        val_acc, current_clf = compute_acc(current_X, current_y)
        val_accuracies.append(val_acc)

        # Compute moving average
        if len(val_accuracies) >= moving_avg_window:
            current_moving_avg = sum(val_accuracies[-moving_avg_window:]) / moving_avg_window
            if abs(current_moving_avg - prev_moving_avg) < moving_avg_threshold:
                print('Validation accuracy plateaued; stopping!')
                break
            prev_moving_avg = current_moving_avg

        if not pool_set:
            print('Ran out of points in pool set; stopping!')
            break

        new_points, pool_set = get_datapoints(pool_set, current_clf, acquisition, batch_size, model1, model2, model3, model4)
        current_dataset = current_dataset + new_points

        iteration += 1
        if use_committee and iteration % retrain_it == 0:
            model1, model2, model3, model4 = fit_models(current_dataset)

    return val_accuracies, current_clf

In [None]:
# per batch size: validation-accuracy curve and final test accuracy,
# for query-by-committee (qbc) and entropy-based uncertainty sampling (ent)
val_accs_qbc, test_accs_qbc = [], []
val_accs_ent, test_accs_ent = [], []
batch_sizes = [1, 5, 10, 15, 20, 50, 100]

for batch_size in batch_sizes:
    # query-by-committee run, scored on the test set
    val_acc_qbc, final_model_qbc = active_learning('qbc', batch_size, nr_labeled=40, retrain_it=5)
    test_score_qbc = final_model_qbc.score(X_test, y_test)

    # uncertainty-sampling run, scored on the test set
    val_acc_ent, final_model_ent = active_learning('entropy', batch_size, nr_labeled=40, retrain_it=5)
    test_score_ent = final_model_ent.score(X_test, y_test)

    # store the results of both runs
    val_accs_qbc.append(val_acc_qbc)
    test_accs_qbc.append(test_score_qbc)
    val_accs_ent.append(val_acc_ent)
    test_accs_ent.append(test_score_ent)

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

# one panel per acquisition function: (axis, accuracy curves, title)
panels = (
    (ax1, val_accs_qbc, "Validation accuracy for each batch size (AF: QBC)"),
    (ax2, val_accs_ent, "Validation accuracy for each batch size (AF: US)"),
)

for axis, curves, title in panels:
    # one line per batch size
    for sublist, batch_size in zip(curves, batch_sizes):
        axis.plot(sublist, label=f"Batch size {batch_size}", marker='o')
    axis.set_title(title)
    axis.set_xlabel("Iteration")
    axis.set_ylabel("Accuracy")
    axis.legend()

plt.tight_layout()
plt.show()

In [None]:
# final test accuracy per batch size: first QBC, then uncertainty sampling
print(test_accs_qbc, test_accs_ent, sep="\n")