In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from modAL.models import ActiveLearner, CommitteeRegressor
from modAL.disagreement import vote_entropy_sampling, max_std_sampling
import matplotlib.pyplot as plt
import torch
from torch.nn import MSELoss
from skorch.regressor import NeuralNetRegressor
from copy import deepcopy
from skorch.dataset import ValidSplit
from skorch.callbacks import EarlyStopping
from sklearn.cluster import DBSCAN, OPTICS, cluster_optics_dbscan, Birch, SpectralClustering
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics
import matplotlib.gridspec as gridspec
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from torch.nn import Linear
from torch.nn import Sigmoid, ReLU
from torch.nn import Module
from sklearn.metrics import r2_score
import copy
from scipy.spatial import distance
from scipy.spatial.distance import euclidean
from torch import tensor
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import cdist
from sklearn.ensemble import RandomForestRegressor
from torch.nn.init import xavier_uniform_
from model import MLP
import random
from skorch.callbacks import EarlyStopping, LRScheduler, Freezer, Unfreezer
from torch.optim.lr_scheduler import CyclicLR, ReduceLROnPlateau
from skorch.regressor import NeuralNetRegressor, NeuralNet
from modAL.utils.selection import multi_argmax
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [2]:
# Experiment configuration. These module-level names are read by main_function.
query_number = 20        # Number of active-learning query rounds in each experiment
iteration = 20           # Total number of experiments (independent runs)
batch_size = 40          # Number of pool samples queried per active-learning round
total_initail_size = 40  # Total size of the initial labeled set (author's note: 96); misspelled name kept (sic) because main_function reads it
initial_size = 5         # Number of samples drawn in the first initialization stage

# Raw string literals keep these Windows-style relative paths byte-identical
# while avoiding the invalid escape sequences "\." and "\s".
seed_rf = np.load(file=r"..\..\seed2.npy")
seed_initial = np.load(file=r"..\..\seed3.npy")

seed_nn1 = np.load(file=r"..\..\seed4.npy")
# NOTE(review): seed_nn2 loads the same file as seed_nn1 (seed4.npy) — confirm
# this is intended and not a copy-paste slip.
seed_nn2 = np.load(file=r"..\..\seed4.npy")

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def get_closest_dist(unlabeled_data, labeled_data):
    """Return, for every unlabeled row, its distance to the nearest labeled row.

    Distances are Euclidean and computed against the full labeled set passed
    in (global information rather than per-cluster information).

    Args:
        unlabeled_data: 2-D array-like, one row per unlabeled sample.
        labeled_data: 2-D array-like, one row per labeled sample.

    Returns:
        1-D array with one minimum distance per row of ``unlabeled_data``.
    """
    pairwise = cdist(unlabeled_data, labeled_data, metric='euclidean')
    return np.min(pairwise, axis=1)

def get_closest_dist_xy(unlabeled_predictions, labeled_data_label, unlabeled_data, labeled_data):
    """Return the smallest feature-distance x label-distance product per unlabeled row.

    For every (unlabeled, labeled) pair the Euclidean distance between the
    feature rows is multiplied element-wise by the Euclidean distance between
    the predicted label and the known label; the minimum over the labeled
    samples is returned for each unlabeled sample.

    Args:
        unlabeled_predictions: predictions for the unlabeled samples; reshaped
            here into a single column.
        labeled_data_label: 2-D array of known labels (passed to ``cdist``
            unchanged, so it must already be two-dimensional).
        unlabeled_data: 2-D array of unlabeled feature rows.
        labeled_data: 2-D array of labeled feature rows.

    Returns:
        1-D array with one minimum product per row of ``unlabeled_data``.
    """
    prediction_column = np.array(unlabeled_predictions).reshape(-1, 1)
    label_gap = cdist(prediction_column, labeled_data_label, metric='euclidean')
    feature_gap = cdist(unlabeled_data, labeled_data, metric='euclidean')
    combined_gap = feature_gap * label_gap
    return combined_gap.min(axis=1)

def custom_query_strategy(used_data, X_pool, prediction_, used_label, n_instances):
    """Select the pool samples with the largest combined feature/label distance.

    The utility of each pool sample is the value computed by
    ``get_closest_dist_xy`` from its prediction and features against the
    already labeled data; the selection itself is delegated to
    ``multi_argmax``.

    Args:
        used_data: feature rows of the labeled samples.
        X_pool: feature rows of the unlabeled pool.
        prediction_: model predictions for the rows of ``X_pool``.
        used_label: labels of the labeled samples.
        n_instances: number of pool samples to select.

    Returns:
        Whatever ``multi_argmax(utility, n_instances=n_instances)`` returns.
    """
    utility = get_closest_dist_xy(
        unlabeled_predictions=prediction_,
        labeled_data_label=used_label,
        unlabeled_data=X_pool,
        labeled_data=used_data,
    )
    return multi_argmax(utility, n_instances=n_instances)

In [4]:
def _score_random_forest(rf_model, fit_X, fit_y, X_test, y_test):
    """Refit the evaluation forest on the labeled set and score it on train and test data.

    Each prediction is computed once and reused for both R2 and MSE.

    Returns:
        dict mapping the result-file stems ("training_rf_r2_", "training_rf_mse_",
        "testing_rf_r2_", "testing_rf_mse_") to the corresponding score.
    """
    rf_model.fit(fit_X, fit_y.ravel())
    fit_pred = rf_model.predict(fit_X)
    test_pred = rf_model.predict(X_test)
    return {
        "training_rf_r2_": r2_score(fit_y, fit_pred),
        "training_rf_mse_": mean_squared_error(fit_y, fit_pred),
        "testing_rf_r2_": r2_score(y_test, test_pred),
        "testing_rf_mse_": mean_squared_error(y_test, test_pred),
    }


def main_function(query_number, iters):
    """Run one batch active-learning experiment and save its results to disk.

    Workflow:
      1. Load the train/test split number ``iters`` of the Gold_Nano dataset.
      2. Draw the initial labeled set in two stages (``initial_size`` samples,
         then ``total_initail_size - initial_size`` more), both seeded with
         ``seed_initial[iters]``.
      3. Fit a random forest on the labeled set and record its train/test
         R2 and MSE.
      4. Fit the neural-network predictor, then for ``query_number`` rounds:
         predict on the remaining pool, pick ``batch_size`` samples with
         ``custom_query_strategy``, add them to the labeled set, re-score the
         random forest, and refit the predictor.
      5. Save the labeled data/labels and the four metric histories as .npy files.

    Reads the module-level names ``seed_rf``, ``seed_initial``, ``seed_nn1``,
    ``seed_nn2``, ``device``, ``initial_size``, ``total_initail_size`` and
    ``batch_size``.

    Args:
        query_number: number of active-learning query rounds to run.
        iters: index of the experiment; selects the seeds and the dataset
            files, and is appended to every output file name.

    Returns:
        None. Results are written under ``..\\..\\Results\\Res_Gold\\Greedy1\\Summary``.
    """
    # Raw strings give exactly the same Windows-style paths as before without
    # relying on invalid escape sequences; a raw string cannot end in a
    # backslash, hence the separate "\\" suffix.
    data_prefix = r"..\..\Datasets\Gold_Nano" + "\\"
    results_prefix = r"..\..\Results\Res_Gold\Greedy1\Summary" + "\\"
    run_tag = str(iters)

    def load_split(stem):
        # allow_pickle=True lets numpy unpickle object arrays; only load trusted files.
        return np.load(data_prefix + stem + run_tag + ".npy", allow_pickle=True).astype(np.float32)

    def as_device_tensor(array):
        return torch.from_numpy(array).to(device)

    # The model for evaluation:
    rf_model = RandomForestRegressor(random_state=seed_rf[iters], n_estimators=100)

    # For saving results (keys double as output file stems):
    history = {
        "testing_rf_r2_": [],
        "testing_rf_mse_": [],
        "training_rf_r2_": [],
        "training_rf_mse_": [],
    }

    # Load the data:
    X_train = load_split("X_train")
    X_test = load_split("X_test")
    y_train = load_split("y_train").reshape(-1, 1)
    y_test = load_split("y_test").reshape(-1, 1)

    def record_rf_scores(fit_X, fit_y):
        """Score the forest on the current labeled set, log it, return the test R2."""
        scores = _score_random_forest(rf_model, fit_X, fit_y, X_test, y_test)
        for name, value in scores.items():
            history[name].append(value)
        return scores["testing_rf_r2_"]

    # Feature dimension:
    n_features = X_train.shape[1]
    # The unlabeled pool, stored as row indices into X_train:
    X_index = np.arange(X_train.shape[0])

    # Initial Stage 1:
    np.random.seed(seed_initial[iters])
    drawn = np.random.choice(range(len(X_index)), size=initial_size, replace=False)
    stage1_idx = X_index[drawn]
    X_index = np.delete(X_index, drawn, axis=0)

    # Initial Stage 2 (re-seeded with the same seed, drawn from the reduced pool):
    np.random.seed(seed_initial[iters])
    drawn = np.random.choice(range(len(X_index)), size=total_initail_size - initial_size, replace=False)
    stage2_idx = X_index[drawn]
    X_index = np.delete(X_index, drawn, axis=0)

    # Queried samples and labels, stage-1 rows first, then stage-2 rows:
    initial_idx = np.concatenate([stage1_idx, stage2_idx])
    used_data = X_train[initial_idx].astype(np.float32)
    used_label = y_train[initial_idx].astype(np.float32).reshape(-1, 1)

    # Finished the initialization stages.
    # RF regressor for evaluation:
    rf_model_r2 = record_rf_scores(used_data, used_label)

    # Seed before MLP(...) is constructed so its initial weights are reproducible.
    np.random.seed(seed_nn1[iters])
    torch.manual_seed(seed_nn2[iters])
    predictor = NeuralNetRegressor(MLP(n_features),  # presumably the input width; confirm in model.py
                                   criterion=MSELoss(),
                                   optimizer=torch.optim.Adam,
                                   verbose=0,
                                   max_epochs=100,
                                   lr=0.001,
                                   # Used for the batch AL
                                   callbacks=[EarlyStopping(patience=20), ('lr_scheduler', LRScheduler(policy=ReduceLROnPlateau))],
                                   train_split=ValidSplit(cv=5),
                                   warm_start=False,
                                   # Same device as the training tensors, so CPU-only hosts work too.
                                   device=device,
                                   batch_size = 200
                                   )
    # Train:
    predictor.fit(as_device_tensor(used_data), as_device_tensor(used_label))
    print("NN Initialization Scores:", r2_score(y_test, predictor.predict(X_test)))

    print("After Initialization RF R2:", rf_model_r2)

    for query_no in range(query_number):
        # Re-seeds numpy's global RNG from fresh entropy (kept from the original).
        np.random.seed(None)

        print('Query no. %d' % (query_no+1))

        pool_X = X_train[X_index]
        predictions = predictor.predict(as_device_tensor(pool_X))
        picked = custom_query_strategy(used_data, pool_X, predictions, used_label, batch_size)

        # Query the new samples (picked holds positions within the pool):
        X_train_index = X_index[picked]

        new_X = X_train[X_train_index].reshape(batch_size, -1)
        new_y = y_train[X_train_index].reshape(batch_size, -1)

        # Adding the queried data to the used_data pool
        used_data = np.append(used_data, new_X, axis=0).astype(np.float32)
        used_label = np.append(used_label, new_y, axis=0).astype(np.float32).reshape(-1, 1)

        # RF evaluation (training and test scores):
        rf_model_r2 = record_rf_scores(used_data, used_label)

        # Remove queried instances from the pool
        X_index = np.delete(X_index, picked, axis=0)

        print(np.unique(used_data, axis=0).shape)
        print("RF R2:", rf_model_r2)
        print("Remaining:", X_index.shape[0])

        # The predictor is only needed to rank the pool in the next round, so
        # the refit after the final query is skipped.
        if query_no < query_number - 1:
            predictor.fit(as_device_tensor(used_data), as_device_tensor(used_label))

    # Used data and used labels:
    np.save(file=results_prefix + "used_data" + run_tag + ".npy", arr=used_data)
    np.save(file=results_prefix + "used_labels" + run_tag + ".npy", arr=used_label)

    # RF metric histories:
    for name, values in history.items():
        np.save(file=results_prefix + name + run_tag + ".npy", arr=np.array(values))

In [5]:
# Run every experiment in turn; each call saves its own result files.
for i in range(iteration):
    print("The Iteration is ", i)
    main_function(query_number=query_number, iters=i)

The Iteration is  0
NN Initialization Scores: -0.7774960687130186
After Initialization RF R2: 0.8928400977917432
Query no. 1
(136, 182)
RF R2: 0.8932615218649245
Remaining: 3063
Query no. 2
(176, 182)
RF R2: 0.9086251981490503
Remaining: 3023
Query no. 3
(216, 182)
RF R2: 0.9399829012943245
Remaining: 2983
Query no. 4
(256, 182)
RF R2: 0.9511560298286089
Remaining: 2943
Query no. 5
(296, 182)
RF R2: 0.9520579885586246
Remaining: 2903
Query no. 6
(336, 182)
RF R2: 0.9520656678542576
Remaining: 2863
Query no. 7
(376, 182)
RF R2: 0.956740284910274
Remaining: 2823
Query no. 8
(416, 182)
RF R2: 0.9589397039416498
Remaining: 2783
Query no. 9
(456, 182)
RF R2: 0.9581403682322286
Remaining: 2743
Query no. 10
(496, 182)
RF R2: 0.9612665096730675
Remaining: 2703
Query no. 11
(536, 182)
RF R2: 0.9687556825238441
Remaining: 2663
Query no. 12
(576, 182)
RF R2: 0.9771686142293862
Remaining: 2623
Query no. 13
(616, 182)
RF R2: 0.9783668541704037
Remaining: 2583
Query no. 14
(656, 182)
RF R2: 0.986745