In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from modAL.models import ActiveLearner, CommitteeRegressor
from modAL.disagreement import vote_entropy_sampling, max_std_sampling
import matplotlib.pyplot as plt
import torch
from torch.nn import MSELoss
from skorch.regressor import NeuralNetRegressor
from copy import deepcopy
from skorch.dataset import ValidSplit
from skorch.callbacks import EarlyStopping
from sklearn.cluster import DBSCAN, OPTICS, cluster_optics_dbscan, Birch, SpectralClustering
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics
import matplotlib.gridspec as gridspec
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from torch.nn import Linear
from torch.nn import Sigmoid, ReLU
from torch.nn import Module
from sklearn.metrics import r2_score
import copy
from scipy.spatial import distance
from scipy.spatial.distance import euclidean
from torch import tensor
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import cdist
from sklearn.ensemble import RandomForestRegressor
from torch.nn.init import xavier_uniform_
from model import MLP
from model_state import get_model_params, get_input_for_hidden_layers
from functions import clustering, sample_the_cloest_data, data_input_space_diversity, minimum_distance_to_used_data, sample_by_input_diversity
from functions import get_variance, qbc, normalization, error_reduct_fuc, total_disagrement
import random
from skorch.callbacks import EarlyStopping, LRScheduler, Freezer, Unfreezer
from torch.optim.lr_scheduler import CyclicLR, ReduceLROnPlateau
from skorch.regressor import NeuralNetRegressor, NeuralNet
from model_state import features_concat, learning_state_features_concat
from modAL.utils.selection import multi_argmax
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [2]:
# Experiment configuration (tunable constants for the whole notebook).
NN_setting = [50, 30, 15]  # Inner model size
query_number = 20     # The number of the AL Iterations in each Exp
iteration = 50         # Total number of the Exps
batch_size = 10        # Samples queried per AL iteration
total_initail_size = 10  # Labelled samples available before the first query (name kept: used elsewhere)
initial_size = 5       # Size of the first random draw of labelled samples

# Pre-generated seeds, one entry per experiment index.
# Raw strings keep the Windows-style backslashes literal; the resulting path
# values are unchanged, but the unrecognised escapes (\., \D, \s) no longer
# trigger the invalid-escape-sequence warning on newer Python versions.
seed_rf = np.load(file=r"..\..\Datasets\Diabetes\seed_for_RF.npy")
seed_initial = np.load(file=r"..\..\Datasets\Diabetes\seed_for_initialSamples.npy")
seed_nn = np.load(file=r"..\..\Datasets\Diabetes\seed_for_nn1.npy")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def get_closest_dist(unlabeled_data, labeled_data):
    """Return, for each unlabeled row, the Euclidean distance to its nearest labeled row.

    Both arguments are passed straight to ``cdist`` and therefore have to be
    2-D arrays with the same number of columns. The result is a 1-D array
    with one entry per row of ``unlabeled_data``.
    """
    # Distances are taken against the full labeled set (global information),
    # not restricted to any cluster.
    pairwise = cdist(unlabeled_data, labeled_data, metric='euclidean')
    return pairwise.min(axis=1)

In [4]:
def get_closest_dist_xy(unlabeled_predictions, labeled_data_label, unlabeled_data, labeled_data):
    """Return the smallest input-distance * output-distance product per unlabeled row.

    For every (unlabeled, labeled) pair the Euclidean distance between the
    feature rows is multiplied by the Euclidean distance between the
    predicted value and the known label; the minimum over the labeled
    samples is returned for each unlabeled row.

    ``unlabeled_predictions`` is reshaped into a single column, so
    ``labeled_data_label`` has to be a 2-D single-column array for ``cdist``
    to accept the pair.
    """
    prediction_column = np.array(unlabeled_predictions).reshape(-1, 1)
    output_gap = cdist(prediction_column, labeled_data_label, metric='euclidean')
    input_gap = cdist(unlabeled_data, labeled_data, metric='euclidean')
    # Element-wise product, then nearest labeled sample along each row.
    return (input_gap * output_gap).min(axis=1)

In [5]:
def custom_query_strategy(model, X_pool, used_data, used_label, n_instances):
    """Select the ``n_instances`` pool candidates with the highest utility.

    The utility of a candidate is the value returned by
    ``get_closest_dist_xy`` for the model's prediction on that candidate,
    measured against the already used samples and labels. The selection
    itself is delegated to ``multi_argmax`` and its result is returned as is.
    """
    pool_predictions = model.predict(X_pool)
    scores = get_closest_dist_xy(pool_predictions, used_label, X_pool, used_data)
    return multi_argmax(scores, n_instances=n_instances)

In [6]:
def _fit_and_score_rf(rf_model, used_data, used_label, X_test, y_test):
    """Refit ``rf_model`` on the labelled set and score it on train and test data.

    Returns a dict with the keys ``training_r2``, ``training_mse``,
    ``testing_r2`` and ``testing_mse``. Each prediction is computed once and
    shared by the R2 and MSE metrics.
    """
    rf_model.fit(used_data, used_label.ravel())
    train_prediction = rf_model.predict(used_data)
    test_prediction = rf_model.predict(X_test)
    return {
        "training_r2": r2_score(used_label, train_prediction),
        "training_mse": mean_squared_error(used_label, train_prediction),
        "testing_r2": r2_score(y_test, test_prediction),
        "testing_mse": mean_squared_error(y_test, test_prediction),
    }


def main_function(query_number, iters):
    """Run one active-learning experiment and store its learning curves.

    Steps:
      1. Load the train/test split number ``iters``.
      2. Draw ``initial_size`` random labelled samples, then
         ``total_initail_size - initial_size`` more, scoring the seeded
         random forest after each draw.
      3. For ``query_number`` rounds, fit a 1000-tree forest on the labelled
         set, pick ``batch_size`` pool samples with ``custom_query_strategy``,
         add them to the labelled set and re-score the seeded forest.
      4. Save the four curves (train/test R2 and MSE) as ``.npy`` files and
         append them to the module-level lists ``total_rf_r2``,
         ``total_rf_mse``, ``total_rf_training_r2`` and
         ``total_rf_training_mse``, which must exist before the call.

    Args:
        query_number: number of query rounds to run.
        iters: experiment index; selects the data files, the seeds and the
            suffix of the output file names.

    Returns:
        None. Results are written to disk and to the module-level lists.
    """
    # Seeds that were considered here but are not applied:
    # torch.manual_seed(seed_nn[iter])
    rf_model = RandomForestRegressor(random_state=seed_rf[iters], n_estimators=100)

    # Load the data. Raw strings keep the backslashes literal without relying
    # on unrecognised escape sequences; the path values are unchanged.
    file_suffix = str(iters) + ".npy"
    X_train = np.load(r"..\..\diabetes_dataset_20test\X_train_" + file_suffix).astype(np.float32)
    X_test = np.load(r"..\..\diabetes_dataset_20test\X_test_" + file_suffix).astype(np.float32)
    y_train = np.load(r"..\..\diabetes_dataset_20test\y_train_" + file_suffix).astype(np.float32).reshape(-1, 1)
    y_test = np.load(r"..\..\diabetes_dataset_20test\y_test_" + file_suffix).astype(np.float32).reshape(-1, 1)

    # Row indices of X_train that have not been labelled yet.
    pool_index = np.arange(X_train.shape[0])

    # One list per metric; an entry is appended after every (re)fit.
    curves = {"training_r2": [], "training_mse": [], "testing_r2": [], "testing_mse": []}

    def log_scores(data, label):
        """Refit/score rf_model on (data, label), log all metrics, return the test R2."""
        scores = _fit_and_score_rf(rf_model, data, label, X_test, y_test)
        for metric_name, value in scores.items():
            curves[metric_name].append(value)
        return scores["testing_r2"]

    # Initial samples: first random draw.
    np.random.seed(seed_initial[iters])
    picked = np.random.choice(range(len(pool_index)), size=initial_size, replace=False)
    first_rows = pool_index[picked]
    used_data = X_train[first_rows]
    used_label = y_train[first_rows].reshape(-1, 1)
    pool_index = np.delete(pool_index, picked, axis=0)

    log_scores(used_data, used_label)

    # Other random sampling: top the labelled set up to total_initail_size.
    # The generator is re-seeded with the same value as for the first draw,
    # but the draw is taken from the already reduced pool.
    np.random.seed(seed_initial[iters])
    picked = np.random.choice(
        range(len(pool_index)), size=total_initail_size - initial_size, replace=False
    )
    extra_rows = pool_index[picked]
    pool_index = np.delete(pool_index, picked, axis=0)

    used_data = np.append(used_data, X_train[extra_rows], axis=0).astype(np.float32)
    used_label = np.append(used_label, y_train[extra_rows], axis=0).astype(np.float32).reshape(-1, 1)

    # NOTE: a GSx-style alternative initialisation (sample closest to the
    # data centre, then farthest-point picks via get_closest_dist) used to be
    # kept here as commented-out code; it was never executed.

    after_init_r2 = log_scores(used_data, used_label)
    print("After Initialization", after_init_r2)
    print(np.unique(used_data, axis=0).shape)

    Pool_X = pool_index

    # The query model is deliberately left unseeded.
    np.random.seed(None)
    rr = RandomForestRegressor(n_estimators=1000)
    for query_no in range(query_number):
        np.random.seed(None)
        print('Query no. %d' % (query_no+1))
        rr.fit(used_data, used_label.ravel())

        # Positions (within the current pool) of the samples to label next.
        chosen = custom_query_strategy(rr, X_train[Pool_X], used_data, used_label, batch_size)

        # Query the new samples:
        chosen_rows = Pool_X[chosen]
        new_X = X_train[chosen_rows].reshape(batch_size, -1)
        new_y = y_train[chosen_rows].reshape(batch_size, -1)

        # Adding the used data to the used_data pool
        used_data = np.append(used_data, new_X, axis=0).astype(np.float32)
        used_label = np.append(used_label, new_y, axis=0).astype(np.float32).reshape(-1, 1)

        latest_test_r2 = log_scores(used_data, used_label)

        # remove queried instances from pool
        Pool_X = np.delete(Pool_X, chosen, axis=0)

        print(np.unique(used_data, axis=0).shape)
        print("RF R2:", latest_test_r2)

    # RF
    rf_model_testing_performance = np.array(curves["testing_r2"])
    rf_model_testing_mse = np.array(curves["testing_mse"])
    rf_model_training_r2 = np.array(curves["training_r2"])
    rf_model_training_mse = np.array(curves["training_mse"])

    summary_suffix = "_Greedy-" + str(iters) + ".npy"
    np.save(file="Res_Diabetes\\Greedy1\\Summary\\testing_rf_r2" + summary_suffix, arr=rf_model_testing_performance)
    np.save(file="Res_Diabetes\\Greedy1\\Summary\\testing_rf_mse" + summary_suffix, arr=rf_model_testing_mse)
    np.save(file="Res_Diabetes\\Greedy1\\Summary\\training_rf_r2" + summary_suffix, arr=rf_model_training_r2)
    np.save(file="Res_Diabetes\\Greedy1\\Summary\\training_rf_mse" + summary_suffix, arr=rf_model_training_mse)

    total_rf_r2.append(rf_model_testing_performance)
    total_rf_mse.append(rf_model_testing_mse)
    total_rf_training_r2.append(rf_model_training_r2)
    total_rf_training_mse.append(rf_model_training_mse)


# Accumulators filled by main_function (one array per experiment).
# They are (re)created here so that the cell works on a fresh kernel and so
# that re-running it does not keep curves from a previous run, which would
# skew the averages below (they always divide by `iteration`).
total_rf_r2 = []
total_rf_mse = []
total_rf_training_r2 = []
total_rf_training_mse = []

for i in range(iteration):
    print("The Iteration is ", i)
    main_function(query_number, i)


# Element-wise mean of each curve over all experiments.
averaged_rf_r2 = np.array([sum(x) for x in zip(* total_rf_r2)])/iteration
averaged_rf_mse = np.array([sum(x) for x in zip(* total_rf_mse)])/iteration
averaged_rf_training_r2 = np.array([sum(x) for x in zip(* total_rf_training_r2)])/iteration
averaged_rf_training_mse = np.array([sum(x) for x in zip(* total_rf_training_mse)])/iteration

# Output locations for the averaged curves (".npy" is appended on save).
file_name_6 = "Res_Diabetes\\Greedy1\\Averaged\\testing_rf_r2_Greedy-10"
file_name_7 = "Res_Diabetes\\Greedy1\\Averaged\\testing_rf_mse_Greedy-10"
file_name_8 = "Res_Diabetes\\Greedy1\\Averaged\\training_rf_mse_Greedy-10"
file_name_9 = "Res_Diabetes\\Greedy1\\Averaged\\training_rf_r2_Greedy-10"

np.save(file=file_name_6+".npy", arr=averaged_rf_r2)
np.save(file=file_name_7+".npy", arr=averaged_rf_mse)
np.save(file=file_name_8+".npy", arr=averaged_rf_training_mse)
np.save(file=file_name_9+".npy", arr=averaged_rf_training_r2)

The Iteration is  0
After Initialization 0.4523804380917551
(10, 10)
Query no. 1
(20, 10)
RF R2: 0.3502864264369876
Query no. 2
(30, 10)
RF R2: 0.4028294061254094
Query no. 3
(40, 10)
RF R2: 0.3321616705927336
Query no. 4
(50, 10)
RF R2: 0.39302697451985114
Query no. 5
(60, 10)
RF R2: 0.39487337617748663
Query no. 6
(70, 10)
RF R2: 0.42912731396653714
Query no. 7
(80, 10)
RF R2: 0.4350084581711733
Query no. 8
(90, 10)
RF R2: 0.46209722187802504
Query no. 9
(100, 10)
RF R2: 0.49524798398800296
Query no. 10
(110, 10)
RF R2: 0.49939217783861123
Query no. 11
(120, 10)
RF R2: 0.4910801637782989
Query no. 12
(130, 10)
RF R2: 0.4907516740834664
Query no. 13
(140, 10)
RF R2: 0.48950139986847696
Query no. 14
(150, 10)
RF R2: 0.49454565739946876
Query no. 15
(160, 10)
RF R2: 0.48774649034058815
Query no. 16
(170, 10)
RF R2: 0.4772166768093522
Query no. 17
(180, 10)
RF R2: 0.5032551527167436
Query no. 18
(190, 10)
RF R2: 0.49101490361758704
Query no. 19
(200, 10)
RF R2: 0.5090249382008823
Query n