In [3]:
def load_data_by_dimension(dimension):
    """Load the kryptonite feature and label arrays for one dimension.

    Reads ``./datasets/kryptonite-{dimension}-X.npy`` and
    ``./datasets/kryptonite-{dimension}-y.npy`` with ``np.load`` and prints a
    one-line summary of the loaded shapes.

    Args:
        dimension: Value interpolated into the two file names.

    Returns:
        ``(data, labels)`` on success, or ``(None, None)`` when either file is
        missing (a message is printed instead of raising).
    """
    prefix = f"./datasets/kryptonite-{dimension}"
    try:
        features = np.load(prefix + "-X.npy")
        targets = np.load(prefix + "-y.npy")
    except FileNotFoundError:
        # Missing data is reported, not raised; callers must check for None.
        print(f"Dataset for dimension {dimension} not found.")
        return None, None
    else:
        print(f"Loaded dataset with dimension {dimension}. Data shape: {features.shape}, Labels shape: {targets.shape}")
        return features, targets

In [None]:
# feature aggregation and grouping.
# Grouping Criteria: variance
# Aggregation Function: mean
import numpy as np
def _aggregate_by_variance(data, threshold):
    """Merge feature columns with similar variance into their per-sample mean.

    Columns are visited in ascending order of variance. A new group starts
    whenever a column's variance is at least ``threshold`` above the variance
    of the first column of the current group; otherwise the column joins the
    current group.

    Args:
        data: 2-D array indexed as ``data[sample, feature]``.
        threshold: Minimum variance gap that opens a new group.

    Returns:
        Array with one column per group (groups ordered by ascending
        variance) and the same number of rows as ``data``. ``data`` is
        returned unchanged when it has no columns.
    """
    # axis=0 -> one variance per feature column; the indices below are used
    # as column indices, so they must range over features, not samples.
    variances = np.var(data, axis=0)
    order = np.argsort(variances)
    if order.size == 0:
        return data

    groups = [[order[0]]]
    group_start = variances[order[0]]
    for column in order[1:]:
        if variances[column] - group_start >= threshold:
            groups.append([column])
            group_start = variances[column]
        else:
            groups[-1].append(column)

    # data[:, group] is (samples, len(group)); averaging over axis=1 keeps one
    # value per sample, so every aggregated feature has the same length.
    aggregated = [data[:, group].mean(axis=1) for group in groups]
    return np.array(aggregated).transpose()


def load_aggregated_data(dimension, threshold):
    """Load a dataset and replace groups of similar-variance features by their mean.

    Args:
        dimension: Passed to ``load_data_by_dimension``.
        threshold: Variance gap that separates two feature groups (see
            ``_aggregate_by_variance``).

    Returns:
        ``(aggregated_data, labels)``, or the loader's ``(None, labels)``
        result unchanged when no data could be loaded.
    """
    data, labels = load_data_by_dimension(dimension)
    if data is None:
        return data, labels
    return _aggregate_by_variance(data, threshold), labels

In [None]:
import time
from datetime import datetime
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Ray Tune search space consumed by `objective` (one sampled dict per trial).
cs = {
    # Network shape. NOTE(review): per the Ray Tune docs the upper bound of
    # tune.randint is exclusive, so depth is sampled from {3, 4, 5} - confirm
    # that is intended.
    "layer_width": tune.randint(32, 256),
    "layer_depth": tune.randint(3, 6),
    "activation": tune.choice(['relu','tanh','logistic']),
    # Optimiser settings.
    "lr": tune.loguniform(1e-4, 1e-2),
    "momentum": tune.uniform(0.1, 0.9),
    # Input pipeline: raw features, or features grouped by variance.
    "preprocess": tune.choice(['base','aggregated']),
    # tune.uniform requires explicit (lower, upper) bounds; calling it with no
    # arguments raises TypeError. TODO(review): [0, 1] is a placeholder range -
    # choose bounds from the spread of the actual feature variances.
    "grouping_threshold": tune.uniform(0.0, 1.0),
# MLPClassifier keyword arguments that are NOT searched, listed with what
# appear to be their scikit-learn default values, for reference only:
# activation="relu",
# solver = "adam",
# alpha = 0.0001,
# batch_size = "auto",
# learning_rate = "constant",
# learning_rate_init = 0.001,
# power_t = 0.5,
# max_iter = 200,
# shuffle = True,
# random_state = None,
# tol = 1e-4,
# verbose = False,
# warm_start = False,
# momentum = 0.9,
# nesterovs_momentum = True,
# early_stopping = False,
# validation_fraction = 0.1,
# beta_1 = 0.9,
# beta_2 = 0.999,
# epsilon = 1e-8,
# n_iter_no_change = 10,
# max_fun = 15000,
}

def objective(config):
    """Ray Tune trainable: fit one MLP for ``config`` and report its accuracy.

    The dataset dimension is read from the module-level ``dim``, which the
    caller must set before starting the search.

    Args:
        config: One sample of the search space. Keys read: ``preprocess``,
            ``grouping_threshold`` (only for ``"aggregated"``),
            ``layer_width``, ``layer_depth``, ``activation``, ``lr`` and
            ``momentum``.

    Raises:
        FileNotFoundError: If the dataset for ``dim`` could not be loaded.
    """
    global dim
    if config["preprocess"] == "aggregated":
        data, labels = load_aggregated_data(dim, config["grouping_threshold"])
    else:
        data, labels = load_data_by_dimension(dim)
    if data is None:
        # The loaders signal a missing dataset with None; fail with a clear
        # message instead of an obscure error inside train_test_split.
        raise FileNotFoundError(f"Dataset for dimension {dim} not found.")

    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

    # `layer_depth` hidden layers of `layer_width` units each.
    layers = [config["layer_width"]] * config["layer_depth"]
    # NOTE(review): scikit-learn documents `momentum` as used only with
    # solver="sgd"; the solver is left at its default here - confirm whether
    # the momentum search is meant to have an effect.
    model = MLPClassifier(
        hidden_layer_sizes=layers,
        max_iter=1000,
        activation=config["activation"],
        learning_rate_init=config["lr"],
        momentum=config["momentum"],
    )

    # Fit and report exactly once so the trial terminates. Each fit() call
    # restarts training (warm_start is not enabled), so looping would only
    # repeat the same work without ever finishing the trial.
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    train.report({"mean_accuracy": acc})  # Report to Tune

In [None]:
# hyperparameter search
def search():
    """Run a time-boxed Ray Tune search over ``cs`` and return the best config.

    Uses ``OptunaSearch`` to propose configurations for ``objective``,
    maximising ``mean_accuracy`` for two minutes with an unlimited number of
    samples. Results are stored under ``./results/simpleneuralnetwork``. The
    elapsed time, best config and best accuracy are printed.

    Returns:
        The ``config`` dict of the best trial.
    """
    algo = OptunaSearch()
    tuner = tune.Tuner(
        objective,
        tune_config=tune.TuneConfig(
            metric="mean_accuracy",
            mode="max",
            search_alg=algo,
            num_samples=-1,
            time_budget_s=60 * 2,
            # A timestamp with one-second resolution is not unique when
            # several trials start within the same second, so the trial id is
            # appended to keep every trial in its own directory.
            trial_dirname_creator=lambda trial: "{}_{}".format(
                datetime.now().strftime("%H_%M_%S"), trial.trial_id
            ),
        ),
        run_config=train.RunConfig(
            storage_path="./results",
            name="simpleneuralnetwork",
            verbose=0,
        ),
        param_space=cs,
    )
    begin = time.time()
    results = tuner.fit()
    elapsed = time.time() - begin
    best_result = results.get_best_result(metric="mean_accuracy", mode="max")
    best_config = best_result.config
    print("auto optimization finished")
    print('time for optimisation (seconds):' + str(elapsed))
    print("Best trial config: {}".format(best_config))
    print("Best accuracy: {}".format(best_result.metrics["mean_accuracy"]))
    return best_config

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sweep over dataset dimensions: tune an MLP per dimension, retrain it with
# the best configuration found, and record its held-out accuracy.
dimensions = [9, 12, 15, 18, 24, 30, 45]
best_ks = []  # not used in this cell; kept in case another cell still reads it
accuracies = []

# `objective` reads the module-level `dim`, so the loop variable doubles as
# that global. Every entry of `dimensions` is visited so that `accuracies`
# has one value per plotted x position.
for dim in dimensions:
    best = search()

    # Rebuild the same input pipeline the winning trial was evaluated with.
    if best["preprocess"] == "aggregated":
        data, labels = load_aggregated_data(dim, best["grouping_threshold"])
    else:
        data, labels = load_data_by_dimension(dim)
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

    # `layer_depth` hidden layers of `layer_width` units each (keys as
    # defined in the search space `cs`).
    layers = [best["layer_width"]] * best["layer_depth"]
    model = MLPClassifier(
        hidden_layer_sizes=layers,
        max_iter=1000,
        activation=best["activation"],
        learning_rate_init=best["lr"],
        momentum=best["momentum"],
    )
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    accuracies.append(accuracy)

# Plot the test accuracy of the tuned MLP for each dataset dimension
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(dimensions, accuracies, marker='o', linestyle='-', color='b')
ax.set_xlabel('Number of Dimensions')
ax.set_ylabel('Accuracy')
ax.set_title('MLP Performance vs. Dimensionality')
ax.grid(True)
plt.show()