In [1]:
from elm import ELM, approximated_ENRELM, incremental_ENRELM
import numpy as np
import os
from sklearn.model_selection import train_test_split
import utils
from matplotlib import pyplot as plt
import time
import logger
import datagenerator
import dataloader
import pandas as pd




# Utils

In [2]:
def looped_ELM(X_train, X_test, y_train, y_test, loops = 20):
    hidden_space_dim = int(np.minimum(50 * X_train.shape[0], round(X_train.shape[1] * 0.5)))
    
    test = {
        'X': X_test,
        'y': y_test
    }

    loops = 20
    loops_train_err_ELM = np.zeros(shape=(loops,hidden_space_dim))
    loops_test_err_ELM = np.zeros(shape=(loops,hidden_space_dim))

    total_timing = 0                                                                            

    for loop in range(loops):
        for n in range(1,hidden_space_dim+1):
            _, _, results = ELM(n, -1, X_train, y_train, test)
            loops_train_err_ELM[loop, n-1] = results['training_error']
            loops_test_err_ELM[loop, n-1] = results['test_error']
            total_timing += results['timing']

                                                                          

    mean_loop_timing = total_timing / loops

    training_error = np.mean(loops_train_err_ELM, axis = 0)
    std_training_error = np.std(loops_train_err_ELM, axis = 0)
    min_training_error = np.min(loops_train_err_ELM, axis = 0)
    max_training_error = np.max(loops_train_err_ELM, axis = 0)

    test_error = np.mean(loops_test_err_ELM, axis = 0)
    std_test_error = np.std(loops_test_err_ELM, axis = 0)
    min_test_error = np.min(loops_test_err_ELM, axis = 0)
    max_test_error = np.max(loops_test_err_ELM, axis = 0)


    data = {
        'training_error': training_error,
        'loops_training_error': loops_train_err_ELM,
        'std_training_error': std_training_error,
        'min_training_error': min_training_error,
        'max_training_error': max_training_error,
        'test_error': test_error,
        'loops_test_error': loops_test_err_ELM,
        'std_test_error': std_test_error,
        'min_test_error': min_test_error,
        'max_test_error': max_test_error,
        'total_timing': total_timing,
        'mean_loop_timing': mean_loop_timing
        }

    return data



def single_dataset_simulation(X,y):
    seed_train_test_split = 1234

    X_train, X_test, y_train, y_test = train_test_split(X.T ,y, random_state = seed_train_test_split)
    X_train, X_test, y_train, y_test, mean_y_train = utils.preprocess(X_train, X_test, y_train, y_test)
    X_train = X_train.T
    X_test = X_test.T

    
    test = {
        'X': X_test,
        'y': y_test
    }

    # approximated ENR-ELM
    _, _, results_approximated_ENRELM = approximated_ENRELM(X_train, y_train, sort_by_correlation=True, test=test)  

    # incremental ENR-ELM
    _, _, _, results_incremental_ENRELM = incremental_ENRELM(X_train, y_train, epsilon = 1/np.sqrt(y_train.shape[0]), threshold=1e-5, test = test)

    #ELM
    results_ELM = looped_ELM(X_train, X_test, y_train, y_test, loops = 20)
    
    results = [results_approximated_ENRELM, results_incremental_ENRELM, results_ELM]
    suffixes = ["_approximated_ENRELM", "_incremental_ENRELM", "_ELM"]

    data = {}
    for result, suffix in zip(results, suffixes):
        for key, value in result.items():
            data[key + suffix] = value


    return data

In [3]:
def write_df_csv(filename, df_times):
    # Check if the file exists
    folder_path = 'results/datasets'
    full_path = os.path.join(folder_path, filename)

    # Create the directories if they don't exist
    os.makedirs(folder_path, exist_ok=True)

    # Save the dataframe to a CSV file
    df_times.to_csv(full_path, sep=";", index=False)
    #if not os.path.isfile(filename):
        # File does not exist, save df_times as is

def write_np_npz(filename, data):
    folder_path = 'results/datasets'
    full_path = os.path.join(folder_path, filename)

    # Create the directories if they don't exist
    os.makedirs(folder_path, exist_ok=True)
    np.savez_compressed(full_path, **data)

# Real Datasets

### Performance

In [10]:
loaders = {
    1: ("Abalone", dataloader.load_abalone),
    2: ("Auto MPG", dataloader.load_auto_mpg),
    3: ("Bank", dataloader.load_bank),
    4: ("California Housing", dataloader.load_california_housing),
    5: ("Delta Ailerons", dataloader.load_delta_ailerons),
    6: ("LA Ozone", dataloader.load_LAozone),
    7: ("Machine CPU", dataloader.load_machine_cpu),
    8: ("Prostate Cancer", dataloader.load_prostate),
    9: ("Servo", dataloader.load_servo)
}

for key, value in loaders.items():
    name = value[0]
    loader = value[1]
    generator = datagenerator.LoaderDataGenerator(name, loader)
    res = generator.generate()
    print(res['name'])
    X = (res['data'][0]).T
    y = res['data'][1].T

    results = single_dataset_simulation(X, y)
    write_np_npz(name + "_results" + ".npz", results)

Abalone
<class 'pandas.core.frame.DataFrame'>
Auto MPG
Bank
California Housing
Delta Ailerons
LA Ozone
Machine CPU
Prostate Cancer
Servo


### Times

In [10]:
iterations = 5
df_times = pd.DataFrame(columns=["dataset", "iteration", "timing_approximated_ENRELM", "timing_incremental_ENRELM", "total_timing_ELM", "mean_loop_timing_ELM"])
loaders = {
    1: ("Abalone", dataloader.load_abalone),
    2: ("Auto MPG", dataloader.load_auto_mpg),
    3: ("Bank", dataloader.load_bank),
    4: ("California Housing", dataloader.load_california_housing),
    5: ("Delta Ailerons", dataloader.load_delta_ailerons),
    6: ("LA Ozone", dataloader.load_LAozone),
    7: ("Machine CPU", dataloader.load_machine_cpu),
    8: ("Prostate Cancer", dataloader.load_prostate),
    9: ("Servo", dataloader.load_servo)
}

dataset_index = 0
for key, value in loaders.items():
    dataset_index += 1
    name = value[0]
    loader = value[1]
    generator = datagenerator.LoaderDataGenerator(name, loader)
    res = generator.generate()
    X = res['data'][0]
    y = res['data'][1]
    print(name)

    # Loop over iterations
    for iteration in range(iterations):
        results = single_dataset_simulation(X, y)
        # Append a new row to the dataframe with the results and dataset name
        df_times = pd.concat([df_times, pd.DataFrame({
            "dataset": [name],
            "iteration": [iteration],
            "timing_approximated_ENRELM": [results["timing_approximated_ENRELM"]],
            "timing_incremental_ENRELM": [results["timing_incremental_ENRELM"]],
            "total_timing_ELM": [results["total_timing_ELM"]],
            "mean_loop_timing_ELM": [results["mean_loop_timing_ELM"]]
        })], ignore_index=True)

    # saving results when computing times is disabled
    #write_np_npz(name + "_results" + ".npz", results)

# Save times
write_df_csv("times_real.csv", df_times)

Abalone


  df_times = pd.concat([df_times, pd.DataFrame({


<class 'pandas.core.frame.DataFrame'>
Auto MPG
Bank
California Housing
Delta Ailerons
LA Ozone
Machine CPU
Prostate Cancer
Servo


# Synthetic datasets

### Performance

In [8]:
for dataset_index in range(1, 49):
    filename = f'dataset_{dataset_index}.csv'
    file_path = os.path.join('datasets/synthetic', filename)
    print(filename)
    # Load the dataset from the CSV file
    dataset = np.loadtxt(file_path, delimiter=',')

    # Split the dataset into X and y
    X = dataset[:-1, :]
    y = (dataset[-1, :]).reshape(-1,1)
    
    # Run simulation
    results = single_dataset_simulation(X, y)
    write_np_npz(filename[:-4] + "_results" + ".npz", results)


dataset_1.csv
dataset_2.csv
dataset_3.csv
dataset_4.csv
dataset_5.csv
dataset_6.csv
dataset_7.csv
dataset_8.csv
dataset_9.csv
dataset_10.csv
dataset_11.csv
dataset_12.csv
dataset_13.csv
dataset_14.csv
dataset_15.csv
dataset_16.csv
dataset_17.csv
dataset_18.csv
dataset_19.csv
dataset_20.csv
dataset_21.csv
dataset_22.csv
dataset_23.csv
dataset_24.csv
dataset_25.csv
dataset_26.csv
dataset_27.csv
dataset_28.csv
dataset_29.csv
dataset_30.csv
dataset_31.csv
dataset_32.csv
dataset_33.csv
dataset_34.csv
dataset_35.csv
dataset_36.csv
dataset_37.csv
dataset_38.csv
dataset_39.csv
dataset_40.csv
dataset_41.csv
dataset_42.csv
dataset_43.csv
dataset_44.csv
dataset_45.csv
dataset_46.csv
dataset_47.csv
dataset_48.csv


### Times

In [4]:
iterations = 10
df_times = pd.DataFrame(columns=["dataset", "iteration", "timing_approximated_ENRELM", "timing_incremental_ENRELM", "total_timing_ELM", "mean_loop_timing_ELM"])
first_idx = 1
last_idx = 48
for dataset_index in range(first_idx, last_idx+1):
    filename = f'dataset_{dataset_index}.csv'
    file_path = os.path.join('datasets/synthetic', filename)
    # Load the dataset from the CSV file
    dataset = np.loadtxt(file_path, delimiter=',')
    print(filename[:-4])
        
    # Split the dataset into X and y
    X = dataset[:, :-1]
    y = (dataset[:, -1]).reshape(-1,1)
    for iteration in range(iterations):
        results = single_dataset_simulation(X, y)
            # Append a new row to the dataframe with the results and dataset name
        df_times = pd.concat([df_times, pd.DataFrame({
            "dataset": [f'dataset_{dataset_index}'],
            "iteration": [iteration],
            "timing_approximated_ENRELM": [results["timing_approximated_ENRELM"]],
            "timing_incremental_ENRELM": [results["timing_incremental_ENRELM"]],
            "total_timing_ELM": [results["total_timing_ELM"]],
            "mean_loop_timing_ELM": [results["mean_loop_timing_ELM"]]
        })], ignore_index=True)
    # saving results when computing times is disabled
    #write_np_npz(filename[:-4] + "_results" + ".npz", results)

# Save times
write_df_csv("times_synthetic" + str(first_idx) + "_" + str(last_idx)+ ".csv", df_times)

dataset_1


  df_times = pd.concat([df_times, pd.DataFrame({


dataset_2
dataset_3
dataset_4
dataset_5
dataset_6
dataset_7
dataset_8
dataset_9
dataset_10
dataset_11
dataset_12
dataset_13
dataset_14
dataset_15
dataset_16
dataset_17
dataset_18
dataset_19
dataset_20
dataset_21
dataset_22
dataset_23
dataset_24
dataset_25
dataset_26
dataset_27
dataset_28
dataset_29
dataset_30
dataset_31
dataset_32
dataset_33
dataset_34
dataset_35
dataset_36
dataset_37
dataset_38
dataset_39
dataset_40
dataset_41
dataset_42
dataset_43
dataset_44
dataset_45
dataset_46
dataset_47
dataset_48
