In [2]:
## BASIC PLOTTING TEST

from data_generators import DataGenerator, SmoothDataGenerator, JumpDataGenerator, SinusoidalDataGenerator
from kernel_har import KernelHAR
from highly_adaptive_lasso import HAL
from highly_adaptive_ridge import HAR
import numpy as np
import pandas as pd 
from run_trials import RunTrials
import warnings
from train_time_plotter import TrainTimePlotter

# Suppress warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including numerical/convergence warnings raised while the models are fitted.
warnings.filterwarnings("ignore")

# Passed to RunTrials.run_trials and to the plot call in the next cell
# (presumably the input dimensionality -- confirm against RunTrials).
d = 1

# Create a list of sample sizes at regular intervals.
# np.arange excludes `stop`, so this yields 100, 200, ..., 900 (1000 is NOT included).
sample_sizes = np.arange(start=100, stop=1000, step=100)

# Number of trials to run for each sample size, dgp, model
num_trials = 3

# Create a data generator (an instance here; a later cell passes the classes themselves)
dgp = SmoothDataGenerator()

# Run the experiment; the returned object is handed straight to pd.DataFrame below.
results = RunTrials.run_trials(d, sample_sizes, num_trials, dgp)

# Convert results to DataFrame (kept as the kernel-global `df`, read by the plotting cell)
df = pd.DataFrame(results)

In [3]:
from train_time_plotter import TrainTimePlotter

# Plot the training time.
# NOTE: reads `df`, `d` and `dgp` left in the kernel by the cell that ran the trials,
# so that cell must have been executed first.
TrainTimePlotter.plot(df, d, dgp.name)

In [1]:
## Run trials for all combinations of d sizes, sample sizes, and data generators

# IMPORTS REQUIRED FOR TRAINING, PLOTTING, AND SAVING DATAFRAMES
import numpy as np
from data_generators import DataGenerator, SmoothDataGenerator, JumpDataGenerator, SinusoidalDataGenerator
import pandas as pd 
from run_trials import RunTrials
import warnings
from train_time_plotter import TrainTimePlotter
import os
import pickle

# Suppress warnings
# NOTE(review): this silences every warning category for the whole kernel session.
warnings.filterwarnings("ignore")

## Setup: Define the parameters for the experiment

DF_OUTPUT_DIR = "Training_df_files"
ALL_DF_FILE_NAMES = "Training_df_files/all_file_names.pickle"

# Neither DataFrame.to_pickle nor open(..., "wb") creates missing parent directories,
# so make sure the output directory exists before the (long) experiment starts.
os.makedirs(DF_OUTPUT_DIR, exist_ok=True)

d_sizes = [1, 3, 5]     # Dimensionality of the data (same as HAL paper)
num_trials = 5          # Number of trials to run for each sample size, dgp, model
# np.arange excludes `stop`: sample sizes are 100, 200, ..., 900
sample_sizes = np.arange(start=100, stop=1000, step=100)
data_generators = [SmoothDataGenerator, JumpDataGenerator, SinusoidalDataGenerator]
data_frames = [] # List to store all DataFrames for each combination of d, sample size, and data generator
all_plots = [] # List to store all plots (might as well!)
all_file_names = [] # List to store all file names for the saved DataFrames

# Run trials for all combinations of d sizes, sample sizes, and data generators.
# Results are appended in (d, dgp) order; the plotting cells rely on that order.
for d in d_sizes:
    for dgp in data_generators:

        # Run trials for the current combination of d, sample size, and data generator
        results = RunTrials.run_trials(d, sample_sizes, num_trials, dgp)

        # Convert results to DataFrame and append to the list of all DataFrames
        df = pd.DataFrame(results)
        data_frames.append(df)

        # Generate a descriptive file name, save to pickle format, and append to the list of all file names
        file_name = f"{DF_OUTPUT_DIR}/dataframe_d{d}_dgp_{dgp.name}.pickle"
        df.to_pickle(file_name)
        all_file_names.append(file_name)
        print(f"Saved DataFrame to {file_name}")

        # Rewrite the file-name index after every save so that an interrupted run
        # still leaves an index covering everything written so far. After the last
        # iteration the file content is the same as writing it once at the end.
        with open(ALL_DF_FILE_NAMES, "wb") as f:
            pickle.dump(all_file_names, f)

        # Append the plot to the list of all plots
        plot = TrainTimePlotter.plot(df, d, dgp.name)
        all_plots.append(plot)

Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 1.0
Best lambda: 10.0
Best lambda: 10.0
Best lambda: 1000.0
Best lambda: 100.0
Best lambda: 10.0
Best lambda: 1.0
Best lambda: 10.0
Best lambda: 100.0
Best lambda: 100.0
Best lambda: 1000.0
Best lambda: 1.0
Best lambda: 100.0
Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 10.0
Best lambda: 1000.0
Best lambda: 100.0
Best lambda: 100.0
Best lambda: 1000.0
Best lambda: 10.0
Best lambda: 1000.0
Best lambda: 100.0
Best lambda: 100.0
Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 100.0
Best lambda: 10.0
Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 100.0
Best lambda: 100.0
Best lambda: 100.0
Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 100.0
Best lambda: 1000.0
Best lambda: 100.0
Best lambda: 1000.0
Best lambda: 1000.0
Saved DataFrame to Training_df_files/dataframe_d1_dgp_Smooth.pickle
Best lambda: 1000.0
Best lambda: 1000.0
Best lambda: 10.0
Best lambda

In [7]:
## GRID OF PLOTS 1: TRAIN TIME VS SAMPLE SIZE FOR ALL DGPS AND D SIZES
import pandas as pd
import altair as alt
import pickle

# Index file holding the list of pickled DataFrame paths (written by the trial-running cell)
ALL_DF_FILE_NAMES = "Training_df_files/all_file_names.pickle"

reshaped_data = []
# These two lists must match the loop order used when the DataFrames were produced:
# outer loop over d, inner loop over DGP. The mapping below is purely positional.
d_sizes = [1, 3, 5] 
dgp_types = ['Smooth', 'Jump', 'Sinusoidal']  # The order of DGP types for each 'd'

# --------------------------------------------------------------------------------- # 
# if data_frames is not defined (new kernel), load file names from ALL_DF_FILE_NAMES
# (at cell top level locals() is the notebook's global namespace)
# --------------------------------------------------------------------------------- # 
if 'data_frames' not in locals():
    # Load the file names from the pickle file
    # NOTE(review): pickle.load / pd.read_pickle execute arbitrary code from the file;
    # only use them on files produced by this notebook.
    with open(ALL_DF_FILE_NAMES, "rb") as f:
        all_file_names = pickle.load(f)
    # Load the dataframes from the pickle files
    data_frames = [pd.read_pickle(file_name) for file_name in all_file_names]
# --------------------------------------------------------------------------------- # 

## RESHAPING DATAFRAMES FOR PLOTTING
# Assuming data_frames is a list of 9 dataframes in the order mentioned
# (more than len(d_sizes) * len(dgp_types) frames would make d_sizes[d_index] raise IndexError)
for i, data in enumerate(data_frames):
    # Calculate the index for d_sizes and dgp_types
    d_index = i // len(dgp_types)
    dgp_index = i % len(dgp_types)

    # Aggregate results by sample size and method:
    # one row per (Sample Size, Method) with the mean and std of 'training_time'
    aggregated_df = data.groupby(['Sample Size', 'Method']).agg(
        mean_training_time=pd.NamedAgg(column='training_time', aggfunc='mean'),
        std_training_time=pd.NamedAgg(column='training_time', aggfunc='std')
    ).reset_index()

    # Add 'd' and 'dgp' columns (constant within each aggregated frame)
    aggregated_df['d'] = d_sizes[d_index]
    aggregated_df['dgp_type'] = dgp_types[dgp_index]

    reshaped_data.append(aggregated_df)

# Build one training-time line chart from an aggregated DataFrame
def create_plot_from_df(df, d, dgp_type):
    """Return an Altair line chart of mean training time versus sample size.

    Parameters
    ----------
    df : aggregated frame handed to ``alt.Chart``; the encodings reference the
        columns 'Sample Size', 'Method', 'mean_training_time' and
        'std_training_time'.
    d : value interpolated into the y-axis title.
    dgp_type : value interpolated into the chart title.

    Returns
    -------
    The chart produced by ``alt.Chart(df).mark_line(...).encode(...).properties(...)``.
    """
    y_axis = alt.Y('mean_training_time:Q', title=f"Mean Training Time (s), d={d}")
    hover_fields = ['Sample Size', 'Method', 'mean_training_time', 'std_training_time']

    # One line (with point markers) per method
    lines = alt.Chart(df).mark_line(point=True)
    encoded = lines.encode(
        x='Sample Size:Q',
        y=y_axis,
        color='Method:N',
        tooltip=hover_fields,
    )
    return encoded.properties(title=f"DGP: {dgp_type}", width=400, height=200)

# Lay a flat list of charts out as a grid
def arrange_plots_in_grid(plots, num_cols=3, num_rows=3):
    """Concatenate charts into rows of ``num_cols`` and stack the rows vertically.

    ``plots`` is consumed in order, ``num_cols`` charts per row (the last row may
    be shorter). ``num_rows`` is accepted but not used: the number of rows
    follows from ``len(plots)`` and ``num_cols``.
    """
    chart_rows = []
    for start in range(0, len(plots), num_cols):
        # Horizontal strip holding the next `num_cols` charts
        chart_rows.append(alt.hconcat(*plots[start:start + num_cols]))
    # Stack the strips on top of each other
    return alt.vconcat(*chart_rows)

# Generate all individual line plots, now using the 'd' and 'dgp_type' directly from the dataframes.
# Both columns are constant within each aggregated frame, so the first row (.iloc[0]) is representative.
line_plots = [create_plot_from_df(df, df['d'].iloc[0], df['dgp_type'].iloc[0]) for df in reshaped_data]

# Arrange the line plots into a grid with 3 charts per row (3x3 when there are 9 frames)
line_grid_chart = arrange_plots_in_grid(line_plots, num_cols=3)

# Display the grid chart
line_grid_chart.display()

In [8]:
import pandas as pd
import altair as alt

# Assuming data_frames is a list of 9 dataframes in the order mentioned.
# NOTE: this cell reads `data_frames`, `d_sizes` and `dgp_types` from the kernel;
# it does not define them itself, so the training-time grid cell must run first.
reshaped_data_mse = []
for i, data in enumerate(data_frames):
    # Calculate the index for d_sizes and dgp_types (positional mapping: d outer, DGP inner)
    d_index = i // len(dgp_types)
    dgp_index = i % len(dgp_types)

    # Aggregate results by sample size and method for MSE:
    # one row per (Sample Size, Method) with the mean and std of the 'MSE' column
    aggregated_df_mse = data.groupby(['Sample Size', 'Method']).agg(
        mean_mse=pd.NamedAgg(column='MSE', aggfunc='mean'),
        std_mse=pd.NamedAgg(column='MSE', aggfunc='std')
    ).reset_index()

    # Add 'd' and 'dgp' columns (constant within each aggregated frame)
    aggregated_df_mse['d'] = d_sizes[d_index]
    aggregated_df_mse['dgp_type'] = dgp_types[dgp_index]

    reshaped_data_mse.append(aggregated_df_mse)

def create_mse_plot_from_df(df, d, dgp_type):
    """Return an Altair line chart of mean MSE versus sample size.

    Parameters
    ----------
    df : aggregated frame handed to ``alt.Chart``; the encodings reference the
        columns 'Sample Size', 'Method', 'mean_mse' and 'std_mse'.
    d : value interpolated into the y-axis title.
    dgp_type : value interpolated into the chart title.

    Returns
    -------
    The chart produced by ``alt.Chart(df).mark_line(...).encode(...).properties(...)``.
    """
    y_axis = alt.Y('mean_mse:Q', title=f"Mean MSE, d={d}")
    hover_fields = ['Sample Size', 'Method', 'mean_mse', 'std_mse']

    # One line (with point markers) per method
    lines = alt.Chart(df).mark_line(point=True)
    encoded = lines.encode(
        x='Sample Size:Q',
        y=y_axis,
        color='Method:N',
        tooltip=hover_fields,
    )
    return encoded.properties(title=f"DGP: {dgp_type}", width=400, height=200)

def arrange_plots_in_grid(plots, num_cols=3, title_color=None):
    """Arrange charts in a grid and apply shared axis/legend styling.

    Parameters
    ----------
    plots : sequence of charts, consumed in order, ``num_cols`` per row
        (the last row may be shorter).
    num_cols : number of charts per row.
    title_color : optional colour for chart titles. When ``None`` (default) the
        title colour is left at Altair's default. Previously the title colour was
        forced to 'white' while the matching dark background was commented out,
        which leaves titles white on the default white background.

    Returns
    -------
    The vertically concatenated chart with ``gridWidth=0.5`` on the axes and
    enlarged legend fonts.

    Note
    ----
    This definition replaces any earlier ``arrange_plots_in_grid`` defined in the
    notebook; it does not accept a ``num_rows`` argument.
    """
    rows = [alt.hconcat(*plots[i:i + num_cols]) for i in range(0, len(plots), num_cols)]
    grid = alt.vconcat(*rows).configure_axis(
        gridWidth=0.5
    ).configure_legend(
        labelFontSize=14,
        titleFontSize=16
    )
    # Only override the title colour when the caller asks for it (e.g. 'white'
    # together with a dark background configured by the caller).
    if title_color is not None:
        grid = grid.configure_title(color=title_color)
    return grid

# Generate all individual MSE line plots.
# 'd' and 'dgp_type' are constant within each aggregated frame, so the first row is representative.
mse_line_plots = [create_mse_plot_from_df(df, df['d'].iloc[0], df['dgp_type'].iloc[0]) for df in reshaped_data_mse]

# Arrange the MSE line plots into a grid with 3 charts per row (3x3 when there are 9 frames)
mse_line_grid_chart = arrange_plots_in_grid(mse_line_plots, num_cols=3)

# Display the grid chart for MSE
mse_line_grid_chart.display()


In [1]:
# import all methods again
from highly_adaptive_lasso import HAL
from highly_adaptive_ridge import HAR
from kernel_har import KernelHAR

import numpy as np
import pandas as pd 
from run_trials import RunTrials
import warnings
from train_time_plotter import TrainTimePlotter

# Suppress warnings
# NOTE(review): this silences every warning category for the whole kernel session.
warnings.filterwarnings("ignore")

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from data_generators import SmoothDataGenerator, JumpDataGenerator, SinusoidalDataGenerator

# Quick timing check: fit HAR once on synthetic data and report the elapsed time.
# NOTE: `HAR` is not imported in this cell; it comes from an earlier import cell.
n = 1000
d = 3
X, Y = SmoothDataGenerator.generate_data(n, d)
# 80/20 split with a fixed seed; only the training part is used below
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Train and record train time
method = HAR()
# fit the model and record training time (wall-clock seconds around fit())
import time
start_time = time.time()
method.fit(X_train, Y_train)
training_time = time.time() - start_time

# print the training time for method
print(f"Training time for HAR: {training_time}")



Training time for HAR: 0.5682401657104492


In [1]:
# import all methods again
from highly_adaptive_lasso import HAL
from highly_adaptive_ridge import HAR
from kernel_har import KernelHAR
import numpy as np
import pandas as pd 
from run_trials import RunTrials
import warnings
from train_time_plotter import TrainTimePlotter
from sklearn.model_selection import train_test_split

# Try testing KernelHAR on some real data
# Load the data and split into X and Y
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
data = pd.read_csv("/Users/alexhagemeister/Downloads/csv/kin8nm.csv")

# Check for missing values (rows with any missing value are dropped)
if data.isnull().values.any():
    print("Data contains missing values. Please handle them before training.")
    data = data.dropna()

# Ensure all data are numeric (only reported; execution continues either way)
if not all(np.issubdtype(dtype, np.number) for dtype in data.dtypes):
    print("Data contains non-numeric values. Please convert them to numeric before training.")

# Last column is the target
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Train and record train time
method = KernelHAR()
# fit the model and record training time
import time
start_time = time.time()
method.fit(X_train, Y_train)
training_time = time.time() - start_time

# Print the training time, labelled with the class that was actually fitted
# (the label used to be hard-coded as "HAR" although the model is KernelHAR).
print(f"Training time for {type(method).__name__}: {training_time}")

#---- PLOTTING ----#

print("lambdas: ", method.lambdas)
print("CV MSEs: ", method.cv_mses)

# plot cv mse as function of lambda using altair
import altair as alt
import pandas as pd

# Create a DataFrame from the cross-validated MSEs
df = pd.DataFrame({'Lambda': method.lambdas, 'CV MSE': method.cv_mses})

# Create a line plot of CV MSE as a function of lambda
line_chart = alt.Chart(df).mark_line(point=True).encode(
    x='Lambda:Q',
    y='CV MSE:Q'
).properties(
    title='Cross-Validated MSE as a Function of Lambda',
    width=400,
    height=200
)

# Display the line chart
line_chart.display()

computing kernel matrix for K_train for fold:  [   0    1    2 ... 6550 6551 6552]


: 

In [None]:
# Inspect the regularisation path of the fitted model.
# NOTE: `method` must already exist in the kernel with `lambdas` and `cv_mses`
# attributes; this cell does not create or fit it.
print("lambdas: ", method.lambdas)
print("CV MSEs: ", method.cv_mses)

# plot cv mse as function of lambda using altair
import altair as alt
import pandas as pd

# Create a DataFrame from the cross-validated MSEs (one row per lambda).
# NOTE: this rebinds the kernel-global name `df`.
df = pd.DataFrame({'Lambda': method.lambdas, 'CV MSE': method.cv_mses})

# Create a line plot of CV MSE as a function of lambda
line_chart = alt.Chart(df).mark_line(point=True).encode(
    x='Lambda:Q',
    y='CV MSE:Q'
).properties(
    title='Cross-Validated MSE as a Function of Lambda',
    width=400,
    height=200
)

# Display the line chart
line_chart.display()

lambdas:  [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
CV MSEs:  [ 1.31914686  1.32847902  1.4203525   2.18205082  6.26085194 22.0048786 ]


In [12]:
# Largest entry of `method.kernel_matrix` (quick check of the kernel's scale);
# `method` is whatever model object is currently held by the kernel.
np.max(method.kernel_matrix)

11040

In [20]:
# import all methods again
from highly_adaptive_lasso import HAL
from highly_adaptive_ridge import HAR
from kernel_har import KernelHAR
import numpy as np
import pandas as pd 
from run_trials import RunTrials
import warnings
from train_time_plotter import TrainTimePlotter
from sklearn.model_selection import train_test_split


# Compare the kernel implied by HAR's explicit basis expansion with the one
# computed directly by KernelHAR, on a small subset of kin8nm.
# NOTE(review): absolute, machine-specific path.
data = pd.read_csv("/Users/alexhagemeister/Downloads/csv/kin8nm.csv")

# Last column is the target
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

# get the first 200 samples

X = X[:200]
Y = Y[:200]

# split into training (X) and validation (X') sets
X_train, X_prime, Y_train, Y_prime = train_test_split(X, Y, test_size=0.2, random_state=42)

# Explicit route: set the knots by hand, build the basis matrix, take its Gram matrix.
# (Uses the private helper `_bases` and bypasses `fit`.)
har = HAR()
har.knots = X_train
basis_matrix = har._bases(X_train)

K_har = basis_matrix @ basis_matrix.T

# Kernel route: same knots, kernel matrix from the private helper `_compute_kernel_matrix`.
khar = KernelHAR()
khar.knots = X_train
K_khar = khar._compute_kernel_matrix(X_train, X_train)

# check that K_har and K_khar are the same: the difference should be all zeros.
# NOTE(review): the recorded output shows large non-zero entries, i.e. the two
# matrices did NOT agree in that run; np.allclose(K_har, K_khar) would give a yes/no answer.
K_har - K_khar

built comparison matrix


array([[-3073, -1246,  -926, ..., -1285, -1539, -1121],
       [-1246, -2448, -1157, ...,  -860, -1567, -1047],
       [ -926, -1157, -2371, ...,  -723, -1643,  -763],
       ...,
       [-1285,  -860,  -723, ..., -3791, -1038, -1755],
       [-1539, -1567, -1643, ..., -1038, -6205, -1782],
       [-1121, -1047,  -763, ..., -1755, -1782, -3441]])

In [21]:
# For every feature column j, build the boolean matrix whose [i, k] entry is
# X[i, j] < X_prime[k, j], then stack the per-feature matrices on a new leading axis.
# Resulting shape: (n_features, len(X), len(X_prime)).
one_way_bases = np.stack([
    np.less.outer(X[:,j], X_prime[:,j])
    for j in range(X.shape[1])
])
one_way_bases.shape

(8, 200, 40)

In [22]:
# Shapes of the 200-row subset and of the held-out split taken from it
X.shape, X_prime.shape

((200, 8), (40, 8))

In [23]:
# Shape of the HAR basis matrix built from X_train (rows = training points)
basis_matrix.shape

(160, 40800)

In [33]:
# Number of rows in X times the number of non-empty subsets of its columns (2**p - 1).
# NOTE(review): presumably the basis-function count if every row of X were a knot;
# compare with the column count of basis_matrix, which was built from X_train only.
X.shape[0]* (2**(X.shape[1])-1)

51000

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import StandardScaler
# from tabulate import tabulate
import time

# Benchmark table: one row per dataset. Each method column holds a string of the
# form "<score> (<time>)"; '–' marks a result that is not available.
# The HAR and KernelHAR columns start empty and are filled in by later cells.
data = {
    'data': ['yacht', 'energy', 'boston', 'concrete', 'wine', 'power', 'kin8nm', 'naval', 'protein', 'blog', 'slice', 'yearmsd'],
    'p': [6, 8, 13, 8, 11, 4, 8, 17, 9, 280, 384, 90],
    'n': [308, 768, 506, 1030, 1599, 9568, 8192, 11934, 45730, 52397, 53500, 515345],
    'LTB': ['0.90 (10.69)', '0.40 (30.82)', '3.35 (5.92)', '4.70 (43.29)', '0.64 (6.01)', '3.41 (56.92)', '0.12 (96.50)', '0.00 (107.98)', '1.94 (611.38)', '23.49 (185.49)', '1.23 (3350.61)', '8.54 (4616.82)'],
    'GBT': ['0.90 (4.68)', '0.40 (21.46)', '3.43 (4.47)', '4.87 (37.15)', '0.63 (3.58)', '3.46 (28.88)', '0.10 (60.40)', '0.00 (56.19)', '1.94 (96.80)', '23.46 (9.90)', '1.24 (3067.95)', '8.54 (1543.05)'],
    'HAL': ['0.72 (0.92)', '0.43 (45.80)', '3.66 (916.61)', '4.02 (134.01)', '–', '–', '–', '–', '–', '–', '–', '–'],
    'LASSO': ['8.92 (0.01)', '4.14 (0.01)', '5.02 (0.01)', '10.40 (0.01)', '0.67 (0.03)', '4.59 (0.04)', '0.21 (0.03)', '0.01 (10.51)', '2.50 (0.33)', '28.25 (13.51)', '8.33 (121.71)', '9.49 (40.09)'],
    'HAR': ['–'] * 12,  # Initialize with '–'
    'KernelHAR': ['–'] * 12  # Initialize with '–'
}

# Create DataFrame
table_df = pd.DataFrame(data)

# Pickle the table as 'models_compare_table.pickle'.
# This used to call `df.to_pickle(...)`, which saved whatever unrelated `df` was
# left in the kernel by an earlier cell (or raised NameError on a fresh kernel)
# instead of the table built above.
table_file_name = "models_compare_table.pickle"
table_df.to_pickle(table_file_name)

# Display the DataFrame
table_df

Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),–,–
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),–,–
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),–,–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),–,–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),–,–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),–,–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython.display import display

def update_results(df, dataset_name, method_name, rmse, training_time):
    """Write one "<rmse> (<training_time>)" entry into the results table, in place.

    The row is the first one whose 'data' column equals ``dataset_name``; the
    column is ``method_name``. Both numbers are formatted with two decimals.
    Raises IndexError when no row matches ``dataset_name``.
    """
    row_mask = (df['data'] == dataset_name).to_numpy()
    # First matching index label; indexing an empty result raises IndexError
    row_label = df.index[row_mask][0]
    cell_text = f"{rmse:.2f} ({training_time:.2f})"
    df.at[row_label, method_name] = cell_text

# Fit a model, time the fit, and score it on held-out data
def train_and_evaluate(method, X_train, Y_train, X_test, Y_test):
    """Fit ``method`` on the training data and return ``(rmse, training_time)``.

    Parameters
    ----------
    method : object exposing ``fit(X, Y)`` and ``predict(X)``.
    X_train, Y_train : data passed to ``fit``.
    X_test, Y_test : data used for the RMSE.

    Returns
    -------
    rmse : square root of ``mean_squared_error(Y_test, predictions)``.
    training_time : seconds spent inside ``method.fit`` only (prediction is not timed).
    """
    # Imported here because the cell defining this function does not import `time`;
    # perf_counter is a monotonic, high-resolution clock meant for measuring durations.
    from time import perf_counter

    start_time = perf_counter()
    method.fit(X_train, Y_train)
    training_time = perf_counter() - start_time

    predictions = method.predict(X_test)
    mse = mean_squared_error(Y_test, predictions)
    rmse = np.sqrt(mse)
    return rmse, training_time

# Function to read a CSV file and return the feature matrix X and target vector Y
def read_data(file_path):
    """Load a CSV file and split it into features and target.

    Rows containing any missing value are dropped (after printing a notice).
    Non-numeric columns are only reported; they are not converted or removed.

    Returns
    -------
    (X, Y) : ``X`` holds every column except the last as an array, ``Y`` holds
    the last column.
    """
    frame = pd.read_csv(file_path)

    # Missing values: report, then drop the affected rows
    has_missing = frame.isnull().values.any()
    if has_missing:
        print("Data contains missing values. Please handle them before training.")
        frame = frame.dropna()

    # Non-numeric columns: report only
    if any(not np.issubdtype(col_dtype, np.number) for col_dtype in frame.dtypes):
        print("Data contains non-numeric values. Please convert them to numeric before training.")

    # By convention the target is the last column
    features = frame.iloc[:, :-1].values
    target = frame.iloc[:, -1].values
    return features, target

def run_real_data_trials(dataset_name, method, num_trials=1,
                         data_dir="/Users/alexhagemeister/Downloads/csv"):
    """
    Run trials on a real dataset using the specified method.
    PARAM: 
        dataset_name: str - name of the dataset; the file read is
            ``<data_dir>/<dataset_name>.csv``
        method: object - the method to use (must provide fit/predict)
        num_trials: int - number of fit/evaluate repetitions (must be >= 1)
        data_dir: str - directory containing the CSV files; defaults to the
            location the notebook was originally run from
    RETURN:
        average_rmse: float - the average RMSE over all trials
        average_training_time: float - the average training time over all trials
    RAISES:
        ValueError - if num_trials is smaller than 1 (the averages would be NaN)
    """
    # Imported here because the cell defining this function does not import `os`.
    import os

    if num_trials < 1:
        raise ValueError(f"num_trials must be at least 1, got {num_trials}")

    # Load the data and split into X and Y
    X, Y = read_data(os.path.join(data_dir, f"{dataset_name}.csv"))
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Run trials
    # NOTE(review): every trial refits the same `method` object on the same fixed
    # split, so repetitions mainly average the timing, not the data variability.
    rmses = []
    training_times = []

    for _ in range(num_trials):
        rmse, training_time = train_and_evaluate(method, X_train, Y_train, X_test, Y_test)
        rmses.append(rmse)
        training_times.append(training_time)

    # Calculate the average RMSE and training time
    average_rmse = np.mean(rmses)
    average_training_time = np.mean(training_times)

    return average_rmse, average_training_time



In [6]:
from highly_adaptive_lasso import HAL
from highly_adaptive_ridge import HAR
from kernel_har import KernelHAR
import numpy as np
import pandas as pd
from IPython.display import display
import time
import multiprocessing
import os

# Fill the 'HAR' column of the comparison table, one dataset at a time.
# NOTE: relies on `run_real_data_trials` and `update_results` having been defined
# by an earlier cell in this kernel.
table_file_name = "models_compare_table.pickle"
table_df = pd.read_pickle(table_file_name)

for dataset_name in table_df['data']:
    # Fresh model per dataset; every dataset is recomputed, including ones that
    # already have a result stored in the table.
    method = HAR()
    rmse, training_time = run_real_data_trials(dataset_name, method)
    # `method.name` selects the table column to write
    update_results(table_df, dataset_name, method.name, rmse, training_time)

    # Save the updated DataFrame (after every dataset, so partial progress survives a crash)
    table_df.to_pickle(table_file_name)

    # Display the updated DataFrame
    display(table_df)



Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),–
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),–,–
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),–,–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),–,–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),–,–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),–,–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),–
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),–
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),–,–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),–,–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),–,–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),–,–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),–
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),–
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),–,–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),–,–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),–,–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),–
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),–
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),3.96 (3.69),–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),–,–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),–,–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),–
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),–
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),3.96 (3.69),–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),0.59 (107.46),–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),–,–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),–
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),–
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),3.96 (3.69),–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),0.59 (107.46),–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),3.34 (83.11),–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


: 

In [4]:
from kernel_har import KernelHAR
import numpy as np
import pandas as pd
from IPython.display import display
import time
import multiprocessing
import os

# Fill the 'KernelHAR' column of the comparison table, one dataset at a time.
# NOTE: relies on `run_real_data_trials` and `update_results` having been defined
# by an earlier cell in this kernel.
table_file_name = "models_compare_table.pickle"
table_df = pd.read_pickle(table_file_name)

for dataset_name in table_df['data']:
    # Fresh model per dataset; every dataset is recomputed, including ones that
    # already have a result stored in the table.
    method = KernelHAR()
    rmse, training_time = run_real_data_trials(dataset_name, method)
    # `method.name` selects the table column to write
    update_results(table_df, dataset_name, method.name, rmse, training_time)

    # Save the updated DataFrame (after every dataset, so partial progress survives a crash)
    table_df.to_pickle(table_file_name)

    # Display the updated DataFrame
    display(table_df)

Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),0.30 (9.80)
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),–
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),3.96 (3.69),–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),0.59 (107.46),–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),3.34 (83.11),–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),0.30 (9.80)
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),0.39 (30.21)
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),–
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),3.96 (3.69),–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),0.59 (107.46),–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),3.34 (83.11),–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),0.30 (9.80)
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),0.39 (30.21)
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),3.39 (15.92)
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),3.96 (3.69),–
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),0.59 (107.46),–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),3.34 (83.11),–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),0.30 (9.80)
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),0.39 (30.21)
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),3.39 (15.92)
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),3.96 (3.69),3.88 (70.02)
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),0.59 (107.46),–
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),3.34 (83.11),–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


Unnamed: 0,data,p,n,LTB,GBT,HAL,LASSO,HAR,KernelHAR
0,yacht,6,308,0.90 (10.69),0.90 (4.68),0.72 (0.92),8.92 (0.01),0.42 (0.49),0.30 (9.80)
1,energy,8,768,0.40 (30.82),0.40 (21.46),0.43 (45.80),4.14 (0.01),0.43 (1.86),0.39 (30.21)
2,boston,13,506,3.35 (5.92),3.43 (4.47),3.66 (916.61),5.02 (0.01),3.79 (49.22),3.39 (15.92)
3,concrete,8,1030,4.70 (43.29),4.87 (37.15),4.02 (134.01),10.40 (0.01),3.96 (3.69),3.88 (70.02)
4,wine,11,1599,0.64 (6.01),0.63 (3.58),–,0.67 (0.03),0.59 (107.46),0.60 (520.56)
5,power,4,9568,3.41 (56.92),3.46 (28.88),–,4.59 (0.04),3.34 (83.11),–
6,kin8nm,8,8192,0.12 (96.50),0.10 (60.40),–,0.21 (0.03),–,–
7,naval,17,11934,0.00 (107.98),0.00 (56.19),–,0.01 (10.51),–,–
8,protein,9,45730,1.94 (611.38),1.94 (96.80),–,2.50 (0.33),–,–
9,blog,280,52397,23.49 (185.49),23.46 (9.90),–,28.25 (13.51),–,–


: 