
# Progressive MLP Growth Experiment

This notebook reuses the `test_mlp.py` training utilities to explore a growing MLP. We start with a single hidden layer of eight neurons, train on a prefix of the DA-MH chain that grows by `TRAIN_CHUNK` steps per iteration (2000 in this base configuration), and use the following `VAL_CHUNK` steps (also 2000) for validation-driven architecture growth.



## Notebook Outline
1. Configure paths, hyperparameters, activation, and growth heuristics.
2. Load datasets and helpers from `test_mlp.py`.
3. Train the base model on the current training window and record the validation L1 error of the unnormalized log-posterior.
4. Enumerate the candidate enlargements (doubling the narrowest hidden layer, or inserting a layer at the cheapest position), retrain each, and log their validation errors.
5. Display a results table every search step and accept the best-performing architecture (only if it improves by at least 5%).
6. Repeat and log progress to CSV, including a "master" validation metric computed on the chain tail beyond step 50,000.


In [1]:

import math
import numpy as np
import torch
import pandas as pd

from copy import deepcopy

from test_mlp import (
    MLP,
    load_data,
    collect_training_indices,
    standardize_features,
    train_mlp,
    log_posterior_unnorm_numpy,
    unique_preserve_order,
)


## Configuration

In [2]:
# Variant notes (diffs vs this base config):
# Base notebook logs results to mlp_growth_progress6.csv.
# - mlp_growth_experiment copy: extends ADAM/FINE_TUNE epochs to 2000, LBFGS steps to 50, writes mlp_growth_progress4.csv.
# - mlp_growth_experiment2: lowers initial/fine-tune learning rates to 1e-3, records mlp_growth_progress7.csv.
# - mlp_growth_experiment3: uses lr 1e-2, 1000 ADAM epochs, 100 LBFGS steps, 2% candidate threshold, outputs mlp_growth_progress10.csv.
# - mlp_growth_experiment4: same as experiment3 but trains with MSE loss, saves mlp_growth_progress11.csv.
# - mlp_growth_experiment5: shrinks train/val chunks to 500 samples with experiment3 schedule and 2% gate, logs mlp_growth_progress12.csv.
# - mlp_growth_experiment6: matches experiment5 but with lr 1e-3 and no reinitialization, logging mlp_growth_progress13.csv.
# - mlp_growth_experiment7: 500-sample chunks, lr 1e-2, ADAM patience 20, batch growth 2x, 2% loop tolerance, 10% gate, no reinit, writes mlp_growth_progress14.csv.
# - mlp_growth_experiment_new: same windows with lr 1e-3, quieter logging (verbose=1) and higher growth noise (3e-1), saving mlp_growth_progress_new.csv.


# --- Data file and posterior scales (passed to load_data, train_mlp and
# --- log_posterior_unnorm_numpy further down) ---
DATA_PATH = 'data1.h5'
SIGMA_PRIOR = 1.0
SIGMA_LIK = 0.3

# --- Chain windows ---
TRAIN_CHUNK = 2000  # initial train_limit and its per-iteration increment
VAL_CHUNK = 2000  # number of chain steps right after train_limit used for validation
MAX_CHAIN_USED = 50000  # outer loop stops once train_limit + VAL_CHUNK exceeds this
MASTER_VAL_START = 50000  # tail region for global validation metrics

USE_STANDARDIZATION = False  # when True, inputs are standardized before being fed to the model
ACTIVATION = 'tanh'  # forwarded to MLP(...) and to insert_identity()

# --- Base-model training: forwarded as keyword arguments to train_mlp ---
ADAM_LR = 1e-1
ADAM_EPOCHS = 500
ADAM_PATIENCE = 100
LBFGS_STEPS = 20
TRAIN_LOOPS = 80
TRAIN_LOSS = 'l1'
LOSS_DOMAIN = 'obs'
BATCH_SIZE = 32
BATCH_GROWTH = 1.2
TRAIN_VERBOSE = 2
LOOP_IMPROVEMENT_PCT = 0.1

# --- Candidate (grown architecture) training: forwarded to train_mlp ---
FINE_TUNE_LR = 1e-1
FINE_TUNE_ADAM_EPOCHS = 500
FINE_TUNE_ADAM_PATIENCE = 100
FINE_TUNE_LBFGS_STEPS = 20
FINE_TUNE_LOOPS = 80

# --- Growth heuristics ---
GROWTH_NOISE_STD = 1e-1  # noise scale handed to duplicate_units() / insert_identity()
CANDIDATE_IMPROVEMENT_PCT = 5.0  # required relative drop (percent of current val error) to accept a candidate
IMPROVEMENT_TOL = 1e-5  # absolute slack added on top of the required drop
RESULTS_CSV = 'mlp_growth_progress6.csv'  # per-iteration records are rewritten here after every iteration

# --- Reproducibility and device ---
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# When True, model.reinitialize() is called before training on each new chunk /
# before training each candidate. NOTE(review): for candidates this presumably
# discards the warm-start weights prepared by build_candidate() - confirm
# against MLP.reinitialize in test_mlp.py.
REINIT_BEFORE_CHUNK = True
REINIT_BEFORE_CANDIDATE = True


## Helper Functions

In [3]:

def linear_layers(model: MLP):
    """Return the ``torch.nn.Linear`` modules found in ``model.net``, in iteration order.

    Non-linear modules (e.g. activations) in ``model.net`` are skipped.
    """
    found = []
    for module in model.net:
        if isinstance(module, torch.nn.Linear):
            found.append(module)
    return found


def clone_with_hidden(hidden_sizes):
    """Construct a new ``MLP`` with the given hidden-layer widths.

    The input/output dimensions and the activation come from the notebook-level
    ``input_dim``, ``output_dim`` and ``ACTIVATION``. No weights are copied here;
    the returned model carries whatever initialization ``MLP`` applies.
    """
    fresh_model = MLP(
        input_dim,
        hidden_sizes,
        output_dim,
        activation=ACTIVATION,
    )
    return fresh_model


def copy_linear_weights(dst: MLP, src: MLP, skip_new_idx: int | None = None) -> None:
    """Copy overlapping weights/biases from ``src``'s linear layers into ``dst``'s.

    Linear layers are paired in order. If ``skip_new_idx`` is given, the ``dst``
    linear layer at that position is left untouched and does not consume a
    ``src`` layer. For every pair only the top-left block that fits both shapes
    is copied; the remaining entries of the ``dst`` layer keep their values.

    Args:
        dst: model that receives the weights (modified in place).
        src: model that provides the weights.
        skip_new_idx: index (among ``dst``'s linear layers) to skip, or ``None``.
    """
    receivers = [
        layer
        for position, layer in enumerate(linear_layers(dst))
        if skip_new_idx is None or position != skip_new_idx
    ]
    with torch.no_grad():
        # zip() stops at the shorter sequence, so surplus dst layers are left as-is.
        for target, source in zip(receivers, linear_layers(src)):
            n_out = min(target.out_features, source.out_features)
            n_in = min(target.in_features, source.in_features)
            target.weight[:n_out, :n_in] = source.weight[:n_out, :n_in]
            target.bias[:n_out] = source.bias[:n_out]


def duplicate_units(dst: MLP, src: MLP, idx: int, noise_std: float) -> None:
    """Initialise a widened linear layer of ``dst`` from layer ``idx`` of ``src``.

    The ``src`` layer's rows are written twice into the ``dst`` layer (rows
    ``[0, n)`` and ``[n, 2n)``), Gaussian noise scaled by ``noise_std`` is added
    to every ``dst`` row from ``n`` onwards, and - if a following linear layer
    exists - its incoming weights are set to half of the ``src`` weights for
    both copies while its bias is copied unchanged.

    Args:
        dst: widened model, modified in place.
        src: model providing the original weights.
        idx: index of the widened layer among the linear layers.
        noise_std: standard deviation multiplier for the added noise.
    """
    dst_layers = linear_layers(dst)
    src_layers = linear_layers(src)
    with torch.no_grad():
        narrow = src_layers[idx]
        wide = dst_layers[idx]
        n_units = narrow.out_features
        n_inputs = narrow.in_features
        originals = slice(0, n_units)
        copies = slice(n_units, 2 * n_units)
        appended = slice(n_units, None)

        # Write the source rows into both halves (weight first, then bias).
        for rows in (originals, copies):
            wide.weight[rows, :n_inputs] = narrow.weight
            wide.bias[rows] = narrow.bias

        # Perturb everything past the original rows; weight noise is drawn before bias noise.
        wide.weight[appended, :] += noise_std * torch.randn_like(wide.weight[appended, :])
        wide.bias[appended] += noise_std * torch.randn_like(wide.bias[appended])

        if idx + 1 >= len(dst_layers):
            return
        following_dst = dst_layers[idx + 1]
        following_src = src_layers[idx + 1]
        # Each duplicated unit feeds the next layer with half of the original weight.
        following_dst.weight[:, originals] = following_src.weight / 2.0
        following_dst.weight[:, copies] = following_src.weight / 2.0
        following_dst.bias.copy_(following_src.bias)


def insert_identity(dst: MLP, width: int, idx: int, activation: str, noise_std: float) -> None:
    """Overwrite linear layer ``idx`` of ``dst`` with a noisy scaled identity.

    The layer's weight and bias are zeroed, then the leading square block of
    side ``min(width, out_features, in_features)`` is set to ``gain * I`` plus
    Gaussian noise scaled by ``noise_std``. ``gain`` is ``1 / tanh(1)`` when
    ``activation == 'tanh'`` and ``1`` otherwise.
    NOTE(review): the tanh gain presumably compensates for the squashing of the
    activation - confirm the intended rationale.

    Args:
        dst: model modified in place.
        width: requested side of the identity block (clipped to the layer shape).
        idx: index of the target layer among the linear layers.
        activation: activation name; only ``'tanh'`` changes the gain.
        noise_std: standard deviation multiplier for the added noise.
    """
    layer = linear_layers(dst)[idx]
    gain = 1.0 / math.tanh(1.0) if activation == 'tanh' else 1.0
    with torch.no_grad():
        layer.weight.zero_()
        layer.bias.zero_()
        side = min(width, layer.out_features, layer.in_features)
        layer.weight[:side, :side] = gain * torch.eye(side)
        layer.weight[:side, :side] += noise_std * torch.randn_like(layer.weight[:side, :side])


def build_candidate(src_model: MLP, hidden_sizes: list[int], op: str, idx: int) -> tuple[list[int], MLP]:
    """Build a grown architecture warm-started from ``src_model``.

    Two operations are supported:

    * ``op == 'double'``: hidden layer ``idx`` gets twice as many units; the
      widened layer is initialised by ``duplicate_units``.
    * any other ``op`` (used as ``'insert'``): a new hidden layer is inserted at
      position ``idx``. Its width is ``hidden_sizes[0]`` for ``idx == 0``,
      otherwise the width of the hidden layer preceding the insertion point.

    For an insertion, one linear layer of the candidate is initialised as a
    (noisy, scaled) identity and every other linear layer is copied from
    ``src_model`` in order. The identity must sit on the *square* layer so the
    copied layers keep their full weight matrices:

    * ``idx > 0``: linear layer ``idx`` maps ``hidden_sizes[idx - 1] -> width``
      and is square.
    * ``idx == 0``: linear layer 0 maps ``input_dim -> width`` and is generally
      not square. Putting the identity there (as the previous implementation
      did) truncated the input to its first ``width`` features and cropped the
      old input layer into the following one. The identity is therefore placed
      on linear layer 1 (``width -> hidden_sizes[0]``), and the old input layer
      is copied intact into linear layer 0.

    The candidate is created by ``clone_with_hidden`` and is not moved to any
    device here.

    Args:
        src_model: model whose weights seed the candidate.
        hidden_sizes: hidden widths of ``src_model`` (not modified).
        op: ``'double'`` or ``'insert'``.
        idx: hidden-layer index to double, or position at which to insert.

    Returns:
        ``(new_hidden, candidate)`` - the new hidden widths and the new model.

    Raises:
        IndexError: if ``hidden_sizes`` is empty, or ``idx`` is out of range
            for a ``'double'`` operation.
    """
    new_hidden = hidden_sizes.copy()

    if op == 'double':
        new_hidden[idx] *= 2
        candidate = clone_with_hidden(new_hidden)
        copy_linear_weights(candidate, src_model, skip_new_idx=None)
        duplicate_units(candidate, src_model, idx, GROWTH_NOISE_STD)
        return new_hidden, candidate

    if idx == 0:
        width = hidden_sizes[0]
    elif idx >= len(hidden_sizes):
        width = hidden_sizes[-1]
    else:
        width = hidden_sizes[idx - 1]
    new_hidden.insert(idx, width)

    # Index (among the candidate's linear layers) of the square layer that
    # receives the identity initialisation; see the docstring for idx == 0.
    identity_idx = idx if idx > 0 else 1

    candidate = clone_with_hidden(new_hidden)
    copy_linear_weights(candidate, src_model, skip_new_idx=identity_idx)
    insert_identity(candidate, width, identity_idx, ACTIVATION, GROWTH_NOISE_STD)
    return new_hidden, candidate


def logpi_l1_error(model: MLP, start: int, length: int, x_mean, x_std) -> float:
    """Mean absolute error of the surrogate log-posterior on a chain window.

    Collects the unique sample indices appearing in ``chain`` and ``props`` over
    ``[start, start + length)``, evaluates ``model`` on the corresponding rows of
    ``par`` (standardized with ``x_mean`` / ``x_std`` only when
    ``USE_STANDARDIZATION`` is set), converts the predictions with
    ``log_posterior_unnorm_numpy`` and compares against ``logpi_true``.

    Args:
        model: network to evaluate; inputs are moved to ``DEVICE`` before the call.
        start: first chain step of the window.
        length: number of chain steps in the window.
        x_mean, x_std: standardization statistics (ignored when standardization is off).

    Returns:
        The mean absolute difference as a Python float, or ``nan`` when
        ``length <= 0``.
    """
    if length <= 0:
        return float('nan')

    window = slice(start, start + length)
    visited = np.concatenate([chain[window], props[window]])
    unique_idx = unique_preserve_order(visited)
    par_block = par[unique_idx]

    X_block = (par_block - x_mean) / x_std if USE_STANDARDIZATION else par_block
    features = torch.from_numpy(X_block.astype(np.float32)).to(DEVICE)
    with torch.no_grad():
        preds = model(features).cpu().numpy()

    logpi_pred = log_posterior_unnorm_numpy(par_block, preds, y_obs, SIGMA_PRIOR, SIGMA_LIK)
    abs_gap = np.abs(logpi_true[unique_idx] - logpi_pred)
    return float(np.mean(abs_gap))


def candidate_ops(hidden_sizes: list[int]):
    """Propose the growth operations to try for the given architecture.

    Always returns two ``(op, idx)`` tuples:

    1. ``('double', i)`` where ``i`` is the first position of the narrowest
       hidden layer (``0`` when ``hidden_sizes`` is empty).
    2. ``('insert', j)`` where ``j`` minimises ``width * (fan_in + fan_out)``
       over all insertion positions; ties go to the smallest position.

    Reads the notebook-level ``input_dim`` and ``output_dim``.
    """
    if hidden_sizes:
        narrowest = hidden_sizes.index(min(hidden_sizes))
    else:
        narrowest = 0
    ops = [('double', narrowest)]

    def insertion_cost(position: int):
        # Width of the inserted layer mirrors its predecessor (or the first
        # hidden layer / a default of 8 when inserting at the very front).
        if position == 0:
            fan_in = input_dim
            width = hidden_sizes[0] if hidden_sizes else 8
        else:
            fan_in = hidden_sizes[position - 1]
            width = hidden_sizes[position - 1]
        fan_out = hidden_sizes[position] if position < len(hidden_sizes) else output_dim
        return width * (fan_in + fan_out)

    # min() keeps the first minimum, matching a strict "<" scan from position 0.
    cheapest = min(range(len(hidden_sizes) + 1), key=insertion_cost)
    ops.append(('insert', cheapest))
    return ops


## Load Data

In [4]:

# Load everything the notebook needs in one call; the six returned objects are
# used as notebook-level globals by the helper functions above.
(
    par,
    obs,
    y_obs,
    chain,
    props,
    logpi_true,
) = load_data(DATA_PATH, SIGMA_PRIOR, SIGMA_LIK)

# Network input/output sizes follow the second axis of the parameter and
# observation arrays.
input_dim, output_dim = par.shape[1], obs.shape[1]
print('Chain length:', chain.shape[0])


[data] Loaded 'logpi' from file.
[data] par shape   : (28324, 30)
[data] obs shape   : (28324, 52)
[data] y_obs shape : (52,)
[data] chain shape : (56646,)
[data] props shape : (56646,)
Chain length: 56646


## Growth Loop

In [5]:

# Progressive growth driver.
#
# Outer loop: extend the training window by TRAIN_CHUNK chain steps, retrain the
# current architecture, and score it on the next VAL_CHUNK steps.
# Inner loop: train the candidates proposed by candidate_ops() and accept the
# best one while it lowers the validation error by more than
# CANDIDATE_IMPROVEMENT_PCT percent (plus IMPROVEMENT_TOL).
# After every outer iteration the accumulated records are written to RESULTS_CSV.
current_hidden = [8]
model = clone_with_hidden(current_hidden).to(DEVICE)
results = []
records = []
train_limit = TRAIN_CHUNK
iteration = 0

while train_limit + VAL_CHUNK <= min(MAX_CHAIN_USED, chain.shape[0]):
    iteration += 1
    train_idx = collect_training_indices(chain, props, train_limit)
    X_train_raw = par[train_idx]
    y_train = obs[train_idx]
    logpi_train = logpi_true[train_idx]

    if USE_STANDARDIZATION:
        X_train, x_mean, x_std = standardize_features(X_train_raw)
    else:
        X_train = X_train_raw
        x_mean = np.zeros(X_train_raw.shape[1], dtype=X_train_raw.dtype)
        x_std = np.ones(X_train_raw.shape[1], dtype=X_train_raw.dtype)

    # Keyword arguments shared by the base training and every candidate
    # training of this iteration, so the two calls cannot drift apart.
    shared_train_kwargs = dict(
        device=DEVICE,
        tol=1e-5,
        loss_name=TRAIN_LOSS,
        batch_size=BATCH_SIZE,
        loss_domain=LOSS_DOMAIN,
        par_train_raw=X_train_raw,
        logpi_targets=logpi_train,
        y_obs=y_obs,
        sigma_prior=SIGMA_PRIOR,
        sigma_lik=SIGMA_LIK,
        batch_growth=BATCH_GROWTH,
        verbose=TRAIN_VERBOSE,
        loop_improvement_pct=LOOP_IMPROVEMENT_PCT,
    )

    if REINIT_BEFORE_CHUNK:
        model.reinitialize()
    print()
    print(f"Iteration {iteration}: training samples [0, {train_limit}) with {train_idx.size} unique points")
    final_train_loss = train_mlp(
        model,
        X_train,
        y_train,
        max_adam_epochs=ADAM_EPOCHS,
        adam_lr=ADAM_LR,
        adam_patience=ADAM_PATIENCE,
        max_lbfgs_iter=LBFGS_STEPS,
        train_loops=TRAIN_LOOPS,
        **shared_train_kwargs,
    )

    base_error = logpi_l1_error(model, train_limit, VAL_CHUNK, x_mean, x_std)
    print(f"Base architecture {current_hidden} validation error: {base_error:.4e}")

    best_error = base_error
    improved = True
    growth_steps = []
    base_hidden = current_hidden.copy()

    # Keep growing while the best candidate clears the improvement gate.
    while improved:
        improved = False
        summary_rows = [{
            'label': 'current',
            'op': 'current',
            'idx': '-',
            'hidden': current_hidden.copy(),
            'val_error': best_error,
        }]
        candidate_records = []

        for op, idx in candidate_ops(current_hidden):
            new_hidden, candidate_model = build_candidate(model, current_hidden, op, idx)
            candidate_model.to(DEVICE)
            if REINIT_BEFORE_CANDIDATE:
                candidate_model.reinitialize()
            # Candidate training uses the FINE_TUNE_* schedule; verbosity now
            # follows TRAIN_VERBOSE (via shared_train_kwargs) instead of a
            # hard-coded level.
            train_mlp(
                candidate_model,
                X_train,
                y_train,
                max_adam_epochs=FINE_TUNE_ADAM_EPOCHS,
                adam_lr=FINE_TUNE_LR,
                adam_patience=FINE_TUNE_ADAM_PATIENCE,
                max_lbfgs_iter=FINE_TUNE_LBFGS_STEPS,
                train_loops=FINE_TUNE_LOOPS,
                fine_tune=True,
                fine_tune_adam_lr=FINE_TUNE_LR,
                fine_tune_adam_epochs=FINE_TUNE_ADAM_EPOCHS,
                fine_tune_lbfgs_steps=FINE_TUNE_LBFGS_STEPS,
                fine_tune_loops=FINE_TUNE_LOOPS,
                **shared_train_kwargs,
            )
            cand_error = logpi_l1_error(candidate_model, train_limit, VAL_CHUNK, x_mean, x_std)
            print(f"  Candidate {op}@{idx} -> {new_hidden}, val error={cand_error:.4e}")
            candidate_records.append((cand_error, new_hidden, candidate_model, op, idx))
            summary_rows.append({
                'label': 'candidate',
                'op': op,
                'idx': idx,
                'hidden': new_hidden,
                'val_error': cand_error,
            })

        display(pd.DataFrame(summary_rows))

        # Ignore candidates whose validation error is NaN/inf: ordering by a
        # NaN key is unreliable and such a candidate can never be accepted.
        finite_records = [rec for rec in candidate_records if math.isfinite(rec[0])]
        if finite_records:
            # min() returns the first minimum, i.e. the same winner a stable
            # ascending sort would put first.
            best_candidate_error, new_hidden, best_candidate_model, op, idx = min(
                finite_records, key=lambda rec: rec[0]
            )
            required_drop = best_error * (CANDIDATE_IMPROVEMENT_PCT / 100.0)
            actual_drop = best_error - best_candidate_error
            if actual_drop > required_drop + IMPROVEMENT_TOL:
                best_error = best_candidate_error
                model = best_candidate_model
                current_hidden = new_hidden
                growth_steps.append((op, idx, best_error))
                print(f"  Growth accepted: {op} at {idx}, hidden -> {current_hidden}, val error={best_error:.4e}")
                improved = True
            else:
                # Report the configured gate rather than a hard-coded "5%".
                print(f"  Candidate improvements below {CANDIDATE_IMPROVEMENT_PCT:g}% threshold; no update accepted.")
        if not improved:
            print("  No candidate improved the validation error; ending growth search for this iteration.")

    # "Master" metric: error on the chain tail starting at MASTER_VAL_START.
    master_len = max(0, chain.shape[0] - MASTER_VAL_START)
    master_error = logpi_l1_error(model, MASTER_VAL_START, master_len, x_mean, x_std)
    results.append({
        'iteration': iteration,
        'train_limit': train_limit,
        'hidden_sizes': current_hidden.copy(),
        'val_error': best_error,
        'growth_steps': growth_steps,
        'base_hidden': base_hidden,
        'master_val_error': master_error,
        'final_train_loss': final_train_loss,
    })

    records.append({
        'iteration': iteration,
        'train_limit': train_limit,
        'base_hidden_sizes': base_hidden,
        'final_hidden_sizes': current_hidden.copy(),
        'val_error': best_error,
        'master_val_error': master_error,
        'growth_steps': len(growth_steps),
        'final_train_loss': final_train_loss,
    })
    # Rewrite the full CSV each iteration so progress survives an interrupted run.
    pd.DataFrame(records).to_csv(RESULTS_CSV, index=False)
    print(f"[save] Wrote {len(records)} records to {RESULTS_CSV}")

    train_limit += TRAIN_CHUNK

print('Completed iterations:', len(results))
results


Iteration 1: training samples [0, 2000) with 1001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 1.352769e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.132822e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 1.124621e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.215900e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.271229e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.246445e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.301393e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.259090e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   80 | loss(l1) = 1.247947e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   90 | loss(l1) = 1.281191e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch  100 | loss(l1) = 1.3147

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[8],6.096102
1,candidate,double,0,[16],5.884814
2,candidate,insert,0,"[8, 8]",7.35742


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 1 records to mlp_growth_progress6.csv

Iteration 2: training samples [0, 4000) with 2001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 1.210730e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.196624e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 1.207947e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.209368e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.362996e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.153854e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.175850e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.210315e-01 | lr = 1.000e-01
[train][lo

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[8],6.888242
1,candidate,double,0,[16],6.096348
2,candidate,insert,0,"[8, 8]",7.511242


  Growth accepted: double at 0, hidden -> [16], val error=6.0963e+00
[train][loop 1/80][Adam] plateau at epoch 102, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 100, reducing LR to 6.250e-03 and stopping early.
[train][loop 4/80][Adam] plateau at epoch 100, reducing LR to 1.563e-03 and stopping early.
[train][loop 6/80][Adam] plateau at epoch 100, reducing LR to 3.906e-04 and stopping early.
[train][loop 6/80] improvement 0.000e+00 below threshold (6.252e-05), stopping outer loops.
[train] Finished training with final train loss(L1) = 6.251905e-02
  Candidate double@0 -> [32], val error=5.2354e+00
[train][loop 1/80][Adam] plateau at epoch 124, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 376, reducing LR to 6.250e-03 and stopping early.
[train][loop 3/80][Adam] plateau at epoch 172, reducing LR to 1.563e-03 and stopping early.
[train][loop 6/80][Adam] plateau at epoch 100, reducing LR to 3.906e-04 and s

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[16],6.096348
1,candidate,double,0,[32],5.235363
2,candidate,insert,0,"[16, 16]",6.816848


  Growth accepted: double at 0, hidden -> [32], val error=5.2354e+00
[train][loop 1/80][Adam] plateau at epoch 103, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 100, reducing LR to 6.250e-03 and stopping early.
[train][loop 4/80][Adam] plateau at epoch 100, reducing LR to 1.563e-03 and stopping early.
[train][loop 6/80][Adam] plateau at epoch 100, reducing LR to 3.906e-04 and stopping early.
[train][loop 6/80] improvement 4.251e-05 below threshold (6.203e-05), stopping outer loops.
[train] Finished training with final train loss(L1) = 6.198749e-02
  Candidate double@0 -> [64], val error=5.1245e+00
[train][loop 1/80][Adam] plateau at epoch 109, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 260, reducing LR to 6.250e-03 and stopping early.
[train][loop 3/80][Adam] plateau at epoch 308, reducing LR to 1.563e-03 and stopping early.
[train][loop 4/80][Adam] plateau at epoch 310, reducing LR to 3.906e-04 and s

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[32],5.235363
1,candidate,double,0,[64],5.124544
2,candidate,insert,0,"[32, 32]",6.668934


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 2 records to mlp_growth_progress6.csv

Iteration 3: training samples [0, 6000) with 3001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 1.669804e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.730878e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 1.657403e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.582760e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.907791e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.734012e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.738696e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.687728e-01 | lr = 1.000e-01
[train][lo

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[32],5.869954
1,candidate,double,0,[64],5.833905
2,candidate,insert,0,"[32, 32]",7.404458


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 3 records to mlp_growth_progress6.csv

Iteration 4: training samples [0, 8000) with 4001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.466847e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.748731e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.345021e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.643692e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 3.999853e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 3.602494e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.192800e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.739081e-01 | lr = 1.000e-01
[train][lo

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[32],6.935124
1,candidate,double,0,[64],6.624225
2,candidate,insert,0,"[32, 32]",7.540686


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 4 records to mlp_growth_progress6.csv

Iteration 5: training samples [0, 10000) with 5001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 1.869282e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.876338e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 1.959406e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.591814e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.613827e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.755538e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.837550e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.742748e-01 | lr = 1.000e-01
[train][l

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[32],4.106234
1,candidate,double,0,[64],3.865955
2,candidate,insert,0,"[32, 32]",6.327036


  Growth accepted: double at 0, hidden -> [64], val error=3.8660e+00
[train][loop 1/80][Adam] plateau at epoch 102, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 267, reducing LR to 6.250e-03 and stopping early.
[train][loop 4/80][Adam] plateau at epoch 100, reducing LR to 1.563e-03 and stopping early.
[train][loop 7/80][Adam] plateau at epoch 100, reducing LR to 3.906e-04 and stopping early.
[train][loop 12/80][Adam] plateau at epoch 100, reducing LR to 9.766e-05 and stopping early.
[train][loop 12/80] improvement 0.000e+00 below threshold (4.905e-05), stopping outer loops.
[train] Finished training with final train loss(L1) = 4.905437e-02
  Candidate double@0 -> [128], val error=3.8198e+00
[train][loop 1/80][Adam] plateau at epoch 173, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 100, reducing LR to 6.250e-03 and stopping early.
[train][loop 3/80][Adam] plateau at epoch 100, reducing LR to 1.563e-03 an

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[64],3.865955
1,candidate,double,0,[128],3.819796
2,candidate,insert,0,"[64, 64]",6.72355


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 5 records to mlp_growth_progress6.csv

Iteration 6: training samples [0, 12000) with 6001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.000635e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 2.205556e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 2.351347e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 2.096809e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 2.253790e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 2.108571e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 2.191940e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 2.085893e-01 | lr = 1.000e-01
[train][l

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[64],5.053645
1,candidate,double,0,[128],4.307132
2,candidate,insert,0,"[64, 64]",7.710476


  Growth accepted: double at 0, hidden -> [128], val error=4.3071e+00
[train][loop 1/80][Adam] plateau at epoch 102, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 100, reducing LR to 6.250e-03 and stopping early.
[train][loop 3/80][Adam] plateau at epoch 100, reducing LR to 1.563e-03 and stopping early.
[train][loop 4/80][Adam] plateau at epoch 100, reducing LR to 3.906e-04 and stopping early.
[train][loop 5/80][Adam] plateau at epoch 212, reducing LR to 9.766e-05 and stopping early.
[train][loop 6/80][Adam] plateau at epoch 221, reducing LR to 2.441e-05 and stopping early.
[train][loop 8/80][Adam] plateau at epoch 100, reducing LR to 6.104e-06 and stopping early.
[train][loop 8/80] improvement 1.416e-05 below threshold (6.909e-05), stopping outer loops.
[train] Finished training with final train loss(L1) = 6.907926e-02
  Candidate double@0 -> [256], val error=6.7591e+00
[train][loop 1/80][Adam] plateau at epoch 104, reducing LR to 2.500e-02 and

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],4.307132
1,candidate,double,0,[256],6.759067
2,candidate,insert,0,"[128, 128]",8.440277


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 6 records to mlp_growth_progress6.csv

Iteration 7: training samples [0, 14000) with 7001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 3.312061e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.996709e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.474409e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.368289e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 4.886575e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 4.076483e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.720270e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.856733e-01 | lr = 1.000e-01
[train][l

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],4.001214
1,candidate,double,0,[256],5.988451
2,candidate,insert,0,"[128, 128]",8.197822


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 7 records to mlp_growth_progress6.csv

Iteration 8: training samples [0, 16000) with 8001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 3.884371e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.074259e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 1.248367e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.344561e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.150409e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.180852e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.134464e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.235327e+00 | lr = 1.000e-01
[train][l

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],5.646393
1,candidate,double,0,[256],5.748693
2,candidate,insert,0,"[128, 128]",7.574783


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 8 records to mlp_growth_progress6.csv

Iteration 9: training samples [0, 18000) with 9001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.814572e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.023328e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.753803e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.378833e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 4.239818e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 3.959112e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.135861e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 4.055526e-01 | lr = 1.000e-01
[train][l

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],3.80239
1,candidate,double,0,[256],6.756341
2,candidate,insert,0,"[128, 128]",8.648681


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 9 records to mlp_growth_progress6.csv

Iteration 10: training samples [0, 20000) with 10001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.308827e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.439696e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.149331e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.151332e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 2.680430e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 3.120001e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 2.647801e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.128234e-01 | lr = 1.000e-01
[train]

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],5.164905
1,candidate,double,0,[256],6.261638
2,candidate,insert,0,"[128, 128]",7.987094


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 10 records to mlp_growth_progress6.csv

Iteration 11: training samples [0, 22000) with 11001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.337486e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 4.449847e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.807274e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.555649e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 2.852314e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 3.446882e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.638352e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.405745e-01 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],4.245504
1,candidate,double,0,[256],7.298963
2,candidate,insert,0,"[128, 128]",9.038188


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 11 records to mlp_growth_progress6.csv

Iteration 12: training samples [0, 24000) with 12001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.776846e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.273094e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 1.202318e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.121544e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.109126e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.228256e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.194025e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.357849e+00 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],5.996398
1,candidate,double,0,[256],5.720538
2,candidate,insert,0,"[128, 128]",7.142674


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 12 records to mlp_growth_progress6.csv

Iteration 13: training samples [0, 26000) with 13001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.525461e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 4.037839e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.986786e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.997600e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 4.384308e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 3.815188e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 4.398689e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.948933e-01 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],3.860689
1,candidate,double,0,[256],6.361281
2,candidate,insert,0,"[128, 128]",8.287422


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 13 records to mlp_growth_progress6.csv

Iteration 14: training samples [0, 28000) with 14001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.772689e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 4.499234e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 4.508260e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.558391e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 4.009361e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 3.460444e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.136542e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.294178e-01 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],3.772795
1,candidate,double,0,[256],6.219465
2,candidate,insert,0,"[128, 128]",7.583232


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 14 records to mlp_growth_progress6.csv

Iteration 15: training samples [0, 30000) with 15001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 3.149370e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.643452e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 4.422046e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.309278e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 3.453830e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 4.183537e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 4.502709e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.116370e-01 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],5.709143
1,candidate,double,0,[256],8.8413
2,candidate,insert,0,"[128, 128]",10.764747


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 15 records to mlp_growth_progress6.csv

Iteration 16: training samples [0, 32000) with 16001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 4.514585e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.028584e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 1.084091e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.028129e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.387659e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.295653e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.086430e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.391720e+00 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],5.69394
1,candidate,double,0,[256],5.585751
2,candidate,insert,0,"[128, 128]",7.417774


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 16 records to mlp_growth_progress6.csv

Iteration 17: training samples [0, 34000) with 17001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 3.594217e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 4.055934e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.870318e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 4.471633e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 3.119411e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 4.347822e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 4.225382e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 4.075288e-01 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],3.90282
1,candidate,double,0,[256],7.547787
2,candidate,insert,0,"[128, 128]",9.43467


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 17 records to mlp_growth_progress6.csv

Iteration 18: training samples [0, 36000) with 18001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.930039e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 4.031126e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 4.400899e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.290138e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 4.082722e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 4.744028e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.977740e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.807392e-01 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],3.104713
1,candidate,double,0,[256],5.903539
2,candidate,insert,0,"[128, 128]",7.141994


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 18 records to mlp_growth_progress6.csv

Iteration 19: training samples [0, 38000) with 19001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.399020e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.683862e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.734638e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.361300e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 3.069797e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 2.879631e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.203229e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.389829e-01 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],3.480636
1,candidate,double,0,[256],6.597919
2,candidate,insert,0,"[128, 128]",8.41646


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 19 records to mlp_growth_progress6.csv

Iteration 20: training samples [0, 40000) with 20001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 4.292771e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.175805e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 9.517254e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.189539e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.203356e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.342081e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.305665e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.224854e+00 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[128],5.479678
1,candidate,double,0,[256],5.187667
2,candidate,insert,0,"[128, 128]",5.88799


  Growth accepted: double at 0, hidden -> [256], val error=5.1877e+00
[train][loop 1/80][Adam] plateau at epoch 108, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 116, reducing LR to 6.250e-03 and stopping early.
[train][loop 3/80][Adam] plateau at epoch 267, reducing LR to 1.563e-03 and stopping early.
[train][loop 4/80][Adam] plateau at epoch 100, reducing LR to 3.906e-04 and stopping early.
[train][loop 5/80][Adam] plateau at epoch 100, reducing LR to 9.766e-05 and stopping early.
[train][loop 6/80][Adam] plateau at epoch 350, reducing LR to 2.441e-05 and stopping early.
[train][loop 7/80][Adam] plateau at epoch 436, reducing LR to 6.104e-06 and stopping early.
[train][loop 8/80] improvement 7.302e-05 below threshold (7.870e-05), stopping outer loops.
[train] Finished training with final train loss(L1) = 7.862220e-02
  Candidate double@0 -> [512], val error=5.0015e+00
[train][loop 1/80][Adam] plateau at epoch 105, reducing LR to 2.500e-02 and

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[256],5.187667
1,candidate,double,0,[512],5.00146
2,candidate,insert,0,"[256, 256]",5.780311


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 20 records to mlp_growth_progress6.csv

Iteration 21: training samples [0, 42000) with 21001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 3.483560e-01 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 1.729036e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 1.480797e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 1.641229e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 1.578827e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 1.786369e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 1.740997e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 1.814039e+00 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[256],5.581118
1,candidate,double,0,[512],5.062775
2,candidate,insert,0,"[256, 256]",6.259364


  Growth accepted: double at 0, hidden -> [512], val error=5.0628e+00
[train][loop 1/80][Adam] plateau at epoch 142, reducing LR to 2.500e-02 and stopping early.
[train][loop 2/80][Adam] plateau at epoch 123, reducing LR to 6.250e-03 and stopping early.
[train][loop 3/80][Adam] plateau at epoch 183, reducing LR to 1.563e-03 and stopping early.
[train][loop 5/80][Adam] plateau at epoch 100, reducing LR to 3.906e-04 and stopping early.
[train][loop 6/80][Adam] plateau at epoch 294, reducing LR to 9.766e-05 and stopping early.
[train][loop 7/80][Adam] plateau at epoch 156, reducing LR to 2.441e-05 and stopping early.
[train][loop 8/80][Adam] plateau at epoch 220, reducing LR to 6.104e-06 and stopping early.
[train][loop 9/80][Adam] plateau at epoch 251, reducing LR to 1.526e-06 and stopping early.
[train][loop 10/80][Adam] plateau at epoch 398, reducing LR to 3.815e-07 and stopping early.
LR 3.815e-07 below 1e-6 threshold; terminating training early.
  Candidate double@0 -> [1024], val er

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[512],5.062775
1,candidate,double,0,[1024],5.608053
2,candidate,insert,0,"[512, 512]",5.972717


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 21 records to mlp_growth_progress6.csv

Iteration 22: training samples [0, 44000) with 22001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 3.213404e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.335065e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 2.990528e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.116826e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 2.887592e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 3.158740e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.359709e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.268893e+00 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[512],6.29488
1,candidate,double,0,[1024],6.592698
2,candidate,insert,0,"[512, 512]",8.423126


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 22 records to mlp_growth_progress6.csv

Iteration 23: training samples [0, 46000) with 23001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 2.873316e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.060950e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 3.096624e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 3.341861e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 2.973185e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 3.265804e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 3.269570e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 3.191495e+00 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[512],7.425686
1,candidate,double,0,[1024],7.607662
2,candidate,insert,0,"[512, 512]",8.548444


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 23 records to mlp_growth_progress6.csv

Iteration 24: training samples [0, 48000) with 24001 unique points
[train][loop 1/80] Starting Adam: epochs=500, loss=L1, lr=1.000e-01, batch=32, domain=obs
[train][loop 1/80][Adam] epoch    1 | loss(l1) = 4.100313e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   10 | loss(l1) = 3.995959e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   20 | loss(l1) = 4.511366e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   30 | loss(l1) = 5.112522e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   40 | loss(l1) = 4.060918e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   50 | loss(l1) = 4.606441e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   60 | loss(l1) = 4.677379e+00 | lr = 1.000e-01
[train][loop 1/80][Adam] epoch   70 | loss(l1) = 4.681300e+00 | lr = 1.000e-01
[train

Unnamed: 0,label,op,idx,hidden,val_error
0,current,current,-,[512],5.812919
1,candidate,double,0,[1024],5.997531
2,candidate,insert,0,"[512, 512]",7.0561


  Candidate improvements below 5% threshold; no update accepted.
  No candidate improved the validation error; ending growth search for this iteration.
[save] Wrote 24 records to mlp_growth_progress6.csv
Completed iterations: 24


[{'iteration': 1,
  'train_limit': 2000,
  'hidden_sizes': [8],
  'val_error': 6.096101686596767,
  'growth_steps': [],
  'base_hidden': [8],
  'master_val_error': 7.435866449598895,
  'final_train_loss': 0.0846823900938034},
 {'iteration': 2,
  'train_limit': 4000,
  'hidden_sizes': [32],
  'val_error': 5.235362686054427,
  'growth_steps': [('double', 0, 6.096348426835711),
   ('double', 0, 5.235362686054427)],
  'base_hidden': [8],
  'master_val_error': 5.4446796202632,
  'final_train_loss': 0.08948510140180588},
 {'iteration': 3,
  'train_limit': 6000,
  'hidden_sizes': [32],
  'val_error': 5.869954277613327,
  'growth_steps': [],
  'base_hidden': [32],
  'master_val_error': 5.2645015009086364,
  'final_train_loss': 0.06044767424464226},
 {'iteration': 4,
  'train_limit': 8000,
  'hidden_sizes': [32],
  'val_error': 6.935123895638419,
  'growth_steps': [],
  'base_hidden': [32],
  'master_val_error': 7.115518474307729,
  'final_train_loss': 0.09107577800750732},
 {'iteration': 5,
  

## Summary

In [6]:

# One table row per entry in `results`. The listed fields are copied through
# unchanged; `growth_steps` is reduced to the number of steps it contains so
# the column holds a plain count instead of a list.
pd.DataFrame(
    [
        {
            **{
                field: record[field]
                for field in (
                    'iteration',
                    'train_limit',
                    'hidden_sizes',
                    'val_error',
                    'master_val_error',
                )
            },
            'growth_steps': len(record['growth_steps']),
        }
        for record in results
    ]
)

Unnamed: 0,iteration,train_limit,hidden_sizes,val_error,master_val_error,growth_steps
0,1,2000,[8],6.096102,7.435866,0
1,2,4000,[32],5.235363,5.44468,2
2,3,6000,[32],5.869954,5.264502,0
3,4,8000,[32],6.935124,7.115518,0
4,5,10000,[64],3.865955,4.458826,1
5,6,12000,[128],4.307132,4.184264,1
6,7,14000,[128],4.001214,3.990749,0
7,8,16000,[128],5.646393,6.410048,0
8,9,18000,[128],3.80239,3.514516,0
9,10,20000,[128],5.164905,5.016933,0
