In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os
import time

from data_loader_saver import load_data, save_data

In [2]:
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf

# List all GPUs
gpus = tf.config.list_physical_devices('GPU')
print("GPUs found:", gpus)

# Quick check
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("GPU available:", len(gpus) > 0)
# Or get the default device name
print("Default GPU device:", tf.test.gpu_device_name())



GPUs found: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Built with CUDA: True
GPU available: True
Default GPU device: /device:GPU:0


In [3]:
m_h_datapath = "../data/project/DS-15-samples_mHeights"
n_k_m_datapath = "../data/project/DS-15-samples_n_k_m_P"

m_h_datapath_1 = "../data/project/training_samples_m_h.pkl"
n_k_m_datapath_1 = "../data/project/training_samples_n_k_m_G.pkl"

In [4]:
n_k_m_P, m_heights = load_data(n_k_m_datapath, m_h_datapath)
n_k_m_P_1, m_heights_1 = load_data(n_k_m_datapath_1, m_h_datapath_1)

print("n_k_m_P size:", len(n_k_m_P))
print("m_heights size:", len(m_heights))
print("n_k_m_P_1 size:", len(n_k_m_P_1))
print("m_heights_1 size:", len(m_heights_1))
print(f'{n_k_m_P[0:3]}')
print(f'{m_heights[0:3]}')
print(f'{n_k_m_P_1[0:3]}')
print(f'{m_heights_1[0:3]}')
n_k_m_P.extend(n_k_m_P_1)
m_heights.extend(m_heights_1)

n_k_m_P size: 32087
m_heights size: 32087
n_k_m_P_1 size: 90000
m_heights_1 size: 90000
[[9, 4, 2, array([[ 1, -2,  2, -1,  4],
       [ 3,  0, -2,  1,  3],
       [ 0,  4,  4,  4,  3],
       [-3, -1,  4,  1, -4]])], [9, 4, 3, array([[ 1, -2,  2, -1,  4],
       [ 3,  0, -2,  1,  3],
       [ 0,  4,  4,  4,  3],
       [-3, -1,  4,  1, -4]])], [9, 4, 4, array([[ 1, -2,  2, -1,  4],
       [ 3,  0, -2,  1,  3],
       [ 0,  4,  4,  4,  3],
       [-3, -1,  4,  1, -4]])]]
[11.472222222222221, 17.875, 30.0]
[(9, 4, 2, array([[-80,   0, -17, -22, -64],
       [ 92, -85,  88,  17,  73],
       [-45, -87,  -8,  15,   4],
       [-52,  68,  85, -72, -23]])), (9, 4, 2, array([[ 73,  94,  74,  45,  33],
       [ 49, -59, -93,  56,  27],
       [-82,  88,  11, -54,  46],
       [-71,  87, -97,  63, -45]])), (9, 4, 2, array([[ 15,  -6, -81, -64,   6],
       [-35, -87,  70,  41,  87],
       [-80,  21, -75, -29, -29],
       [ 72, -21,  57,  38, -10]]))]
[148.50647445945322, 251.7460975793681, 1

In [5]:
from model_mheight import to_log2_height, from_log2_height

log_targets = to_log2_height(np.asarray(m_heights, dtype=np.float32))
print(
    f"log2(m-height) stats -> min: {log_targets.min():.3f}, max: {log_targets.max():.3f}"
)


log2(m-height) stats -> min: 1.000, max: 24.118


In [6]:
# Train one model per (n, k, m) combo using notebook-local helper
from typing import Dict, Sequence, Tuple
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from model_mheight import SUPPORTED_COMBOS, build_model


def _collect_for_combo(
    combo: Tuple[int, int, int],
    samples: Sequence[Tuple[int, int, int, np.ndarray]],
    targets: Sequence[float],
) -> Tuple[np.ndarray, np.ndarray]:
    matrices = []
    values = []
    for sample, target in zip(samples, targets):
        if tuple(sample[:3]) == combo:
            matrices.append(np.asarray(sample[3], dtype=np.float32))
            values.append(np.asarray(target, dtype=np.float32))
    if not matrices:
        raise ValueError(f"No samples available for combo {combo}.")
    return np.stack(matrices), np.stack(values)


def select_combo_data(
    combo: Tuple[int, int, int],
    samples: Sequence[Tuple[int, int, int, np.ndarray]],
    targets: Sequence[float],
) -> Tuple[np.ndarray, np.ndarray]:
    if combo == (9, 4, 2):
        return _collect_for_combo(combo, samples, targets)
    elif combo == (9, 4, 3):
        return _collect_for_combo(combo, samples, targets)
    elif combo == (9, 4, 4):
        return _collect_for_combo(combo, samples, targets)
    elif combo == (9, 4, 5):
        return _collect_for_combo(combo, samples, targets)
    elif combo == (9, 5, 2):
        return _collect_for_combo(combo, samples, targets)
    elif combo == (9, 5, 3):
        return _collect_for_combo(combo, samples, targets)
    elif combo == (9, 5, 4):
        return _collect_for_combo(combo, samples, targets)
    elif combo == (9, 6, 2):
        return _collect_for_combo(combo, samples, targets)
    elif combo == (9, 6, 3):
        return _collect_for_combo(combo, samples, targets)
    else:
        raise ValueError(f"Unsupported combo {combo}.")


def split_dataset(
    matrices: np.ndarray,
    targets: np.ndarray,
) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
    total = matrices.shape[0]
    indices = np.arange(total)
    rng = np.random.default_rng(0)
    rng.shuffle(indices)

    val_count = int(np.round(total * 0.10))
    test_count = int(np.round(total * 0.05))

    val_count = int(np.clip(val_count, 0, total))
    test_count = int(np.clip(test_count, 0, total - val_count))

    if val_count == 0 and total >= 3:
        val_count = 1
    if test_count == 0 and total - val_count >= 3:
        test_count = 1

    train_count = total - val_count - test_count
    if train_count <= 0 and total > 0:
        if test_count > 0:
            test_count -= 1
        elif val_count > 0:
            val_count -= 1
        train_count = total - val_count - test_count

    val_indices = indices[:val_count]
    test_indices = indices[val_count : val_count + test_count]
    train_indices = indices[val_count + test_count :]

    train = (matrices[train_indices], targets[train_indices])
    val = (matrices[val_indices], targets[val_indices])
    test = (matrices[test_indices], targets[test_indices])

    if val[0].shape[0] == 0:
        val = train
    if test[0].shape[0] == 0:
        test = val
    return train, val, test


def train_model_for_combo(
    combo: Tuple[int, int, int],
    samples: Sequence[Tuple[int, int, int, np.ndarray]],
    targets: Sequence[float],
    epochs: int = 200,
    batch_size: int = 256,
    save_dir: str = "trained_models",
) -> Dict[str, float]:
    matrices, values = select_combo_data(combo, samples, targets)
    train, val, test = split_dataset(matrices, values)

    train_features, train_targets = train
    val_features, val_targets = val
    test_features, test_targets = test

    train_targets_log = to_log2_height(train_targets)
    val_targets_log = to_log2_height(val_targets)
    test_targets_log = to_log2_height(test_targets)

    os.makedirs(save_dir, exist_ok=True)

    model = build_model(k_dim=matrices.shape[1], column_count=matrices.shape[2])
    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=25,
        mode="min",
        restore_best_weights=True,
        verbose=1,
    )
    reduce_lr = ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=15,
        mode="min",
        min_lr=1e-6,
        verbose=1,
    )
    checkpoint_path = os.path.join(save_dir, f"{combo[0]}_{combo[1]}_{combo[2]}_best.keras")
    checkpoint = ModelCheckpoint(
        filepath=checkpoint_path,
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=False,
        mode="min",
        verbose=1,
    )
    history = model.fit(
        train_features,
        train_targets_log,
        validation_data=(val_features, val_targets_log),
        batch_size=min(batch_size, train_features.shape[0]),
        epochs=epochs,
        callbacks=[reduce_lr, early_stopping, checkpoint],
        verbose=2,
    )

    train_loss = float(history.history["loss"][-1])
    val_loss = float(history.history["val_loss"][-1]) if "val_loss" in history.history else float("nan")
    test_eval = model.evaluate(test_features, test_targets_log, verbose=0)
    if isinstance(test_eval, (list, tuple)):
        test_loss = float(test_eval[0])
        test_rel_error = float(test_eval[1]) if len(test_eval) > 1 else float("nan")
    else:
        test_loss = float(test_eval)
        test_rel_error = float("nan")

    model_path = os.path.join(save_dir, f"{combo[0]}_{combo[1]}_{combo[2]}_model")
    model.save(model_path, include_optimizer=False)

    return {
        "train_loss": train_loss,
        "val_loss": val_loss,
        "test_loss": test_loss,
        "test_relative_loss": test_rel_error,
        "model_path": model_path,
    }


samples = list(n_k_m_P)
targets = list(m_heights)
metrics_by_combo = {}

for combo in SUPPORTED_COMBOS:
    print(f"Training model for combo {combo}...")
    metrics_by_combo[combo] = train_model_for_combo(
        combo=combo,
        samples=samples,
        targets=targets,
        epochs=200,
        batch_size=256,
        save_dir="trained_models",
    )

metrics_by_combo

Training model for combo (9, 4, 2)...
Epoch 1/200


I0000 00:00:1761260636.475814 1212177 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



Epoch 1: val_loss improved from inf to 1.68875, saving model to trained_models/9_4_2_best.keras
43/43 - 10s - loss: 3.2567 - symmetric_relative_loss_from_log_params: 2.1845 - val_loss: 1.6888 - val_symmetric_relative_loss_from_log_params: 0.3968 - lr: 3.0000e-04 - 10s/epoch - 226ms/step
Epoch 2/200

Epoch 2: val_loss improved from 1.68875 to 1.47618, saving model to trained_models/9_4_2_best.keras
43/43 - 1s - loss: 1.8757 - symmetric_relative_loss_from_log_params: 0.6562 - val_loss: 1.4762 - val_symmetric_relative_loss_from_log_params: 0.3953 - lr: 3.0000e-04 - 529ms/epoch - 12ms/step
Epoch 3/200

Epoch 3: val_loss did not improve from 1.47618
43/43 - 0s - loss: 1.6888 - symmetric_relative_loss_from_log_params: 0.5397 - val_loss: 1.5321 - val_symmetric_relative_loss_from_log_params: 0.4365 - lr: 3.0000e-04 - 443ms/epoch - 10ms/step
Epoch 4/200

Epoch 4: val_loss did not improve from 1.47618
43/43 - 0s - loss: 1.6012 - symmetric_relative_loss_from_log_params: 0.4785 - val_loss: 1.5702

INFO:tensorflow:Assets written to: trained_models/9_4_2_model/assets


Training model for combo (9, 4, 3)...
Epoch 1/200

Epoch 1: val_loss improved from inf to 2.55622, saving model to trained_models/9_4_3_best.keras
42/42 - 8s - loss: 3.3391 - symmetric_relative_loss_from_log_params: 2.7930 - val_loss: 2.5562 - val_symmetric_relative_loss_from_log_params: 0.4394 - lr: 3.0000e-04 - 8s/epoch - 190ms/step
Epoch 2/200

Epoch 2: val_loss improved from 2.55622 to 2.51285, saving model to trained_models/9_4_3_best.keras
42/42 - 1s - loss: 2.5705 - symmetric_relative_loss_from_log_params: 0.7599 - val_loss: 2.5129 - val_symmetric_relative_loss_from_log_params: 0.4187 - lr: 3.0000e-04 - 537ms/epoch - 13ms/step
Epoch 3/200

Epoch 3: val_loss improved from 2.51285 to 2.50285, saving model to trained_models/9_4_3_best.keras
42/42 - 1s - loss: 2.5345 - symmetric_relative_loss_from_log_params: 0.6919 - val_loss: 2.5029 - val_symmetric_relative_loss_from_log_params: 0.4222 - lr: 3.0000e-04 - 505ms/epoch - 12ms/step
Epoch 4/200

Epoch 4: val_loss improved from 2.50285 

INFO:tensorflow:Assets written to: trained_models/9_4_3_model/assets


Training model for combo (9, 4, 4)...
Epoch 1/200

Epoch 1: val_loss improved from inf to 2.80000, saving model to trained_models/9_4_4_best.keras
42/42 - 8s - loss: 3.9829 - symmetric_relative_loss_from_log_params: 3.6291 - val_loss: 2.8000 - val_symmetric_relative_loss_from_log_params: 2.1733 - lr: 3.0000e-04 - 8s/epoch - 191ms/step
Epoch 2/200

Epoch 2: val_loss improved from 2.80000 to 2.52568, saving model to trained_models/9_4_4_best.keras
42/42 - 1s - loss: 2.6544 - symmetric_relative_loss_from_log_params: 1.2596 - val_loss: 2.5257 - val_symmetric_relative_loss_from_log_params: 0.5466 - lr: 3.0000e-04 - 502ms/epoch - 12ms/step
Epoch 3/200

Epoch 3: val_loss improved from 2.52568 to 2.51068, saving model to trained_models/9_4_4_best.keras
42/42 - 0s - loss: 2.5616 - symmetric_relative_loss_from_log_params: 0.8886 - val_loss: 2.5107 - val_symmetric_relative_loss_from_log_params: 0.4998 - lr: 3.0000e-04 - 495ms/epoch - 12ms/step
Epoch 4/200

Epoch 4: val_loss improved from 2.51068 

INFO:tensorflow:Assets written to: trained_models/9_4_4_model/assets


Training model for combo (9, 4, 5)...
Epoch 1/200

Epoch 1: val_loss improved from inf to 4.48380, saving model to trained_models/9_4_5_best.keras
39/39 - 8s - loss: 5.5796 - symmetric_relative_loss_from_log_params: 3.9911 - val_loss: 4.4838 - val_symmetric_relative_loss_from_log_params: 3.9719 - lr: 3.0000e-04 - 8s/epoch - 213ms/step
Epoch 2/200

Epoch 2: val_loss improved from 4.48380 to 2.80180, saving model to trained_models/9_4_5_best.keras
39/39 - 1s - loss: 3.8404 - symmetric_relative_loss_from_log_params: 3.5439 - val_loss: 2.8018 - val_symmetric_relative_loss_from_log_params: 1.6792 - lr: 3.0000e-04 - 507ms/epoch - 13ms/step
Epoch 3/200

Epoch 3: val_loss improved from 2.80180 to 2.65464, saving model to trained_models/9_4_5_best.keras
39/39 - 1s - loss: 2.7790 - symmetric_relative_loss_from_log_params: 1.4687 - val_loss: 2.6546 - val_symmetric_relative_loss_from_log_params: 0.8570 - lr: 3.0000e-04 - 519ms/epoch - 13ms/step
Epoch 4/200

Epoch 4: val_loss improved from 2.65464 

INFO:tensorflow:Assets written to: trained_models/9_4_5_model/assets


Training model for combo (9, 5, 2)...
Epoch 1/200

Epoch 1: val_loss improved from inf to 2.51811, saving model to trained_models/9_5_2_best.keras
46/46 - 8s - loss: 3.4517 - symmetric_relative_loss_from_log_params: 2.6169 - val_loss: 2.5181 - val_symmetric_relative_loss_from_log_params: 0.4720 - lr: 3.0000e-04 - 8s/epoch - 176ms/step
Epoch 2/200

Epoch 2: val_loss improved from 2.51811 to 2.51217, saving model to trained_models/9_5_2_best.keras
46/46 - 1s - loss: 2.5494 - symmetric_relative_loss_from_log_params: 0.8196 - val_loss: 2.5122 - val_symmetric_relative_loss_from_log_params: 0.4285 - lr: 3.0000e-04 - 597ms/epoch - 13ms/step
Epoch 3/200

Epoch 3: val_loss improved from 2.51217 to 2.50801, saving model to trained_models/9_5_2_best.keras
46/46 - 1s - loss: 2.5353 - symmetric_relative_loss_from_log_params: 0.7315 - val_loss: 2.5080 - val_symmetric_relative_loss_from_log_params: 0.4249 - lr: 3.0000e-04 - 555ms/epoch - 12ms/step
Epoch 4/200

Epoch 4: val_loss improved from 2.50801 

INFO:tensorflow:Assets written to: trained_models/9_5_2_model/assets


Training model for combo (9, 5, 3)...
Epoch 1/200

Epoch 1: val_loss improved from inf to 2.53436, saving model to trained_models/9_5_3_best.keras
45/45 - 8s - loss: 4.4727 - symmetric_relative_loss_from_log_params: 3.3307 - val_loss: 2.5344 - val_symmetric_relative_loss_from_log_params: 0.5415 - lr: 3.0000e-04 - 8s/epoch - 188ms/step
Epoch 2/200

Epoch 2: val_loss improved from 2.53436 to 1.81755, saving model to trained_models/9_5_3_best.keras
45/45 - 1s - loss: 2.2982 - symmetric_relative_loss_from_log_params: 1.0829 - val_loss: 1.8175 - val_symmetric_relative_loss_from_log_params: 0.5313 - lr: 3.0000e-04 - 571ms/epoch - 13ms/step
Epoch 3/200

Epoch 3: val_loss improved from 1.81755 to 1.74428, saving model to trained_models/9_5_3_best.keras
45/45 - 1s - loss: 2.1316 - symmetric_relative_loss_from_log_params: 0.9160 - val_loss: 1.7443 - val_symmetric_relative_loss_from_log_params: 0.5263 - lr: 3.0000e-04 - 568ms/epoch - 13ms/step
Epoch 4/200

Epoch 4: val_loss did not improve from 1

INFO:tensorflow:Assets written to: trained_models/9_5_3_model/assets


Training model for combo (9, 5, 4)...
Epoch 1/200

Epoch 1: val_loss improved from inf to 3.75181, saving model to trained_models/9_5_4_best.keras
40/40 - 8s - loss: 5.1500 - symmetric_relative_loss_from_log_params: 3.9700 - val_loss: 3.7518 - val_symmetric_relative_loss_from_log_params: 3.7845 - lr: 3.0000e-04 - 8s/epoch - 209ms/step
Epoch 2/200

Epoch 2: val_loss improved from 3.75181 to 2.67046, saving model to trained_models/9_5_4_best.keras
40/40 - 1s - loss: 3.0149 - symmetric_relative_loss_from_log_params: 2.1203 - val_loss: 2.6705 - val_symmetric_relative_loss_from_log_params: 0.9833 - lr: 3.0000e-04 - 514ms/epoch - 13ms/step
Epoch 3/200

Epoch 3: val_loss improved from 2.67046 to 2.65315, saving model to trained_models/9_5_4_best.keras
40/40 - 1s - loss: 2.7593 - symmetric_relative_loss_from_log_params: 1.3811 - val_loss: 2.6531 - val_symmetric_relative_loss_from_log_params: 0.8323 - lr: 3.0000e-04 - 515ms/epoch - 13ms/step
Epoch 4/200

Epoch 4: val_loss improved from 2.65315 

INFO:tensorflow:Assets written to: trained_models/9_5_4_model/assets


Training model for combo (9, 6, 2)...
Epoch 1/200

Epoch 1: val_loss improved from inf to 2.03735, saving model to trained_models/9_6_2_best.keras
65/65 - 8s - loss: 3.3215 - symmetric_relative_loss_from_log_params: 2.3429 - val_loss: 2.0374 - val_symmetric_relative_loss_from_log_params: 0.8505 - lr: 3.0000e-04 - 8s/epoch - 127ms/step
Epoch 2/200

Epoch 2: val_loss improved from 2.03735 to 1.99715, saving model to trained_models/9_6_2_best.keras
65/65 - 1s - loss: 2.2483 - symmetric_relative_loss_from_log_params: 1.0945 - val_loss: 1.9971 - val_symmetric_relative_loss_from_log_params: 0.8556 - lr: 3.0000e-04 - 772ms/epoch - 12ms/step
Epoch 3/200

Epoch 3: val_loss did not improve from 1.99715
65/65 - 1s - loss: 2.1404 - symmetric_relative_loss_from_log_params: 1.0038 - val_loss: 2.0095 - val_symmetric_relative_loss_from_log_params: 0.9047 - lr: 3.0000e-04 - 610ms/epoch - 9ms/step
Epoch 4/200

Epoch 4: val_loss did not improve from 1.99715
65/65 - 1s - loss: 2.0999 - symmetric_relative_

INFO:tensorflow:Assets written to: trained_models/9_6_2_model/assets


Training model for combo (9, 6, 3)...
Epoch 1/200

Epoch 1: val_loss improved from inf to 2.67325, saving model to trained_models/9_6_3_best.keras
49/49 - 8s - loss: 4.2483 - symmetric_relative_loss_from_log_params: 3.3018 - val_loss: 2.6733 - val_symmetric_relative_loss_from_log_params: 0.9219 - lr: 3.0000e-04 - 8s/epoch - 167ms/step
Epoch 2/200

Epoch 2: val_loss improved from 2.67325 to 2.67068, saving model to trained_models/9_6_3_best.keras
49/49 - 1s - loss: 2.7767 - symmetric_relative_loss_from_log_params: 1.4635 - val_loss: 2.6707 - val_symmetric_relative_loss_from_log_params: 0.9146 - lr: 3.0000e-04 - 590ms/epoch - 12ms/step
Epoch 3/200

Epoch 3: val_loss improved from 2.67068 to 2.65096, saving model to trained_models/9_6_3_best.keras
49/49 - 1s - loss: 2.7407 - symmetric_relative_loss_from_log_params: 1.3307 - val_loss: 2.6510 - val_symmetric_relative_loss_from_log_params: 0.8874 - lr: 3.0000e-04 - 625ms/epoch - 13ms/step
Epoch 4/200

Epoch 4: val_loss improved from 2.65096 

INFO:tensorflow:Assets written to: trained_models/9_6_3_model/assets


{(9, 4, 2): {'train_loss': 1.298595666885376,
  'val_loss': 2.362175464630127,
  'test_loss': 1.4758399724960327,
  'test_relative_loss': 0.3941768705844879,
  'model_path': 'trained_models/9_4_2_model'},
 (9, 4, 3): {'train_loss': 1.3114385604858398,
  'val_loss': 2.623443126678467,
  'test_loss': 1.4378770589828491,
  'test_relative_loss': 0.3515472412109375,
  'model_path': 'trained_models/9_4_3_model'},
 (9, 4, 4): {'train_loss': 2.4900708198547363,
  'val_loss': 2.5215470790863037,
  'test_loss': 2.4965763092041016,
  'test_relative_loss': 0.5790220499038696,
  'model_path': 'trained_models/9_4_4_model'},
 (9, 4, 5): {'train_loss': 2.0953991413116455,
  'val_loss': 2.235152006149292,
  'test_loss': 2.026305913925171,
  'test_relative_loss': 0.8139551877975464,
  'model_path': 'trained_models/9_4_5_model'},
 (9, 5, 2): {'train_loss': 1.4158297777175903,
  'val_loss': 2.526609420776367,
  'test_loss': 1.608811378479004,
  'test_relative_loss': 0.3781173825263977,
  'model_path': 'tr