In [1]:
import os
import yaml
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from astropy.io import fits
from scipy.optimize import minimize
from pprint import pprint
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import math

# Report how many GPU devices TensorFlow lists on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Number of available GPUs: ", len(gpu_devices))

# Parse config.yml with PyYAML's safe loader; the resulting mapping is kept in `config`.
with open('config.yml') as f:
    config = yaml.safe_load(f)

2025-11-29 16:45:15.151189: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Number of available GPUs:  0


2025-11-29 16:45:45.177519: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [2]:
# Gather every *.keras file under <data_dir>/models and order the paths lexicographically.
models = glob(os.path.join(config['data_dir'], 'models', '*.keras'))
models.sort()
pprint(models)

['/nfsdata1/bwedig/lsst-strong-lens-data-challenge/models/v1_ap99966643.keras',
 '/nfsdata1/bwedig/lsst-strong-lens-data-challenge/models/v2_ap99979441.keras',
 '/nfsdata1/bwedig/lsst-strong-lens-data-challenge/models/v3.keras',
 '/nfsdata1/bwedig/lsst-strong-lens-data-challenge/models/v4.keras',
 '/nfsdata1/bwedig/lsst-strong-lens-data-challenge/models/v5.keras']


In [3]:
# Load the model at position 3 of the sorted `models` list (selection is positional, not by name).
model = keras.models.load_model(filepath=models[3])

In [4]:
import os
import json
import pickle
from datetime import datetime, timedelta
from glob import glob
import io

# Relies on variables created by earlier cells: model, models, config.
# `models[3]` is the same list position that the model-loading cell passes to load_model.
model_path = models[3]
print("Model file:", model_path)

size_bytes = os.path.getsize(model_path)
print("File size (bytes):", size_bytes)

# Modification time arrives as a POSIX timestamp; convert to a local datetime for display.
mtime_epoch = os.path.getmtime(model_path)
mtime = datetime.fromtimestamp(mtime_epoch)
print("Last modified:", mtime.isoformat())

# Basic model info: the model's name (None if the attribute is missing) and its parameter count.
print("\nModel name:", getattr(model, "name", None))
try:
    print("Total params:", model.count_params())
except Exception as e:
    # Best-effort, like the other inspection steps in this cell, but say why the
    # line is missing instead of silently dropping it.
    print("Could not get model.count_params():", e)

# Route model.summary() through a callback so the text ends up in `summary_text`.
try:
    buf = io.StringIO()

    def _write_summary_line(line):
        """Append one piece of summary text to `buf`, followed by a newline."""
        buf.write(line + "\n")

    model.summary(print_fn=_write_summary_line)
    summary_text = buf.getvalue()
    print("\nModel summary:\n", summary_text)
except Exception as e:
    print("\nCould not get model.summary():", e)

# Architecture/config overview: show the top-level keys when get_config() returns a dict,
# otherwise just show the type of whatever came back.
try:
    cfg = model.get_config()
    if isinstance(cfg, dict):
        cfg_overview = list(cfg.keys())
    else:
        cfg_overview = type(cfg)
    print("Model config keys:", cfg_overview)
except Exception as e:
    print("Could not get model.get_config():", e)

# Optimizer / compile info, if the loaded model carries an optimizer.
opt = getattr(model, "optimizer", None)
if opt is None:
    print("\nNo optimizer attached to loaded model.")
else:
    try:
        print("\nOptimizer type:", type(opt))
        # get_config() is called on a best-effort basis; a failure is reported below
        print("Optimizer config:", opt.get_config())
    except Exception as e:
        print("Could not read optimizer config:", e)

# Search the model's directory for files that might hold training history or logs.
# `d` is the directory, `base` the model filename without its extension.
d, model_filename = os.path.split(model_path)
base = os.path.splitext(model_filename)[0]

# Filename globs: files sharing the model's stem, generic "history" files, and any *.log.
patterns = [
    os.path.join(d, name_glob)
    for name_glob in (
        base + "*.json",
        base + "*.pkl",
        base + "*.csv",
        "*history*.json",
        "*history*.pkl",
        "*history*.csv",
        "*.log",
    )
]

# A file can match several globs; the set removes duplicates before sorting.
candidates = sorted({match for pattern in patterns for match in glob(pattern)})
print("\nPotential sidecar files near model:", candidates)

# helpers to extract duration info from files
def seconds_to_str(s):
    """Format a duration given in seconds as a ``timedelta`` string (e.g. ``"1:01:01"``).

    Parameters
    ----------
    s : int or float
        Duration in seconds; the fractional part is truncated by ``int()``.

    Returns
    -------
    str
        ``str(timedelta(seconds=int(s)))``, or ``"n/a"`` when ``s`` is NaN, infinite,
        or too large for ``timedelta`` to represent, so that one unusable estimate
        does not abort the report that calls this helper.
    """
    try:
        return str(timedelta(seconds=int(s)))
    except (ValueError, OverflowError):
        # int() raises ValueError for NaN and OverflowError for +/-inf;
        # timedelta raises OverflowError for out-of-range magnitudes.
        return "n/a"

# Accumulates (source, file, seconds) tuples; later steps of this cell append to and report it.
training_durations = []


def _csv_duration_estimates(df, path):
    """Collect wall-clock duration estimates from a CSV log already read into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Contents of the CSV file.
    path : str
        Path of the CSV file; stored in each estimate for reporting.

    Returns
    -------
    list of tuple
        ``(source, path, seconds)`` entries. ``"csv_timestamps"`` is the span between
        the smallest and largest value of the first time-like column (only when it has
        more than one usable value); ``"csv_duration_col"`` is the sum of the first
        duration-like column.
    """
    estimates = []

    # Time-like columns: any name containing "time" (which also covers "timestamp") or "wall".
    ts_cols = [c for c in df.columns if "time" in c.lower() or "wall" in c.lower()]
    if ts_cols:
        column = df[ts_cols[0]]
        is_number = pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column)
        if is_number:
            # pd.to_datetime would read bare numbers as nanoseconds since the epoch, shrinking
            # a span of epoch-second timestamps by a factor of 1e9, so use the numbers directly.
            # NOTE(review): assumes numeric time columns are in seconds - confirm against
            # whatever logger wrote the file.
            seconds = column.dropna().astype(float)
            if len(seconds) > 1:
                estimates.append(("csv_timestamps", path, float(seconds.max() - seconds.min())))
        else:
            stamps = pd.to_datetime(column, errors="coerce").dropna()
            if len(stamps) > 1:
                # Timedelta.total_seconds() is independent of the datetime resolution,
                # unlike casting to int and dividing by 1e9.
                span = stamps.max() - stamps.min()
                estimates.append(("csv_timestamps", path, span.total_seconds()))

    # Duration-like columns: treat the column as per-row durations and total them.
    dur_cols = [c for c in df.columns if "duration" in c.lower() or "elapsed" in c.lower()]
    if dur_cols:
        total = float(df[dur_cols[0]].astype(float).sum())
        estimates.append(("csv_duration_col", path, total))

    return estimates


# Try each sidecar candidate as JSON, pickle or CSV; other extensions are skipped.
for f in candidates:
    lower = f.lower()
    try:
        if lower.endswith(".json"):
            with open(f, "r") as fh:
                data = json.load(fh)
        elif lower.endswith(".pkl"):
            # SECURITY: unpickling can execute arbitrary code. Only run this cell against
            # directories whose contents you trust.
            with open(f, "rb") as fh:
                data = pickle.load(fh)
        elif lower.endswith(".csv"):
            df = pd.read_csv(f)
            training_durations.extend(_csv_duration_estimates(df, f))
            # store data for inspection
            data = {"csv_rows": len(df), "columns": list(df.columns)}
        else:
            continue

        if isinstance(data, dict):
            # A dict whose values are all lists is treated as a Keras-style history.
            # The emptiness check keeps an empty dict from being reported as a parse failure.
            if data and all(isinstance(v, list) for v in data.values()):
                epochs = len(next(iter(data.values())))
                print(f"\nFound history-like dict in {f}: epochs ~= {epochs}, keys: {list(data.keys())[:10]}")
                # Some histories carry per-epoch timings under one of these keys.
                for k in ("time", "duration", "wall_time", "elapsed"):
                    if k in data:
                        try:
                            # every value in this branch is a list, so the field is summed
                            training_durations.append(("history_field", f, sum(data[k])))
                        except Exception as err:
                            print(f"Could not total history field {k!r} in {f}: {err}")
            else:
                print(f"\nLoaded JSON/Pickle file {f}: keys:", list(data.keys())[:20])
    except Exception as e:
        print("Could not parse", f, ":", e)

# Search the model directory (recursively) for TensorBoard event files and, for each one,
# take the spread of the records' wall_time values as a duration estimate.
tb_events = glob(os.path.join(d, "**", "events.out.tfevents.*"), recursive=True)
if tb_events:
    print("\nFound TensorBoard event files:", tb_events)

for ev in tb_events:
    try:
        # Records are read through TensorFlow's summary iterator; only those exposing
        # a wall_time attribute contribute.
        times = [
            record.wall_time
            for record in tf.compat.v1.train.summary_iterator(ev)
            if hasattr(record, "wall_time")
        ]
        if times:
            dur = max(times) - min(times)
            training_durations.append(("tensorboard_events", ev, dur))
    except Exception as e:
        print("Could not parse event file", ev, ":", e)

# Last-resort estimate: when no explicit duration was found, use the spread between the
# oldest and newest modification times of the sidecar files.
# At least two files are required - a single file always has a spread of 0 seconds, which
# would be reported as a duration and hide the "no duration found" guidance printed later.
if not training_durations and len(candidates) > 1:
    mtimes = [os.path.getmtime(f) for f in candidates]
    approx_dur = max(mtimes) - min(mtimes)
    training_durations.append(("file_timestamp_spread", "sidecars", approx_dur))

# Final report: list every duration estimate gathered above, or explain how to record one.
if not training_durations:
    print("\nNo explicit training duration found in nearby files or tensorboard events.")
    print("You may have to save training history or logs during training (e.g., History object, CSVLogger, or TensorBoard events) to record wall-clock training time.")
else:
    print("\nDiscovered training-duration estimates (source, file, seconds):")
    for src, f, sec in training_durations:
        # seconds_to_str renders the same value as a timedelta string
        print(f" - {src}: {f} -> {sec:.1f} sec (~{seconds_to_str(sec)})")

Model file: /nfsdata1/bwedig/lsst-strong-lens-data-challenge/models/v4.keras
File size (bytes): 33629702
Last modified: 2025-10-11T20:14:06.304979

Model name: functional
Total params: 2793529



Model summary:
 Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)        ┃ Output Shape      ┃    Param # ┃ Connected to      ┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ input_layer         │ (None, 41, 41, 5) │          0 │ -                 │
│ (InputLayer)        │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ conv2d (Conv2D)     │ (None, 21, 21,    │     16,128 │ input_layer[0][0] │
│                     │ 128)              │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ batch_normalization │ (None, 21, 21,    │        512 │ conv2d[0][0]      │
│ (BatchNormalizatio… │ 128)              │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ activation          │ (None, 21, 21, 