In [1]:
import os, subprocess, sys
from pathlib import Path

REPO_URL = "https://github.com/DeogenesMaranan/ngiml"  # update to your fork if needed
REPO_DIR = Path("/content/ngiml")

# Fetch the project sources: update an existing checkout, otherwise clone it.
# check=True turns any git failure into CalledProcessError so the cell stops
# instead of continuing with a missing or stale checkout.
if REPO_DIR.exists():
    subprocess.run(["git", "-C", str(REPO_DIR), "pull"], check=True)
else:
    subprocess.run(["git", "clone", REPO_URL, str(REPO_DIR)], check=True)

# Put the checkout first on the import path. Any earlier copy of the same
# entry is dropped first, so re-running this cell keeps exactly one entry
# (a plain insert would stack a duplicate on every re-run).
repo_path = str(REPO_DIR)
sys.path[:] = [repo_path, *(entry for entry in sys.path if entry != repo_path)]
print("Repo ready at", REPO_DIR)

In [2]:
import os
from pathlib import Path
from huggingface_hub import login, snapshot_download

# Dataset download settings. HF_TOKEN comes from the environment and is an
# empty string when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN", "")
DATASET_REPO = "juhenes/ngiml"
DATASET_REVISION = "main"
DATA_DIR = "/content/data"

# Only log in when a token was actually provided.
if HF_TOKEN:
    login(token=HF_TOKEN)

os.makedirs(DATA_DIR, exist_ok=True)
snapshot_download(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    local_dir=DATA_DIR,
    revision=DATASET_REVISION,
    # Pass None (not "") when no token is configured: an empty string would be
    # forwarded as an explicit token instead of meaning "no token supplied".
    token=HF_TOKEN or None,
    # NOTE(review): `resume_download=True` was dropped here. Recent
    # huggingface_hub releases deprecate it and resume interrupted downloads
    # by default -- confirm against the installed version.
)

# Summarize what was downloaded: manifest files and tar shard count.
root = Path(DATA_DIR)

# Keep only the two recognized manifest file names, in sorted path order.
manifest_files = sorted(
    filter(
        lambda candidate: candidate.name in ("manifest.parquet", "manifest.json"),
        root.rglob("manifest.*"),
    )
)

# One recursive glob per archive extension; the matches are simply counted.
tar_count = sum(
    1
    for pattern in ("*.tar", "*.tar.gz", "*.tgz")
    for _ in root.rglob(pattern)
)

print("Dataset ready at", DATA_DIR)
# Only the first five manifests are shown.
print("Found manifests:", list(map(str, manifest_files[:5])))
print("Tar shards count:", tar_count)

In [3]:
from google.colab import drive
from pathlib import Path

# Google Drive is mounted and used as the destination for checkpoints/logs.
DRIVE_MOUNT = "/content/drive"
OUTPUT_DIR = DRIVE_MOUNT + "/MyDrive/ngiml_runs"

# Mount first: the output directory lives under the mount point.
drive.mount(DRIVE_MOUNT)

# Create the run directory (and any missing parents); no error if it exists.
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)
print("Checkpoints will be written to", OUTPUT_DIR)


In [4]:
from pathlib import Path
import json
import dataclasses

from tools.colab_train_helpers import (
    find_or_resolve_manifest,
    build_default_components,
    build_training_config,
)

def _config_json_default(obj):
    """Fallback for json.dumps: expand dataclasses via asdict, else use str()."""
    if dataclasses.is_dataclass(obj):
        return dataclasses.asdict(obj)
    return str(obj)


# Locate the dataset manifest under the downloaded data directory.
data_root = Path(DATA_DIR)
MANIFEST_PATH = find_or_resolve_manifest(data_root)

# Default model/loss/augmentation components supplied by the project helper.
model_cfg, loss_cfg, default_aug, per_dataset_aug = build_default_components()

# Assemble the training configuration from the pieces above.
_config_kwargs = dict(
    manifest_path=MANIFEST_PATH,
    output_dir=OUTPUT_DIR,
    model_cfg=model_cfg,
    loss_cfg=loss_cfg,
    default_aug=default_aug,
    per_dataset_aug=per_dataset_aug,
)
training_config = build_training_config(**_config_kwargs)

# Show the configuration as indented JSON.
print(json.dumps(training_config, indent=2, default=_config_json_default))

In [None]:
import json
import dataclasses

from tools.colab_train_helpers import apply_colab_runtime_settings

# Set True to equalize per-dataset sampling frequency; False keeps natural dataset-size sampling.
BALANCE_SAMPLING = True

# Runtime-related entries echoed back after the settings have been applied.
_REPORTED_RUNTIME_KEYS = (
    "num_workers",
    "persistent_workers",
    "pin_memory",
    "auto_local_cache",
    "local_cache_dir",
    "compile_model",
    "compile_mode",
    "channels_last",
    "use_tf32",
    "balance_sampling",
)

training_config = apply_colab_runtime_settings(
    training_config,
    balance_sampling=BALANCE_SAMPLING,
)

# Per dataset: views_per_sample when its augmentation is enabled, otherwise 1.
effective_view_multiplier = dict(
    (dataset_name, aug.views_per_sample if aug.enable else 1)
    for dataset_name, aug in training_config.get("per_dataset_aug", {}).items()
)

print("Applied runtime settings:")
print({key: training_config[key] for key in _REPORTED_RUNTIME_KEYS})
print("Per-dataset views_per_sample:", effective_view_multiplier)

print("Effective training config (post-settings):")
print(
    json.dumps(
        training_config,
        indent=2,
        # Dataclasses are expanded with asdict; anything else falls back to str().
        default=lambda value: (
            dataclasses.asdict(value) if dataclasses.is_dataclass(value) else str(value)
        ),
    )
)

In [5]:
from importlib import reload
from tools import train_ngiml

# Reload the training module so this kernel uses its latest state;
# reload() hands back the module object, which is rebound to the same name.
train_ngiml = reload(train_ngiml)

# Build the TrainConfig from the prepared settings dict and start training.
cfg = train_ngiml.TrainConfig(**training_config)
train_ngiml.run_training(cfg)