# Multimodal Training Notebook (Audio + Video + Fusion)

This notebook runs a high-accuracy workflow with **video multiclass tuning first**:
1. Configure shared settings
2. Run video multiclass research sweep
3. Train/evaluate final video model with best config
4. Train/evaluate fusion model using trained audio + video checkpoints

In [6]:
# Cell 1 — Setup
import json, os, sys, shlex, subprocess, select, pty
from pathlib import Path

PROJECT_ROOT = Path.cwd()
PYTHON = sys.executable
CONFIG_PATH = PROJECT_ROOT / 'configs' / 'master_config.json'

def _stream(cmd, cwd=None):
    cmd = [str(c) for c in cmd]
    print('$', ' '.join(shlex.quote(c) for c in cmd), flush=True)
    env = {**os.environ, 'PYTHONUNBUFFERED': '1'}

    master, slave = pty.openpty()
    p = subprocess.Popen(
        cmd, cwd=str(cwd or PROJECT_ROOT), env=env,
        stdin=slave, stdout=slave, stderr=slave, close_fds=True,
    )
    os.close(slave)

    while p.poll() is None:
        r, _, _ = select.select([master], [], [], 0.05)
        if r:
            try:
                data = os.read(master, 4096)
            except OSError as e:
                # PTY often raises EIO when child has exited; treat as EOF
                if getattr(e, 'errno', None) == 5:
                    break
                raise
            if not data:
                break
            sys.stdout.write(data.decode('utf-8', errors='replace'))
            sys.stdout.flush()

    try:
        while True:
            r, _, _ = select.select([master], [], [], 0.1)
            if not r:
                break
            try:
                data = os.read(master, 4096)
            except OSError as e:
                if getattr(e, 'errno', None) == 5:
                    break
                raise
            if not data:
                break
            sys.stdout.write(data.decode('utf-8', errors='replace'))
            sys.stdout.flush()
    except OSError:
        pass

    try:
        os.close(master)
    except OSError:
        pass

    rc = p.wait()
    print(f'\n[exit {rc}]', flush=True)
    if rc != 0:
        raise subprocess.CalledProcessError(rc, cmd)

with open(CONFIG_PATH) as f:
    CFG = json.load(f)

print(f'Python      : {PYTHON}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Config path : {CONFIG_PATH}')

Python      : /home/alolli/miniconda3/envs/therness_env/bin/python
Project root: /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito
Config path : /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito/configs/master_config.json


In [None]:
# Cell 2 — High-accuracy multimodal config
# Edit only if needed, then run this cell once before training.

DATA_ROOT = '/data1/malto/therness/data/Hackathon'
NUM_CLASSES = 7

AUDIO_CKPT = 'checkpoints/audio_multiclass/best_model.pt'
VIDEO_CKPT = 'checkpoints/video/best_model.pt'
FUSION_CKPT = 'checkpoints/fusion/best_model.pt'

# Video training — loads pre-extracted JPEGs (run video/extract_frames.py once)
CFG.setdefault('video_window', {})
CFG['video_window'].setdefault('model', {})
CFG['video_window'].setdefault('training', {})
CFG['video_window']['model'].update({
    'pretrained': True,
    'dropout': 0.12,   # best region from latest research
})
CFG['video_window']['training'].update({
    'epochs': 70,
    'batch_size': 32,
    'lr': 1e-4,
    'weight_decay': 7e-05,
    'num_frames': 12,
    'clip_seconds': None,
    'img_size': 160,
    'frames_dir': 'data/video_frames',
    'seed': 42,
    'num_workers': 4,
    'metric': 'hackathon_combined',
    'class_weights': 'inverse_frequency',
    'class_weight_power': 1.0,
    'use_balanced_sampler': True,
    'balanced_sampler_power': 0.35,
    'split_strategy': 'group_shuffle',
    'patience': 20,
    'checkpoint_dir': 'checkpoints/video',
    'lr_schedule': {
        'warmup_ratio': 0.1,
        'plateau_factor': 0.5,
        'plateau_patience': 4,
        'plateau_threshold': 1e-3,
        'plateau_min_lr': 1e-6,
    },
})

# Fusion training (frozen backbones + fusion head)
CFG.setdefault('fusion', {})
CFG['fusion'].setdefault('model', {})
CFG['fusion'].setdefault('training', {})
CFG['fusion']['model'].update({
    'arch': 'temporal',
    'audio_dim': 128,
    'video_dim': 128,
    'hidden_dim': 192,
    'dropout': 0.2,
    'temporal_layers': 1,
})
CFG['fusion']['training'].update({
    'num_epochs': 90,
    'lr': 1e-4,
    'weight_decay': 1e-4,
    'batch_size': 64,
    'sequence_len': 12,
    'val_split': 0.2,
    'seed': 42,
    'patience': 22,
    'checkpoint_dir': 'checkpoints/fusion',
})

CFG['data_root'] = DATA_ROOT
CFG['num_classes'] = NUM_CLASSES
CFG['device'] = 'auto'

with open(CONFIG_PATH, 'w') as f:
    json.dump(CFG, f, indent=2)

print('Config updated.')
print(f'  video epochs      : {CFG["video_window"]["training"]["epochs"]}')
print(f'  video num_frames  : {CFG["video_window"]["training"]["num_frames"]}')
print(f'  video clip_seconds: {CFG["video_window"]["training"]["clip_seconds"]}')
print(f'  video img_size    : {CFG["video_window"]["training"]["img_size"]}')
print(f'  video dropout     : {CFG["video_window"]["model"]["dropout"]}')
print(f'  video lr          : {CFG["video_window"]["training"]["lr"]}')
print(f'  video wd          : {CFG["video_window"]["training"]["weight_decay"]}')
print(f'  fusion arch       : {CFG["fusion"]["model"]["arch"]}')
print(f'  fusion seq len    : {CFG["fusion"]["training"]["sequence_len"]}')
print(f'  fusion epochs     : {CFG["fusion"]["training"]["num_epochs"]}')
print(f'  audio ckpt: {AUDIO_CKPT}')
print(f'  video ckpt: {VIDEO_CKPT}')

Config updated.
  video epochs      : 70
  video num_frames  : 12
  video clip_seconds: None
  video img_size    : 160
  video dropout     : 0.12
  video lr          : 0.0001
  video wd          : 7e-05
  fusion epochs     : 90
  audio ckpt: checkpoints/audio_multiclass/best_model.pt
  video ckpt: checkpoints/video/best_model.pt


In [None]:
# Cell 3 — Resolve checkpoint paths (fallback to best video research ckpt)
import glob

if not (PROJECT_ROOT / AUDIO_CKPT).exists():
    raise FileNotFoundError(f'Audio checkpoint not found: {(PROJECT_ROOT / AUDIO_CKPT).resolve()}')

if not (PROJECT_ROOT / VIDEO_CKPT).exists():
    best_video_ckpt = None
    best_tuple = None
    for d in glob.glob(str(PROJECT_ROOT / 'checkpoints' / 'video_research_*')):
        m_path = Path(d) / 'best_metrics.json'
        p_path = Path(d) / 'best_model.pt'
        if not (m_path.exists() and p_path.exists()):
            continue
        with open(m_path) as f:
            m = json.load(f)
        macro_f1 = float(m.get('val_f1', -1.0))
        score = float(m.get('hackathon_score', -1.0))
        row = (macro_f1, score, str(p_path.relative_to(PROJECT_ROOT)))
        if best_tuple is None or row > best_tuple:
            best_tuple = row
            best_video_ckpt = row[2]

    if best_video_ckpt is None:
        raise FileNotFoundError(
            f'Video checkpoint not found at {(PROJECT_ROOT / VIDEO_CKPT).resolve()} and no video_research fallback available.'
        )

    VIDEO_CKPT = best_video_ckpt
    print('VIDEO_CKPT fallback selected from research checkpoints:')
    print(f'  {VIDEO_CKPT} (macroF1={best_tuple[0]:.4f}, score={best_tuple[1]:.4f})')
else:
    print(f'VIDEO_CKPT found: {(PROJECT_ROOT / VIDEO_CKPT).resolve()}')

print(f'Active AUDIO_CKPT: {(PROJECT_ROOT / AUDIO_CKPT).resolve()}')
print(f'Active VIDEO_CKPT: {(PROJECT_ROOT / VIDEO_CKPT).resolve()}')

In [8]:
# Cell 3 — Rebuild video frame cache to match training config
import shutil

RUN_FRAME_EXTRACTION = True
FORCE_CLEAN_FRAME_CACHE = True  # True = remove old JPEG cache before extraction
FRAME_EXTRACT_WORKERS = 16

if RUN_FRAME_EXTRACTION:
    with open(CONFIG_PATH) as f:
        cfg_live = json.load(f)

    vw_train = cfg_live.get('video_window', {}).get('training', {})
    frames_dir = vw_train.get('frames_dir', 'data/video_frames')
    num_frames = int(vw_train.get('num_frames', 8))
    img_size = int(vw_train.get('img_size', 160))

    if FORCE_CLEAN_FRAME_CACHE and (PROJECT_ROOT / frames_dir).exists():
        shutil.rmtree(PROJECT_ROOT / frames_dir)
        print(f'Removed old frame cache: {(PROJECT_ROOT / frames_dir).resolve()}')

    cmd = [
        PYTHON, '-u', '-m', 'video.extract_frames',
        '--data_root', cfg_live['data_root'],
        '--out_dir', str(PROJECT_ROOT / frames_dir),
        '--num_frames', str(num_frames),
        '--img_size', str(img_size),
        '--workers', str(FRAME_EXTRACT_WORKERS),
        '--overwrite',
    ]
    _stream(cmd)

    manifest_path = PROJECT_ROOT / frames_dir / 'manifest.json'
    if not manifest_path.exists():
        raise FileNotFoundError(f'Manifest missing after extraction: {manifest_path}')

    with open(manifest_path) as f:
        manifest = json.load(f)

    m_num_frames = int(manifest.get('num_frames', -1))
    n_entries = len(manifest.get('entries', []))
    if m_num_frames != num_frames:
        raise RuntimeError(
            f'Manifest num_frames mismatch: expected {num_frames}, got {m_num_frames}'
        )

    print('Frame extraction complete and validated.')
    print(f'  frames_dir : {(PROJECT_ROOT / frames_dir).resolve()}')
    print(f'  entries    : {n_entries}')
    print(f'  num_frames : {m_num_frames}')
    print(f'  img_size   : {manifest.get("img_size", "?")}')
else:
    print('RUN_FRAME_EXTRACTION=False — skipped.')

Removed old frame cache: /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito/data/video_frames
$ /home/alolli/miniconda3/envs/therness_env/bin/python -u -m video.extract_frames --data_root /data1/malto/therness/data/Hackathon --out_dir /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito/data/video_frames --num_frames 12 --img_size 160 --workers 16 --overwrite
Scanning videos in /data1/malto/therness/data/Hackathon...
       Scanning good_weld...
       Scanning defect-weld...
Found 1551 videos
Extracting: 100% 1551/1551 [01:11<00:00, 21.71it/s]

Done: 1551 ok, 0 failed
Frames saved to: /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito/data/video_frames
Manifest written to: /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito/data/video_frames/manifest.json

[exit 0]
Frame extraction complete and validated.
  frames_dir : /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito/data/video_frames
  entries    : 1551
  num_frames : 12
  i

In [12]:
# Cell 4 — Video multiclass research sweep (auto frame sync + F1-aware ranking)
import copy
import shutil
import time
import torch

RUN_VIDEO_RESEARCH = True
FAST_MODE = True
MAX_TRIALS = 12
RESUME_RESEARCH = False

# Research target/selection
TARGET_HACKATHON_SCORE = 0.90
RESEARCH_OBJECTIVE = 'macro_f1'

# Auto-manage frame cache from current config before running trials
AUTO_SYNC_FRAME_CACHE = True
FRAME_EXTRACT_WORKERS = 16
FORCE_REEXTRACT_IF_MISMATCH = True

# Optional: apply best params and immediately start a long final training
RUN_VIDEO_FINAL_AFTER_RESEARCH = False
VIDEO_FINAL_EPOCHS = 140
RESET_VIDEO_FINAL_CKPT = True

if RUN_VIDEO_RESEARCH:
    with open(CONFIG_PATH) as f:
        base_cfg = json.load(f)

    RESEARCH_EPOCHS = 20 if FAST_MODE else 40
    RESEARCH_PATIENCE = 7 if FAST_MODE else 12

    def _ensure_frame_cache(cfg):
        vw = cfg.get('video_window', {}).get('training', {})
        frames_dir = vw.get('frames_dir', 'data/video_frames')
        num_frames = int(vw.get('num_frames', 8))
        img_size = int(vw.get('img_size', 160))
        manifest_path = PROJECT_ROOT / frames_dir / 'manifest.json'

        needs_extract = True
        reason = 'manifest missing'
        if manifest_path.exists():
            try:
                with open(manifest_path) as f:
                    manifest = json.load(f)
                m_num_frames = int(manifest.get('num_frames', -1))
                m_img_size = int(manifest.get('img_size', -1))
                n_entries = len(manifest.get('entries', []))
                if m_num_frames == num_frames and m_img_size == img_size and n_entries > 0:
                    needs_extract = False
                    reason = 'cache already matches config'
                else:
                    reason = (
                        f'cache mismatch (manifest num_frames={m_num_frames}, img_size={m_img_size}, entries={n_entries})'
                    )
            except Exception as e:
                reason = f'manifest unreadable ({e})'

        if not needs_extract:
            print(f'Frame cache check: OK — {reason}')
            return

        if not FORCE_REEXTRACT_IF_MISMATCH:
            raise RuntimeError(f'Frame cache invalid and FORCE_REEXTRACT_IF_MISMATCH=False: {reason}')

        print(f'Frame cache check: rebuilding — {reason}')
        cmd = [
            PYTHON, '-u', '-m', 'video.extract_frames',
            '--data_root', cfg['data_root'],
            '--out_dir', str(PROJECT_ROOT / frames_dir),
            '--num_frames', str(num_frames),
            '--img_size', str(img_size),
            '--workers', str(FRAME_EXTRACT_WORKERS),
            '--clean',
            '--overwrite',
        ]
        _stream(cmd)

        if not manifest_path.exists():
            raise FileNotFoundError(f'Manifest not found after extraction: {manifest_path}')

        with open(manifest_path) as f:
            manifest = json.load(f)
        m_num_frames = int(manifest.get('num_frames', -1))
        m_img_size = int(manifest.get('img_size', -1))
        n_entries = len(manifest.get('entries', []))
        if m_num_frames != num_frames or m_img_size != img_size or n_entries <= 0:
            raise RuntimeError(
                f'Invalid manifest after extraction: num_frames={m_num_frames}, img_size={m_img_size}, entries={n_entries}'
            )

        print('Frame cache rebuilt and validated.')
        print(f'  frames_dir : {(PROJECT_ROOT / frames_dir).resolve()}')
        print(f'  entries    : {n_entries}')
        print(f'  num_frames : {m_num_frames}')
        print(f'  img_size   : {m_img_size}')

    if AUTO_SYNC_FRAME_CACHE:
        _ensure_frame_cache(base_cfg)
    else:
        print('AUTO_SYNC_FRAME_CACHE=False — skipping frame cache sync.')

    # Focused around latest winner: lr=1e-4, wd=7e-05, dropout=0.12, cwp=1.0, bsp=0.35
    candidates = [
        {'lr': 1.0e-4, 'weight_decay': 7.0e-5, 'dropout': 0.12, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.35},
        {'lr': 1.2e-4, 'weight_decay': 7.0e-5, 'dropout': 0.12, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.35},
        {'lr': 8.0e-5, 'weight_decay': 7.0e-5, 'dropout': 0.12, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.35},
        {'lr': 1.0e-4, 'weight_decay': 5.0e-5, 'dropout': 0.12, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.35},
        {'lr': 1.0e-4, 'weight_decay': 1.0e-4, 'dropout': 0.12, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.35},
        {'lr': 1.0e-4, 'weight_decay': 7.0e-5, 'dropout': 0.10, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.35},
        {'lr': 1.0e-4, 'weight_decay': 7.0e-5, 'dropout': 0.14, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.35},
        {'lr': 1.0e-4, 'weight_decay': 7.0e-5, 'dropout': 0.12, 'class_weight_power': 0.95, 'balanced_sampler_power': 0.35},
        {'lr': 1.0e-4, 'weight_decay': 7.0e-5, 'dropout': 0.12, 'class_weight_power': 1.05, 'balanced_sampler_power': 0.35},
        {'lr': 1.0e-4, 'weight_decay': 7.0e-5, 'dropout': 0.12, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.30},
        {'lr': 1.0e-4, 'weight_decay': 7.0e-5, 'dropout': 0.12, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.40},
        {'lr': 1.2e-4, 'weight_decay': 1.0e-4, 'dropout': 0.10, 'class_weight_power': 1.00, 'balanced_sampler_power': 0.40},
    ]

    if FAST_MODE:
        candidates = candidates[:MAX_TRIALS]

    print(f"Video research: {len(candidates)} trials × {RESEARCH_EPOCHS} epochs")
    print(
        f"FAST_MODE={FAST_MODE} | RESUME_RESEARCH={RESUME_RESEARCH} | "
        f"OBJECTIVE={RESEARCH_OBJECTIVE} | TARGET_SCORE={TARGET_HACKATHON_SCORE:.2f} | "
        f"RUN_VIDEO_FINAL_AFTER_RESEARCH={RUN_VIDEO_FINAL_AFTER_RESEARCH}\n"
    )

    def _score_from_ckpt(best_pt, ckpt_dir):
        metrics_json = Path(ckpt_dir) / 'best_metrics.json'
        if metrics_json.exists():
            with open(metrics_json) as f:
                m = json.load(f)
            score = float(m.get('hackathon_score', m.get('val_f1', -1.0)))
            macro_f1 = float(m.get('val_f1', -1.0))
            epoch = int(m.get('epoch', -1))
            return score, macro_f1, epoch

        err = None
        for _ in range(3):
            try:
                try:
                    ck = torch.load(str(best_pt), map_location='cpu', weights_only=True)
                except TypeError:
                    ck = torch.load(str(best_pt), map_location='cpu')
                score = float(ck.get('hackathon_score', ck.get('val_f1', -1.0)))
                macro_f1 = float(ck.get('val_f1', -1.0))
                epoch = int(ck.get('epoch', -1))
                return score, macro_f1, epoch
            except Exception as e:
                err = e
                time.sleep(0.7)
        raise RuntimeError(f'Unable to read checkpoint after retries: {err}')

    def _maybe_read_ckpt(best_pt, ckpt_dir):
        if not best_pt.exists() and not (Path(ckpt_dir) / 'best_metrics.json').exists():
            return None
        try:
            return _score_from_ckpt(best_pt, ckpt_dir)
        except Exception as e:
            print(f"  ↳ Checkpoint/metrics unreadable ({e}); retraining this trial")
            return None

    def _objective_value(result_row):
        if RESEARCH_OBJECTIVE == 'hackathon_score':
            return float(result_row['score'])
        return float(result_row['macro_f1'])

    def _run_trial(params, idx, total):
        tag = (
            f"lr={params['lr']}_wd={params['weight_decay']}"
            f"_do={params['dropout']}_cwp={params['class_weight_power']}"
            f"_bsp={params['balanced_sampler_power']}"
        )
        ckpt_dir = f"checkpoints/video_research_{tag}"
        best_pt = Path(ckpt_dir) / 'best_model.pt'

        trial_cfg = copy.deepcopy(base_cfg)
        tw = trial_cfg['video_window']['training']
        tm = trial_cfg['video_window']['model']

        tw['epochs'] = RESEARCH_EPOCHS
        tw['patience'] = RESEARCH_PATIENCE
        tw['checkpoint_dir'] = ckpt_dir
        tw['lr'] = params['lr']
        tw['weight_decay'] = params['weight_decay']
        tw['class_weight_power'] = params['class_weight_power']
        tw['use_balanced_sampler'] = True
        tw['balanced_sampler_power'] = params['balanced_sampler_power']
        tm['dropout'] = params['dropout']

        with open(CONFIG_PATH, 'w') as f:
            json.dump(trial_cfg, f, indent=2)

        print(f"[{idx:02d}/{total:02d}] {params}")
        print(f"  → {ckpt_dir}")

        if RESUME_RESEARCH:
            prev = _maybe_read_ckpt(best_pt, ckpt_dir)
            if prev is not None:
                score, macro_f1, epoch = prev
                print("  ↳ Reusing existing checkpoint")
                return {**params, 'score': score, 'macro_f1': macro_f1, 'epoch': epoch, 'ckpt': ckpt_dir}

        try:
            _stream([PYTHON, '-u', '-m', 'video.run_video', '--config', str(CONFIG_PATH)])
            curr = _maybe_read_ckpt(best_pt, ckpt_dir)
            if curr is None:
                raise RuntimeError(f"best_model.pt/best_metrics.json missing in {ckpt_dir}")
            score, macro_f1, epoch = curr
            return {**params, 'score': score, 'macro_f1': macro_f1, 'epoch': epoch, 'ckpt': ckpt_dir}
        except Exception as e:
            print(f"  FAILED: {e}")
            return {**params, 'score': -1.0, 'macro_f1': -1.0, 'epoch': -1, 'ckpt': ckpt_dir}

    results = []
    for i, params in enumerate(candidates, start=1):
        row = _run_trial(params, i, len(candidates))
        results.append(row)
        if row['score'] >= TARGET_HACKATHON_SCORE:
            print(
                f"\nTarget reached (hackathon_score={row['score']:.4f} >= {TARGET_HACKATHON_SCORE:.2f}). Stopping sweep early."
            )
            break

    results.sort(key=lambda r: (_objective_value(r), r['score']), reverse=True)

    print("\n" + "=" * 120)
    print(f"VIDEO RESEARCH RESULTS — sorted by {RESEARCH_OBJECTIVE} (tie-break: hackathon_score)")
    print("=" * 120)
    for i, r in enumerate(results):
        mark = '★' if i == 0 else ' '
        print(
            f"  {mark} objective={_objective_value(r):.4f} score={r['score']:.4f} macroF1={r['macro_f1']:.4f} ep={r['epoch']:3d} "
            f"lr={r['lr']} wd={r['weight_decay']} do={r['dropout']} "
            f"cwp={r['class_weight_power']} bsp={r['balanced_sampler_power']}"
        )
    print("=" * 120)

    if not results or results[0]['score'] < 0:
        with open(CONFIG_PATH, 'w') as f:
            json.dump(base_cfg, f, indent=2)
        raise RuntimeError('All video research trials failed.')

    best = results[0]
    print("\nBest video config:")
    print(
        f"  objective={_objective_value(best):.4f} | score={best['score']:.4f} | macroF1={best['macro_f1']:.4f} | "
        f"lr={best['lr']} | wd={best['weight_decay']} | dropout={best['dropout']} | "
        f"class_weight_power={best['class_weight_power']} | balanced_sampler_power={best['balanced_sampler_power']}"
    )

    final_cfg = copy.deepcopy(base_cfg)
    fw = final_cfg['video_window']['training']
    fm = final_cfg['video_window']['model']

    fw['lr'] = best['lr']
    fw['weight_decay'] = best['weight_decay']
    fw['class_weight_power'] = best['class_weight_power']
    fw['use_balanced_sampler'] = True
    fw['balanced_sampler_power'] = best['balanced_sampler_power']
    fm['dropout'] = best['dropout']

    if RUN_VIDEO_FINAL_AFTER_RESEARCH:
        fw['epochs'] = VIDEO_FINAL_EPOCHS
        fw['patience'] = max(20, fw.get('patience', 12))
        fw['checkpoint_dir'] = 'checkpoints/video'

        final_ckpt_dir = Path(fw['checkpoint_dir'])
        if RESET_VIDEO_FINAL_CKPT and final_ckpt_dir.exists():
            shutil.rmtree(final_ckpt_dir)
            print(f"Removed checkpoint dir: {final_ckpt_dir.resolve()}")

        with open(CONFIG_PATH, 'w') as f:
            json.dump(final_cfg, f, indent=2)

        print(f"\nStarting final video training for {VIDEO_FINAL_EPOCHS} epochs...")
        _stream([PYTHON, '-u', '-m', 'video.run_video', '--config', str(CONFIG_PATH)])
        print('Final video training finished.')
    else:
        fw['checkpoint_dir'] = 'checkpoints/video'
        with open(CONFIG_PATH, 'w') as f:
            json.dump(final_cfg, f, indent=2)
        print('\nBest params written to master_config.json.')
        print('Run the next cell to train video with these best params.')
else:
    print('RUN_VIDEO_RESEARCH=False — skipped.')

Frame cache check: OK — cache already matches config
Video research: 12 trials × 20 epochs
FAST_MODE=True | RESUME_RESEARCH=False | OBJECTIVE=macro_f1 | TARGET_SCORE=0.90 | RUN_VIDEO_FINAL_AFTER_RESEARCH=False

[01/12] {'lr': 0.0001, 'weight_decay': 7e-05, 'dropout': 0.12, 'class_weight_power': 1.0, 'balanced_sampler_power': 0.35}
  → checkpoints/video_research_lr=0.0001_wd=7e-05_do=0.12_cwp=1.0_bsp=0.35
$ /home/alolli/miniconda3/envs/therness_env/bin/python -u -m video.run_video --config /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito/configs/master_config.json


Device: cuda

Discovering video files in /data1/malto/therness/data/Hackathon...
       Scanning good_weld...
       Scanning defect-weld...
Found 1551 videos
  Code 00 (class 0): 731 videos
  Code 01 (class 1): 259 videos
  Code 02 (class 2): 169 videos
  Code 06 (class 3): 79 videos
  Code 07 (class 4): 158 videos
  Code 08 (class 5): 80 videos
  Code 11 (class 6): 75 videos
Train: 1268 videos | Val: 283 videos
Using pre-extracted frames from data/video_frames
Train: 1268 | Val: 283 | num_frames=12 [JPEG]
Balanced sampler: enabled (power=0.35, videos/epoch=1268)
Model parameters: 190,759
Class weights: ['0.150', '0.476', '0.718', '1.606', '0.740', '1.895', '1.414']
Config saved to checkpoints/video_research_lr=0.0001_wd=7e-05_do=0.12_cwp=1.0_bsp=0.35/config.json

  TRAINING START — 20 epochs
  Checkpoint dir: checkpoints/video_research_lr=0.0001_wd=7e-05_do=0.12_cwp=1.0_bsp=0.35


Epoch 1/20
----------------------------------------
Train loss: 1.9692 | Train F1: 0.0278 | Val loss: 2.

In [13]:
# Cell 4 — Export deploy .pt models on GPU (audio + video)
import copy
import torch

RUN_EXPORT_PT = True
EXPORT_DEVICE = 'cuda'   # force GPU export
AUDIO_DEPLOY_PT = 'checkpoints/audio_multiclass/deploy_multiclass.pt'
VIDEO_DEPLOY_PT = 'checkpoints/video/deploy_video.pt'

if RUN_EXPORT_PT:
    if EXPORT_DEVICE == 'cuda' and not torch.cuda.is_available():
        raise RuntimeError('EXPORT_DEVICE=cuda requested, but CUDA is not available.')

    with open(CONFIG_PATH) as f:
        cfg_live = json.load(f)

    # ---- Audio deploy TorchScript (.pt) ----
    _stream([
        PYTHON, '-u', '-m', 'audio.export_deploy_pt',
        '--checkpoint', AUDIO_CKPT,
        '--output', AUDIO_DEPLOY_PT,
        '--device', EXPORT_DEVICE,
    ])

    # ---- Video deploy TorchScript (.pt) ----
    from models.video_backbone import VideoCNNBackbone
    from video.video_processing import WeldVideoModel

    num_classes = int(cfg_live.get('num_classes', 7))
    dropout = float(cfg_live.get('video_window', {}).get('model', {}).get('dropout', 0.2))

    backbone = VideoCNNBackbone(num_classes=num_classes, dropout=dropout)
    model = WeldVideoModel(backbone)

    device = torch.device(EXPORT_DEVICE)
    try:
        ckpt = torch.load(VIDEO_CKPT, map_location=device, weights_only=True)
    except TypeError:
        ckpt = torch.load(VIDEO_CKPT, map_location=device)

    state = ckpt.get('model_state_dict', ckpt)
    model.load_state_dict(state)
    model.to(device).eval()

    class DeployVideoPT(torch.nn.Module):
        def __init__(self, base_model):
            super().__init__()
            self.base_model = base_model

        def forward(self, frames: torch.Tensor) -> torch.Tensor:
            # frames: (B, N, 3, H, W)
            return self.base_model(frames)

    deploy_video = DeployVideoPT(model).to(device).eval()
    example = torch.randn(1, 4, 3, 160, 160, device=device)
    scripted_video = torch.jit.trace(deploy_video, example)

    out_path = PROJECT_ROOT / VIDEO_DEPLOY_PT
    out_path.parent.mkdir(parents=True, exist_ok=True)
    scripted_video.save(str(out_path))

    print('Export finished.')
    print(f'  audio deploy pt: {(PROJECT_ROOT / AUDIO_DEPLOY_PT).resolve()}')
    print(f'  video deploy pt: {out_path.resolve()}')
    print(f'  export device  : {device}')
else:
    print('RUN_EXPORT_PT=False — skipped.')

$ /home/alolli/miniconda3/envs/therness_env/bin/python -u -m audio.export_deploy_pt --checkpoint checkpoints/audio_multiclass/best_model.pt --output checkpoints/audio_multiclass/deploy_multiclass.pt --device cuda
Export mode : multiclass (DeployMulticlassFile)
  num_classes    = 7
  export_device  = cuda
  chunk_samples  = 16000  (1.0s @ 16000 Hz)

Saved: /home/alolli/src/malto/hackathon/therness-hackaton-2026-polito/checkpoints/audio_multiclass/deploy_multiclass.pt
Methods available on loaded model:
  model(waveform)              → file-level prediction
  model.predict_window(window) → single-window prediction
  model.extract_window_activation(window)    → (128,) embedding
  model.extract_file_activations(waveform)   → (T, 128) embeddings
  model.extract_file_activation_mean(waveform) → (128,) embedding
  model.extract_window_activations(window)   → all stage/head activations
  model.extract_file_activation_summary(waveform) → mean stage/head activations

[exit 0]


FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints/video/best_model.pt'

In [None]:
# Cell 4 — Train final video backbone (using best params from previous cell)
import shutil

RUN_VIDEO_TRAIN = True
RESET_VIDEO_CKPT = False

with open(CONFIG_PATH) as f:
    cfg_live = json.load(f)
video_ckpt_dir = Path(cfg_live['video_window']['training'].get('checkpoint_dir', 'checkpoints/video'))

if RUN_VIDEO_TRAIN:
    if RESET_VIDEO_CKPT and video_ckpt_dir.exists():
        shutil.rmtree(video_ckpt_dir)
        print(f"Removed checkpoint dir: {video_ckpt_dir.resolve()}")

    _stream([
        PYTHON, '-u', '-m', 'video.run_video',
        '--config', str(CONFIG_PATH),
    ])
else:
    print('RUN_VIDEO_TRAIN=False — skipped.')

In [None]:
# Cell 4 — Evaluate video checkpoint
RUN_VIDEO_EVAL = True

if RUN_VIDEO_EVAL:
    _stream([
        PYTHON, '-u', '-m', 'video.run_video',
        '--config', str(CONFIG_PATH),
        '--test_only',
        '--checkpoint', VIDEO_CKPT,
    ])
else:
    print('RUN_VIDEO_EVAL=False — skipped.')

In [None]:
# Cell 5 — Train fusion model (true multimodal)
RUN_FUSION_TRAIN = True

if RUN_FUSION_TRAIN:
    _stream([
        PYTHON, '-u', '-m', 'fusion.run_fusion',
        '--config', str(CONFIG_PATH),
        '--audio_checkpoint', AUDIO_CKPT,
        '--video_checkpoint', VIDEO_CKPT,
    ])
else:
    print('RUN_FUSION_TRAIN=False — skipped.')

In [None]:
# Cell 6 — Evaluate fusion checkpoint
RUN_FUSION_EVAL = True

if RUN_FUSION_EVAL:
    _stream([
        PYTHON, '-u', '-m', 'fusion.run_fusion',
        '--config', str(CONFIG_PATH),
        '--audio_checkpoint', AUDIO_CKPT,
        '--video_checkpoint', VIDEO_CKPT,
        '--test_only',
        '--checkpoint', FUSION_CKPT,
    ])
else:
    print('RUN_FUSION_EVAL=False — skipped.')

In [None]:
# Cell 7 — Fusion diagnostics (full vs audio-only vs video-only)
import glob
import os
import re
import numpy as np
import torch
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from fusion.fusion_model import FusionModel, TemporalFusionModel
from fusion.run_fusion import (
    _load_audio_backbone, _load_video_backbone,
    extract_audio_embeddings, extract_video_embeddings,
    infer_file_label, _build_video_index, _match_video_files,
    load_config,
)

RUN_FUSION_DIAGNOSTICS = True
FUSION_DIAG_CHECKPOINT = FUSION_CKPT

if RUN_FUSION_DIAGNOSTICS:
    cfg = load_config(str(CONFIG_PATH))
    data_root = cfg['data_root']
    num_classes = int(cfg.get('num_classes', 7))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    fusion_cfg = cfg.get('fusion', {})
    fusion_model_cfg = fusion_cfg.get('model', {})
    fusion_train_cfg = fusion_cfg.get('training', {})
    audio_cfg = cfg['audio']['feature_params']
    video_train_cfg = cfg.get('video_window', {}).get('training', {})

    # Match run_fusion split to evaluate on the same val protocol
    all_audio_files = sorted(glob.glob(os.path.join(data_root, '**', '*.flac'), recursive=True))
    if not all_audio_files:
        raise FileNotFoundError(f'No .flac files in {data_root}')

    file_labels = [infer_file_label(f, data_root) for f in all_audio_files]
    label_names = sorted(set(file_labels))
    label_to_idx = {l: i for i, l in enumerate(label_names)}
    labels = [label_to_idx[l] for l in file_labels]

    seed = int(fusion_train_cfg.get('seed', 42))
    val_split = float(fusion_train_cfg.get('val_split', 0.2))
    train_files, val_files, train_labels, val_labels = train_test_split(
        all_audio_files, labels, test_size=val_split, random_state=seed, stratify=labels
    )

    # Load frozen backbones
    audio_dropout = float(cfg['audio']['model'].get('dropout', 0.15))
    video_dropout = float(cfg.get('video_window', {}).get('model', {}).get('dropout', 0.2))

    audio_model = _load_audio_backbone(AUDIO_CKPT, audio_cfg, audio_dropout, device)
    video_model = _load_video_backbone(VIDEO_CKPT, num_classes, video_dropout, device)

    # Build embeddings for the same split
    print('Extracting diagnostic embeddings...')
    train_audio_embs = extract_audio_embeddings(audio_model, train_files, audio_cfg, device)
    val_audio_embs = extract_audio_embeddings(audio_model, val_files, audio_cfg, device)

    video_index = _build_video_index(data_root)
    train_video_files, n_train_matched = _match_video_files(train_files, video_index)
    val_video_files, n_val_matched = _match_video_files(val_files, video_index)
    print(f'Train video match: {n_train_matched}/{len(train_files)}')
    print(f'Val video match  : {n_val_matched}/{len(val_files)}')

    train_video_embs = extract_video_embeddings(video_model, train_video_files, video_train_cfg, device)
    val_video_embs = extract_video_embeddings(video_model, val_video_files, video_train_cfg, device)

    # Load trained fusion head
    fusion_arch = str(fusion_model_cfg.get('arch', 'mlp')).lower()
    if fusion_arch in {'temporal', 'gru', 'sequence'}:
        fusion_model = TemporalFusionModel(
            audio_dim=int(fusion_model_cfg.get('audio_dim', 128)),
            video_dim=int(fusion_model_cfg.get('video_dim', 128)),
            hidden_dim=int(fusion_model_cfg.get('hidden_dim', 128)),
            num_classes=num_classes,
            dropout=float(fusion_model_cfg.get('dropout', 0.2)),
            num_layers=int(fusion_model_cfg.get('temporal_layers', 1)),
        ).to(device)
    else:
        fusion_model = FusionModel(
            audio_dim=int(fusion_model_cfg.get('audio_dim', 128)),
            video_dim=int(fusion_model_cfg.get('video_dim', 128)),
            hidden_dim=int(fusion_model_cfg.get('hidden_dim', 128)),
            num_classes=num_classes,
            dropout=float(fusion_model_cfg.get('dropout', 0.2)),
        ).to(device)

    try:
        ckpt = torch.load(FUSION_DIAG_CHECKPOINT, map_location=device, weights_only=True)
    except TypeError:
        ckpt = torch.load(FUSION_DIAG_CHECKPOINT, map_location=device)

    state = ckpt.get('model_state_dict', ckpt)
    fusion_model.load_state_dict(state)
    fusion_model.eval()

    good_weld_idx = int(ckpt.get('good_weld_idx', label_to_idx.get('good_weld', 0)))
    y_true = np.array(val_labels, dtype=int)

    @torch.no_grad()
    def _eval_mode(mode: str):
        a = val_audio_embs.clone()
        v = val_video_embs.clone()
        if mode == 'audio_only':
            v.zero_()
        elif mode == 'video_only':
            a.zero_()

        logits = fusion_model(a.to(device), v.to(device))
        preds = logits.argmax(dim=1).cpu().numpy()

        macro = f1_score(y_true, preds, average='macro', zero_division=0)
        binary_true = np.where(y_true == good_weld_idx, 0, 1)
        binary_pred = np.where(preds == good_weld_idx, 0, 1)
        binary = f1_score(binary_true, binary_pred, pos_label=1, zero_division=0)
        score = 0.6 * binary + 0.4 * macro
        return {'mode': mode, 'macro_f1': float(macro), 'binary_f1': float(binary), 'hackathon_score': float(score)}

    diag_rows = [_eval_mode('full'), _eval_mode('audio_only'), _eval_mode('video_only')]
    diag_rows = sorted(diag_rows, key=lambda r: r['hackathon_score'], reverse=True)

    print('\n' + '=' * 90)
    print('FUSION MODALITY DIAGNOSTICS (val split)')
    print('=' * 90)
    for row in diag_rows:
        print(
            f"  mode={row['mode']:10s} | score={row['hackathon_score']:.4f} | "
            f"macroF1={row['macro_f1']:.4f} | binaryF1={row['binary_f1']:.4f}"
        )
    print('=' * 90)

    FUSION_DIAG = {
        'train_audio_embs': train_audio_embs,
        'train_video_embs': train_video_embs,
        'train_labels': torch.tensor(train_labels, dtype=torch.long),
        'val_audio_embs': val_audio_embs,
        'val_video_embs': val_video_embs,
        'val_labels': torch.tensor(val_labels, dtype=torch.long),
        'good_weld_idx': good_weld_idx,
        'num_classes': num_classes,
        'rows': diag_rows,
    }
else:
    print('RUN_FUSION_DIAGNOSTICS=False — skipped.')

FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints/video/best_model.pt'

In [None]:
# Cell 8 — Autoencoder latent blend probe (audio+video activations)
import torch
import torch.nn as nn
from sklearn.metrics import f1_score

RUN_AE_BLEND_PROBE = True
AE_EPOCHS = 35
AE_BATCH_SIZE = 128
AE_LR = 1e-3
AE_RECON_WEIGHT = 0.15
AE_LATENT_DIM = 96

if RUN_AE_BLEND_PROBE:
    if 'FUSION_DIAG' not in globals():
        raise RuntimeError('Run Cell 7 first to build FUSION_DIAG embeddings.')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_a = FUSION_DIAG['train_audio_embs'].float()
    train_v = FUSION_DIAG['train_video_embs'].float()
    train_y = FUSION_DIAG['train_labels'].long()
    val_a = FUSION_DIAG['val_audio_embs'].float()
    val_v = FUSION_DIAG['val_video_embs'].float()
    val_y = FUSION_DIAG['val_labels'].long()
    good_weld_idx = int(FUSION_DIAG['good_weld_idx'])
    num_classes = int(FUSION_DIAG['num_classes'])

    train_x = torch.cat([train_a, train_v], dim=1)  # (N, 256)
    val_x = torch.cat([val_a, val_v], dim=1)

    class AEFusionProbe(nn.Module):
        def __init__(self, in_dim=256, latent_dim=96, num_classes=7, dropout=0.15):
            super().__init__()
            self.encoder = nn.Sequential(
                nn.Linear(in_dim, 192),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(192, latent_dim),
                nn.ReLU(),
            )
            self.decoder = nn.Sequential(
                nn.Linear(latent_dim, 192),
                nn.ReLU(),
                nn.Linear(192, in_dim),
            )
            self.classifier = nn.Linear(latent_dim, num_classes)

        def forward(self, x):
            z = self.encoder(x)
            recon = self.decoder(z)
            logits = self.classifier(z)
            return logits, recon

    model = AEFusionProbe(
        in_dim=train_x.shape[1],
        latent_dim=AE_LATENT_DIM,
        num_classes=num_classes,
        dropout=0.15,
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=AE_LR, weight_decay=1e-4)
    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()

    best = {'score': -1.0, 'macro_f1': -1.0, 'binary_f1': -1.0, 'epoch': -1}

    n = train_x.shape[0]
    for epoch in range(1, AE_EPOCHS + 1):
        model.train()
        perm = torch.randperm(n)
        epoch_loss = 0.0

        for i in range(0, n, AE_BATCH_SIZE):
            idx = perm[i:i + AE_BATCH_SIZE]
            xb = train_x[idx].to(device)
            yb = train_y[idx].to(device)

            optimizer.zero_grad(set_to_none=True)
            logits, recon = model(xb)
            loss = ce_loss(logits, yb) + AE_RECON_WEIGHT * mse_loss(recon, xb)
            loss.backward()
            optimizer.step()
            epoch_loss += float(loss.item()) * xb.size(0)

        model.eval()
        with torch.no_grad():
            logits, _ = model(val_x.to(device))
            preds = logits.argmax(dim=1).cpu()

        y_true = val_y.numpy()
        y_pred = preds.numpy()

        macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
        bin_true = [0 if y == good_weld_idx else 1 for y in y_true]
        bin_pred = [0 if p == good_weld_idx else 1 for p in y_pred]
        binary = f1_score(bin_true, bin_pred, pos_label=1, zero_division=0)
        score = 0.6 * binary + 0.4 * macro

        if score > best['score']:
            best = {
                'score': float(score),
                'macro_f1': float(macro),
                'binary_f1': float(binary),
                'epoch': int(epoch),
            }

        if epoch == 1 or epoch % 5 == 0 or epoch == AE_EPOCHS:
            avg_loss = epoch_loss / max(n, 1)
            print(
                f"epoch={epoch:02d} loss={avg_loss:.4f} "
                f"val_score={score:.4f} macroF1={macro:.4f} binaryF1={binary:.4f}"
            )

    print('\n' + '=' * 90)
    print('AE LATENT BLEND PROBE — BEST VAL RESULT')
    print('=' * 90)
    print(
        f"  score={best['score']:.4f} | macroF1={best['macro_f1']:.4f} | "
        f"binaryF1={best['binary_f1']:.4f} | epoch={best['epoch']}"
    )
    print('=' * 90)

    FUSION_AE_PROBE_BEST = best
else:
    print('RUN_AE_BLEND_PROBE=False — skipped.')