# Colab Training Notebook

Runs SPEC phases 3–7 with CatBoost on GPU and optional FAISS ANN.

Prerequisites:
- Set Colab runtime to GPU.
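- Have the dataset file `@NQ - 5 min - ETH.csv` available: either place it at `data/raw/@NQ - 5 min - ETH.csv` or upload it when the data-staging cell prompts for it.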


In [None]:
# Environment setup: export determinism-related env vars, then move into the repo
import os
import pathlib
import subprocess

REPO_URL = "https://github.com/Afeks214/ML-Algo.git"

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here likely only affects child processes -- confirm this is intended.
os.environ.update({"PYTHONHASHSEED": "42", "GPU_DETERMINISTIC": "1"})

# Colab is detected by whether its helper package can be imported.
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

if not IN_COLAB:
    # Local run: when started from a directory named notebooks/, step up to its parent.
    start_dir = pathlib.Path.cwd()
    if start_dir.name == 'notebooks':
        os.chdir(start_dir.parent)
else:
    # Colab run: work from /content and clone the repository on first use only.
    os.chdir('/content')
    repo_dir = pathlib.Path('ML-Algo')
    if not repo_dir.exists():
        subprocess.check_call(['git', 'clone', REPO_URL])
    os.chdir(repo_dir)

print('Working directory:', os.getcwd())


In [None]:
# Install dependencies (editable package, CatBoost, FAISS GPU with CPU fallback)
import sys, subprocess, pathlib
from importlib import import_module, invalidate_caches

def pipi(*args):
    """Run ``pip`` under the current interpreter with the given arguments.

    The full command line is echoed before it runs. A non-zero pip exit
    status surfaces as ``subprocess.CalledProcessError``.
    """
    command = [sys.executable, '-m', 'pip']
    command.extend(args)
    print('> ', ' '.join(command))
    subprocess.check_call(command)

# Prefer %pip in notebooks so sys.path is updated without restart
import importlib.util


def _faiss_installed():
    """Return True when a ``faiss`` module can be located on the import path."""
    invalidate_caches()  # make packages installed moments ago visible to the finders
    return importlib.util.find_spec('faiss') is not None


try:
    from IPython import get_ipython
    ip = get_ipython()
    if ip is None:
        raise RuntimeError('IPython not available')
    ip.run_line_magic('pip', 'install --upgrade pip')
    ip.run_line_magic('pip', 'install -e .')
    ip.run_line_magic('pip', 'install catboost==1.2.5')
    try:
        ip.run_line_magic('pip', 'install faiss-gpu==1.7.4.post2')
    except Exception as e:
        print('faiss-gpu install failed, trying faiss-cpu:', e)
        ip.run_line_magic('pip', 'install faiss-cpu==1.7.4')
    else:
        # %pip shells out and typically reports a failed install in its output
        # rather than by raising (NOTE(review): confirm for Colab's %pip), so
        # verify the result explicitly instead of relying on an exception.
        if not _faiss_installed():
            print('faiss-gpu install failed, trying faiss-cpu:', 'faiss not found after install')
            ip.run_line_magic('pip', 'install faiss-cpu==1.7.4')
except Exception as exc:
    # No usable IPython shell (or the magic itself failed): install via subprocess.
    print('%pip path unavailable, using subprocess pip instead:', exc)
    pipi('install', '--upgrade', 'pip')
    pipi('install', '-e', '.')
    pipi('install', 'catboost==1.2.5')
    try:
        pipi('install', 'faiss-gpu==1.7.4.post2')
    except Exception as e:
        print('faiss-gpu install failed, falling back to faiss-cpu:', e)
        try:
            pipi('install', 'faiss-cpu==1.7.4')
        except Exception as e2:
            # FAISS is optional; without it the notebook continues.
            print('faiss installation skipped:', e2)

# Ensure ml_algo is importable: try the installed package first, then the local src/ tree
invalidate_caches()
try:
    import ml_algo  # type: ignore
except ModuleNotFoundError:
    # Not installed: expose src/ under the working directory and retry the import.
    sys.path.insert(0, str(pathlib.Path.cwd() / 'src'))
    import ml_algo  # type: ignore
    print('ml_algo import (src fallback) OK from', ml_algo.__file__)
else:
    print('ml_algo import OK from', ml_algo.__file__)


In [None]:
# Data staging: make sure the dataset CSV sits under data/raw, uploading it in Colab if needed
from pathlib import Path

DATA_PATH = Path('data/raw/@NQ - 5 min - ETH.csv')
DATA_PATH.parent.mkdir(parents=True, exist_ok=True)

if DATA_PATH.exists():
    print('Found dataset:', DATA_PATH)
else:
    try:
        from google.colab import files  # type: ignore
        print('Please upload the dataset file when prompted.')
        uploaded = files.upload()
        # Only the first uploaded entry is used; it is moved to the expected location.
        uploaded_name = next(iter(uploaded))
        Path(uploaded_name).rename(DATA_PATH)
        print('Saved to', DATA_PATH)
    except Exception as e:
        # Any failure here (no Colab, nothing uploaded, move failed) is
        # reported as a missing dataset, with the original error chained.
        raise FileNotFoundError(f'Dataset not found at {DATA_PATH}. Upload required.') from e


In [None]:
# Optional: compute dataset hash for reproducibility
import hashlib

# Stream the file in 1 MiB chunks so hashing never needs the whole dataset in
# memory at once; the resulting digest is identical to hashing all bytes together.
_hasher = hashlib.sha256()
with DATA_PATH.open('rb') as _fh:
    for _chunk in iter(lambda: _fh.read(1 << 20), b''):
        _hasher.update(_chunk)
digest = _hasher.hexdigest()
print('dataset_sha256=', digest)


In [None]:
# Run Phases 3–7 with GPU CatBoost and optional FAISS ANN
import json, time
import numpy as np, pandas as pd
from pathlib import Path as _Path

# Probe for the package once. If it is not importable, expose the local src/
# tree, so the import list below is written a single time.
try:
    import ml_algo  # type: ignore  # noqa: F401
except ModuleNotFoundError:
    import sys, pathlib as _pl
    sys.path.insert(0, str(_pl.Path.cwd()/'src'))

from ml_algo.ann_index import AnnConfig
from ml_algo.data_ingest import GapPolicy
from ml_algo.kernels import KernelEnsembleParams
from ml_algo.model_catboost import CatBoostConfig
from ml_algo.pipeline import run_phase3, run_phase4, run_phase5, run_phase6, run_phase7
from ml_algo.robust_scaling import TylerConfig

def has_faiss():
    """Report whether the ``faiss`` module can be imported in this runtime.

    Any exception raised while importing counts as "not available", not only
    a missing module.
    """
    try:
        import faiss  # type: ignore
    except Exception:
        available = False
    else:
        available = True
    return available

# Use the FAISS IVF backend when faiss imports cleanly; otherwise use the exact backend.
ann_backend = 'faiss_ivf' if has_faiss() else 'exact'
print('Using ANN backend:', ann_backend)
use_ann = ann_backend != 'exact'  # larger candidate sets and a latency budget apply only to ANN

phase3 = run_phase3(
    sources=[DATA_PATH],
    timezone='America/New_York',
    bar_sizes=['5min'],
    gap_policy=GapPolicy(max_gap_minutes=60),
    tyler_config=TylerConfig(rho=0.2, tol=1e-6, max_iter=400),
)

phase4 = run_phase4(
    phase3,
    ann_config=AnnConfig(
        backend=ann_backend,
        k_cand=1024 if use_ann else 64,
        nlist=256,
        nprobe=16,
        nprobe_max=64,
    ),
    k_final=64 if use_ann else 16,
    latency_budget_ms=10.0 if use_ann else None,
)

# Binary label per row: 1 when the next row's ha_close is strictly greater than
# the current one, else 0. The last row has no successor; its shifted value is
# NaN, the comparison is False, and so it is labelled 0.
ha_close = phase3.ha['ha_close'].to_numpy()
next_ha_close = pd.Series(ha_close).shift(-1)
labels = pd.Series((next_ha_close > ha_close).astype(int).to_numpy(), index=phase3.ha.index)

phase5 = run_phase5(
    phase4,
    labels=labels,
    kernel_params=KernelEnsembleParams(),
    train_model=False,
)

# Each run gets its own artifact directory, keyed by the current Unix time in seconds.
ts = int(time.time())
out_dir = _Path(f'artifacts/colab_run_{ts}')
phase6 = run_phase6(
    phase5,
    catboost_config=CatBoostConfig(task_type='GPU', devices='0', iterations=1200, depth=8),
    artifact_dir=out_dir,
)

phase7 = run_phase7(phase5)
# Do not rely on run_phase6 having created the directory before writing the report.
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / 'validation_report.json').write_text(phase7.report.to_json(), encoding='utf-8')

# Print a run summary as pretty-printed JSON.
print(json.dumps({
    'artifact_dir': str(out_dir),
    'ann_backend': ann_backend,
    'ann_recall': float(phase4.recall),
    'phase4_timings_ms': phase4.timings_ms,
    'phase4_fallbacks': phase4.fallback_counters,
    'phase6_metrics': phase6.metrics.as_dict(),
    'phase6_timings_ms': phase6.timings_ms,
    'phase6_fallbacks': phase6.fallback_counters,
    'phase7_summary': phase7.summary,
}, indent=2))


In [None]:
# Zip artifacts and download (Colab); outside Colab just report where the zip is
from pathlib import Path as _Path
import shutil

try:
    out_dir  # type: ignore[name-defined]
except NameError:
    # The training cell has not run in this session: fall back to the most
    # recently modified artifacts/colab_run_* directory on disk.
    run_dirs = (p for p in _Path('artifacts').glob('colab_run_*') if p.is_dir())
    newest_run = max(run_dirs, key=lambda p: p.stat().st_mtime, default=None)
    if newest_run is None:
        raise RuntimeError('No artifacts directory found; run the training cell first.')
    out_dir = newest_run

archive_base = str(out_dir)
zip_path = f'{out_dir}.zip'
# The archive is written next to the run directory and contains that directory's contents.
shutil.make_archive(archive_base, 'zip', root_dir=archive_base)

try:
    from google.colab import files  # type: ignore
    files.download(zip_path)
except Exception:
    # Not in Colab, or the download failed: leave the zip on disk and say where it is.
    print('Zip created at', zip_path)
