# Colab Training Notebook

Runs SPEC phases 3-7 with CatBoost on GPU and optional FAISS ANN.

Prerequisites:
- Set Colab runtime to GPU.
- Ensure a dataset file named `@NQ - 5 min - ETH.csv`.


In [1]:
# Environment setup: clone or cd into repo; set deterministic flags
import os, pathlib, subprocess
os.environ['PYTHONHASHSEED'] = '42'
os.environ['GPU_DETERMINISTIC'] = '1'
REPO_URL = 'https://github.com/Afeks214/ML-Algo.git'
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False
if IN_COLAB:
    os.chdir('/content')
    if not pathlib.Path('ML-Algo').exists():
        subprocess.check_call(['git', 'clone', REPO_URL])
    os.chdir('ML-Algo')
else:
    here = pathlib.Path.cwd()
    if here.name == 'notebooks':
        os.chdir(here.parent)
print('Working directory:', os.getcwd())


Working directory: /content/ML-Algo


In [2]:
# Install dependencies (editable package, CatBoost, FAISS if available)
import sys, pathlib
from importlib import invalidate_caches
try:
    from IPython import get_ipython
    ip = get_ipython()
    if ip is not None:
        ip.run_line_magic('pip', 'install --quiet --upgrade pip')
        ip.run_line_magic('pip', 'install --quiet -e .')
        ip.run_line_magic('pip', 'install --quiet catboost==1.2.5')
        # Try multiple FAISS variants for Colab compatibility
        ok = False
        for spec in ['faiss-gpu==1.7.4.post2', 'faiss-gpu', 'faiss-cpu==1.7.4', 'faiss-cpu==1.7.2']:
            if ok:
                break
            try:
                ip.run_line_magic('pip', f'install --quiet {spec}')
                import importlib
                importlib.invalidate_caches()
                import faiss  # type: ignore
                print('FAISS import OK via', spec)
                ok = True
            except Exception as e:
                print('FAISS install/verify failed for', spec, e)
    else:
        raise RuntimeError('IPython not available')
except Exception:
    import subprocess
    def pipi(*args):
        cmd = [sys.executable, '-m', 'pip', *args]
        subprocess.check_call(cmd)
    pipi('install', '--upgrade', 'pip')
    pipi('install', '-e', '.')
    pipi('install', 'catboost==1.2.5')
    for spec in ['faiss-gpu==1.7.4.post2', 'faiss-gpu', 'faiss-cpu==1.7.4', 'faiss-cpu==1.7.2']:
        try:
            pipi('install', spec)
            import importlib
            importlib.invalidate_caches()
            import faiss  # type: ignore
            print('FAISS import OK via', spec)
            break
        except Exception as e:
            print('FAISS install/verify failed for', spec, e)
invalidate_caches()
try:
    import ml_algo  # type: ignore
    print('ml_algo import OK from', ml_algo.__file__)
except ModuleNotFoundError:
    sys.path.insert(0, str(pathlib.Path.cwd()/'src'))
    import ml_algo  # type: ignore
    print('ml_algo import (src fallback) OK from', ml_algo.__file__)


  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
  Building editable for ml_algo (pyproject.toml) ... [?25l[?25hdone
[31mERROR: Could not find a version that satisfies the requirement faiss-gpu==1.7.4.post2 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu==1.7.4.post2[0m[31m
[0mFAISS install/verify failed for faiss-gpu==1.7.4.post2 No module named 'faiss'
[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0mFAISS install/verify failed for faiss-gpu No module named 'faiss'
[31mERROR: Could not find a version that satisfies the requirement faiss-cpu==1.7.4 (from versions: 1.8.0, 1.8.0.post1, 1.9.0, 1.9.0.pos

In [2]:
# Data staging: upload @NQ - 5 min - ETH.csv if not present
from pathlib import Path
Path('data/raw').mkdir(parents=True, exist_ok=True)
DATA_PATH = Path('data/raw/@NQ - 5 min - ETH.csv')
if not DATA_PATH.exists():
    try:
        from google.colab import files  # type: ignore
        print('Please upload the dataset file when prompted.')
        uploaded = files.upload()
        name = next(iter(uploaded))
        Path(name).rename(DATA_PATH)
        print('Saved uploaded file to', DATA_PATH)
    except Exception as e:
        raise FileNotFoundError(f'Dataset not found at {DATA_PATH} after upload attempt. Error: {e}') from e
else:
    print('Found existing dataset:', DATA_PATH)

# Explicitly verify the file exists after staging
if not DATA_PATH.exists():
    raise FileNotFoundError(f'Dataset file not found at {DATA_PATH} after staging process.')
else:
    print('Dataset file confirmed to exist at:', DATA_PATH)

Please upload the dataset file when prompted.


Saving @NQ - 5 min - ETH.csv to @NQ - 5 min - ETH (1).csv
Saved uploaded file to data/raw/@NQ - 5 min - ETH.csv
Dataset file confirmed to exist at: data/raw/@NQ - 5 min - ETH.csv


In [4]:
# Optional: compute dataset hash for reproducibility
import hashlib
digest = hashlib.sha256(DATA_PATH.read_bytes()).hexdigest()
print('dataset_sha256=', digest)


dataset_sha256= e4acf0577c9867a04beca40ee15e0233b6efeb50f790e1aded4d4ef9a8524e08


In [None]:
# Run Phases 3-7 with GPU CatBoost and optional FAISS ANN
import json, time
import numpy as np, pandas as pd
from pathlib import Path as _Path
try:
    from ml_algo.ann_index import AnnConfig
    from ml_algo.data_ingest import GapPolicy
    from ml_algo.kernels import KernelEnsembleParams
    from ml_algo.model_catboost import CatBoostConfig
    from ml_algo.pipeline import run_phase3, run_phase4, run_phase5, run_phase6, run_phase7
    from ml_algo.robust_scaling import TylerConfig
except ModuleNotFoundError:
    import sys as _sys, pathlib as _pl
    _sys.path.insert(0, str(_pl.Path.cwd()/'src'))
    from ml_algo.ann_index import AnnConfig
    from ml_algo.data_ingest import GapPolicy
    from ml_algo.kernels import KernelEnsembleParams
    from ml_algo.model_catboost import CatBoostConfig
    from ml_algo.pipeline import run_phase3, run_phase4, run_phase5, run_phase6, run_phase7
    from ml_algo.robust_scaling import TylerConfig
def has_faiss():
    try:
        import faiss  # type: ignore
        return True
    except Exception:
        return False
ann_backend = 'faiss_ivf' if has_faiss() else 'exact'
print('Using ANN backend:', ann_backend)

# Ensure dataset path exists
try:
    DATA_PATH
except NameError:
    DATA_PATH = _Path('data/raw/@NQ - 5 min - ETH.csv')

# Explicitly check for the dataset file before proceeding
if not DATA_PATH.exists():
    raise FileNotFoundError(f'Dataset file not found at {DATA_PATH}. Please ensure it is uploaded and staged correctly in the previous cell.')
else:
    print('Dataset file found at:', DATA_PATH)

phase3 = run_phase3(
    sources=[DATA_PATH],
    timezone='America/New_York',
    bar_sizes=['5min'],
    # Relax gap policy to avoid dropping many rows across weekends/holidays
    gap_policy=GapPolicy(max_gap_minutes=1000000),
    tyler_config=TylerConfig(rho=0.2, tol=1e-6, max_iter=400),
)
phase4 = run_phase4(
    phase3,
    ann_config=AnnConfig(
        backend=ann_backend,
        k_cand=1024 if ann_backend!='exact' else 64,
        nlist=256,
        nprobe=16,
        nprobe_max=64,
    ),
    k_final=64 if ann_backend!='exact' else 16,
    latency_budget_ms=10.0 if ann_backend!='exact' else None,
)
ha_close = phase3.ha['ha_close'].to_numpy()
labels = pd.Series((pd.Series(ha_close).shift(-1) > ha_close).astype(int).fillna(0).values, index=phase3.ha.index)
phase5 = run_phase5(
    phase4,
    labels=labels,
    kernel_params=KernelEnsembleParams(),
    train_model=False,
)
ts = int(time.time())
out_dir = _Path(f'artifacts/colab_run_{ts}')
phase6 = run_phase6(
    phase5,
    catboost_config=CatBoostConfig(task_type='GPU', devices='0', iterations=1200, depth=8),
    artifact_dir=out_dir,
)
phase7 = run_phase7(phase5)
(out_dir / 'validation_report.json').write_text(phase7.report.to_json())
print(json.dumps({
    'artifact_dir': str(out_dir),
    'ann_backend': ann_backend,
    'ann_recall': float(phase4.recall),
    'phase4_timings_ms': phase4.timings_ms,
    'phase4_fallbacks': phase4.fallback_counters,
    'phase6_metrics': phase6.metrics.as_dict(),
    'phase6_timings_ms': phase6.timings_ms,
    'phase6_fallbacks': phase6.fallback_counters,
    'phase7_summary': phase7.summary,
}, indent=2))

Using ANN backend: exact
Dataset file found at: data/raw/@NQ - 5 min - ETH.csv


In [2]:
# Zip artifacts and download (Colab)
from pathlib import Path as _Path
import shutil
import time

try:
    out_dir  # type: ignore[name-defined]
except NameError:
    print("out_dir not defined. Attempting to find the most recent artifacts directory.")
    base = _Path('artifacts')
    found = False
    for attempt in range(5): # Retry up to 5 times with a delay
        candidates = sorted([p for p in base.glob('colab_run_*') if p.is_dir()], key=lambda p: p.stat().st_mtime, reverse=True)
        if candidates:
            out_dir = candidates[0]
            print(f"Found artifacts directory: {out_dir}")
            found = True
            break
        else:
            print(f"Attempt {attempt+1}: No 'colab_run_*' directories found in {base}. Waiting...")
            time.sleep(2) # Wait for 2 seconds before retrying

    if not found:
        raise RuntimeError('No artifacts directory found after multiple attempts; run the training cell first.')

zip_path = f'{out_dir}.zip'
print(f"Creating zip archive at: {zip_path}")
shutil.make_archive(str(out_dir), 'zip', root_dir=str(out_dir))
print("Zip archive created.")

try:
    from google.colab import files  # type: ignore
    print(f"Downloading {zip_path}...")
    files.download(zip_path)
    print("Download initiated.")
except Exception:
    print('Zip created at', zip_path)

out_dir not defined. Attempting to find the most recent artifacts directory.
Attempt 1: No 'colab_run_*' directories found in artifacts. Waiting...
Attempt 2: No 'colab_run_*' directories found in artifacts. Waiting...
Attempt 3: No 'colab_run_*' directories found in artifacts. Waiting...
Attempt 4: No 'colab_run_*' directories found in artifacts. Waiting...
Attempt 5: No 'colab_run_*' directories found in artifacts. Waiting...


RuntimeError: No artifacts directory found after multiple attempts; run the training cell first.