# Lovli Source-Gating Validation Run (Colab GPU)

This notebook runs the **v3 source-gating workflow** on Colab (H100/T4 compatible), so we avoid local RAM limits.

It runs:
- **Merge pre-built catalogs** from Drive (`law_catalog_nl.json` + `law_catalog.json`) → `data/law_catalog.json`
- `scripts/validate_reindex.py`
- `scripts/analyze_law_contamination.py`
- `scripts/sweep_retrieval_thresholds.py`

The setup enables law routing + law coherence filtering, includes reranker-context/routing-dualpass toggles, writes both sweep artifacts (`eval/retrieval_sweep_results.json` + `eval/retrieval_sweep_summary.json`), and exports files for review.

## Run checklist

| When | Mode | Action |
|------|------|--------|
| After each tuning change | `quick_iteration` | Use reduced sample + grid for fast feedback (~2-5 min) |
| Before promotion / final decision | `full_validation` | Use full 98 questions + full grid + gate checks (~30-60 min) |

## 1. Runtime and Repository Setup

Use a **GPU runtime** before running this notebook (H100 preferred, T4 supported).

If you cloned with an older commit, restart runtime and rerun from the top.

In [None]:
# Remove any previous checkout and clone the repository again, then work from inside it.
%cd /content
!rm -rf lovli
!git clone https://github.com/AndreasRamsli/lovli.git
%cd /content/lovli

# Install project with dependencies required by validation scripts.
# pip itself is upgraded first; the repo is then installed in editable mode (-e).
%pip install -q -U pip
%pip install -q -e .

# Fallback for runtimes where the editable install is not visible yet:
# put the repository's src/ directory at the front of sys.path (once).
import sys
from pathlib import Path

src_path = str(Path('/content/lovli/src'))
already_on_path = src_path in sys.path
if not already_on_path:
    sys.path.insert(0, src_path)

print('Setup complete')

In [None]:
import torch

# Report whether CUDA is usable and, if so, the name and total memory of device 0.
has_cuda = torch.cuda.is_available()
print('CUDA available:', has_cuda)
if has_cuda:
    name = torch.cuda.get_device_name(0)
    props = torch.cuda.get_device_properties(0)
    vram_gib = props.total_memory / (1024**3)
    print(f'GPU: {name}')
    print(f'VRAM: {vram_gib:.1f} GB')

## 2. Environment Configuration (v3 + routing/coherence)

In [None]:
import os
import getpass

# Required secrets: keep a value that is already in the environment, otherwise prompt.
# The URL is read with input(); both API keys are read with getpass.
for secret_name, prompt_fn in (
    ('QDRANT_URL', input),
    ('QDRANT_API_KEY', getpass.getpass),
    ('OPENROUTER_API_KEY', getpass.getpass),
):
    os.environ[secret_name] = os.environ.get(secret_name) or prompt_fn(f'{secret_name}: ').strip()

# Collection defaults (override if needed).
os.environ.setdefault('QDRANT_COLLECTION_NAME', 'lovli_laws_v3')

# Keep traces off for speed/clean logs.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'false',
    'LANGSMITH_TRACING': 'false',
    'SWEEP_SKIP_INDEX_SCAN': 'true',
})

# Sweep promotion thresholds for new production gate checks.
# setdefault keeps any value that was exported before this cell ran.
for threshold_name, fallback_value in (
    ('SWEEP_PROMOTION_MIN_IMPROVEMENT', '0.01'),
    ('SWEEP_PROMOTION_PRECISION_TOLERANCE', '0.005'),
    ('SWEEP_PROMOTION_NEGATIVE_TOLERANCE', '0.010'),
):
    os.environ.setdefault(threshold_name, fallback_value)

# Versioned trust profiles: switch TRUST_PROFILE between balanced_v1 and strict_v1.
os.environ['TRUST_PROFILE_VERSION'] = '2026-02-17'
profiles = {
    'balanced_v1': {
        'RETRIEVAL_K_INITIAL': '22',
        'RERANKER_CONFIDENCE_THRESHOLD': '0.35',
        'RERANKER_MIN_DOC_SCORE': '0.32',
        'RERANKER_AMBIGUITY_MIN_GAP': '0.05',
        'RERANKER_AMBIGUITY_TOP_SCORE_CEILING': '0.7',
        'LAW_ROUTING_FALLBACK_UNFILTERED': 'true',
    },
    'strict_v1': {
        'RETRIEVAL_K_INITIAL': '15',
        'RERANKER_CONFIDENCE_THRESHOLD': '0.45',
        'RERANKER_MIN_DOC_SCORE': '0.55',
        'RERANKER_AMBIGUITY_MIN_GAP': '0.10',
        'RERANKER_AMBIGUITY_TOP_SCORE_CEILING': '0.7',
        'LAW_ROUTING_FALLBACK_UNFILTERED': 'false',
    },
}


def resolve_trust_profile(requested, available, default='balanced_v1'):
    """Return ``(name, settings)`` for the requested trust profile.

    Args:
        requested: Profile name as read from the environment (may be None,
            empty, or padded with whitespace).
        available: Mapping of profile name -> dict of environment overrides.
        default: Profile used when ``requested`` is not a key of ``available``.

    Returns:
        A tuple of the profile name actually selected and its settings dict.

    An unknown name still falls back to ``default``, but a warning is printed
    so a typo in TRUST_PROFILE cannot silently validate the wrong profile.
    """
    candidate = (requested or '').strip()
    if candidate in available:
        return candidate, available[candidate]
    print(
        f'WARNING: unknown TRUST_PROFILE={requested!r}; '
        f'falling back to {default!r}. Known profiles: {sorted(available)}'
    )
    return default, available[default]


profile_name = os.environ.get('TRUST_PROFILE', 'balanced_v1')
resolved_profile_name, profile = resolve_trust_profile(profile_name, profiles)
# Normalise the environment so later steps see a profile name that exists.
os.environ['TRUST_PROFILE'] = resolved_profile_name

# Shared law routing/coherence + new dual-pass/reranker context controls.
# These values are always overwritten, regardless of what was exported before.
os.environ.update({
    'LAW_ROUTING_ENABLED': 'true',
    'LAW_CATALOG_PATH': 'data/law_catalog.json',
    'LAW_ROUTING_PREFILTER_K': '80',
    'LAW_ROUTING_RERANK_TOP_K': '6',
    'LAW_ROUTING_MIN_CONFIDENCE': '0.30',
    'LAW_ROUTING_UNCERTAINTY_TOP_SCORE_CEILING': '0.55',
    'LAW_ROUTING_UNCERTAINTY_MIN_GAP': '0.04',
    'LAW_ROUTING_FALLBACK_MAX_LAWS': '12',
    'LAW_COHERENCE_FILTER_ENABLED': 'true',
    'LAW_COHERENCE_MIN_LAW_COUNT': '2',
    'LAW_COHERENCE_SCORE_GAP': '0.15',
    'LAW_COHERENCE_RELATIVE_GAP': '0.05',
    'LAW_COHERENCE_MAX_SCORE_WEIGHT': '0.6',
    'LAW_COHERENCE_MIN_KEEP': '1',
    'LAW_COHERENCE_DOMINANT_CONCENTRATION_THRESHOLD': '0.60',
})
# The two toggles below only receive a default; a pre-set value wins.
for toggle_name in (
    'RERANKER_CONTEXT_ENRICHMENT_ENABLED',
    'LAW_ROUTING_SUMMARY_DUALPASS_ENABLED',
):
    os.environ.setdefault(toggle_name, 'true')

# Two-speed: set RUN_MODE='quick_iteration' for fast feedback, 'full_validation' for final check.
os.environ.setdefault('RUN_MODE', 'full_validation')

# Apply selected profile values (each profile entry becomes an environment variable).
os.environ.update(profile)

# Guard against accidental string values like 'None': a blank/'none'/'null'
# SWEEP_SAMPLE_SIZE is removed from the environment altogether.
sample_size_raw = os.environ.get('SWEEP_SAMPLE_SIZE')
if sample_size_raw is not None:
    if sample_size_raw.strip().lower() in {'', 'none', 'null'}:
        os.environ.pop('SWEEP_SAMPLE_SIZE', None)

# Echo the effective configuration. The first group must be present (a missing
# variable raises KeyError); the last two are optional and print None when unset.
required_rows = (
    ('TRUST_PROFILE          =', 'TRUST_PROFILE'),
    ('TRUST_PROFILE_VERSION  =', 'TRUST_PROFILE_VERSION'),
    ('QDRANT_COLLECTION_NAME =', 'QDRANT_COLLECTION_NAME'),
    ('LAW_ROUTING_ENABLED    =', 'LAW_ROUTING_ENABLED'),
    ('LAW_CATALOG_PATH       =', 'LAW_CATALOG_PATH'),
    ('LAW_ROUTING_PREFILTER  =', 'LAW_ROUTING_PREFILTER_K'),
    ('LAW_ROUTING_RERANK_K   =', 'LAW_ROUTING_RERANK_TOP_K'),
    ('LAW_ROUTING_CONF_MIN   =', 'LAW_ROUTING_MIN_CONFIDENCE'),
    ('LAW_ROUTE_UNCERT_CEIL  =', 'LAW_ROUTING_UNCERTAINTY_TOP_SCORE_CEILING'),
    ('LAW_ROUTE_UNCERT_GAP   =', 'LAW_ROUTING_UNCERTAINTY_MIN_GAP'),
    ('LAW_ROUTE_FALLBACK     =', 'LAW_ROUTING_FALLBACK_UNFILTERED'),
    ('LAW_COHERENCE_FILTER   =', 'LAW_COHERENCE_FILTER_ENABLED'),
    ('LAW_COHERENCE_CONC_THR =', 'LAW_COHERENCE_DOMINANT_CONCENTRATION_THRESHOLD'),
    ('RERANKER_CTX_ENRICH    =', 'RERANKER_CONTEXT_ENRICHMENT_ENABLED'),
    ('ROUTING_DUALPASS       =', 'LAW_ROUTING_SUMMARY_DUALPASS_ENABLED'),
)
optional_rows = (
    ('RUN_MODE               =', 'RUN_MODE'),
    ('SWEEP_SAMPLE_SIZE      =', 'SWEEP_SAMPLE_SIZE'),
)
for row_label, env_name in required_rows:
    print(row_label, os.environ[env_name])
for row_label, env_name in optional_rows:
    print(row_label, os.environ.get(env_name))

In [None]:
# Two-speed workflow: quick_iteration (fast feedback) vs full_validation (final check).
# A third mode, fast_debug, is also accepted (10 questions, single combo).
# NOTE: the configuration cell calls os.environ.setdefault('RUN_MODE', 'full_validation'),
# so when RUN_MODE was not exported the effective mode is full_validation; the
# 'quick_iteration' fallback below only applies if that cell was not run.
# Also available: SWEEP_DEBUG_MODE=true for ultra-fast focused threshold testing
# (only honoured together with RUN_MODE=quick_iteration).
KNOWN_RUN_MODES = ('quick_iteration', 'fast_debug', 'full_validation')
SWEEP_CACHE_PATH = '/content/lovli/eval/sweep_cache'


def configure_run_mode(run_mode, debug_mode=False, env=None):
    """Write the sweep settings for ``run_mode`` into ``env`` and return the banner text.

    Args:
        run_mode: 'quick_iteration', 'fast_debug', or anything else; every other
            value gets the full_validation settings.
        debug_mode: When true and ``run_mode`` is 'quick_iteration', use the
            15-question debug-grid settings instead of the regular quick ones.
        env: Mutable mapping to update; defaults to ``os.environ``.

    Returns:
        The one-line description of the settings that were applied.
    """
    env = os.environ if env is None else env
    if run_mode == 'quick_iteration' and debug_mode:
        # Ultra-fast: 15 questions, minimal grid for rapid threshold testing
        settings = {
            'SWEEP_SAMPLE_SIZE': '15',
            'CONTAMINATION_SAMPLE_SIZE': '15',
            'SWEEP_QUICK_GRID': 'false',  # Use debug grid
            'SWEEP_DEBUG_MODE': 'true',
            'SKIP_VALIDATE_REINDEX': 'true',
            'SWEEP_PARALLEL_PRECOMPUTE': 'true',
            'SWEEP_PARALLEL_WORKERS': '6',
            'SWEEP_CACHE_DIR': SWEEP_CACHE_PATH,
            'SWEEP_CHECKPOINT': 'true',
        }
        banner = 'RUN_MODE = quick_iteration + DEBUG_MODE (sample=15, debug grid, 6 workers, cached precompute, skip validate)'
    elif run_mode == 'quick_iteration':
        settings = {
            'SWEEP_SAMPLE_SIZE': '20',
            'CONTAMINATION_SAMPLE_SIZE': '20',
            'SWEEP_QUICK_GRID': 'true',
            'SKIP_VALIDATE_REINDEX': 'true',  # Skip smoke check for faster iteration
            # GPU-accelerated reranker: parallel precompute now works well with multiple workers
            # since the reranker runs on GPU (much lower RAM per worker than CPU-based model).
            # With 80GB RAM + GPU, we can run 6 workers for true parallelism.
            'SWEEP_PARALLEL_PRECOMPUTE': 'true',
            'SWEEP_PARALLEL_WORKERS': '6',
            # Cache precomputed candidates to /content/lovli/eval/sweep_cache so reruns
            # skip the Qdrant + reranker step entirely when questions/commit haven't changed.
            'SWEEP_CACHE_DIR': SWEEP_CACHE_PATH,
            'SWEEP_CHECKPOINT': 'true',
        }
        banner = 'RUN_MODE = quick_iteration (sample=20, single combo, 6 workers, cached precompute, skip validate)'
    elif run_mode == 'fast_debug':
        # Ultra-fast: 10 questions, single combo
        settings = {
            'SWEEP_SAMPLE_SIZE': '10',
            'CONTAMINATION_SAMPLE_SIZE': '10',
            'SWEEP_QUICK_GRID': 'true',
            'SKIP_VALIDATE_REINDEX': 'true',
            'SWEEP_PARALLEL_PRECOMPUTE': 'true',
            'SWEEP_PARALLEL_WORKERS': '4',
            'SWEEP_CACHE_DIR': SWEEP_CACHE_PATH,
            'SWEEP_CHECKPOINT': 'false',
        }
        banner = 'RUN_MODE = fast_debug (sample=10, single combo, 4 workers, cached precompute, skip validate)'
    else:
        # Full validation: drop every sampling/shortcut variable a quicker mode may have left behind.
        for stale_name in (
            'SWEEP_SAMPLE_SIZE',
            'CONTAMINATION_SAMPLE_SIZE',
            'SWEEP_QUICK_GRID',
            'SWEEP_DEBUG_MODE',
            'SKIP_VALIDATE_REINDEX',
        ):
            env.pop(stale_name, None)
        # Full validation: parallel precompute with GPU - can use 8 workers with 80GB RAM
        settings = {
            'SWEEP_PARALLEL_PRECOMPUTE': 'true',
            'SWEEP_PARALLEL_WORKERS': '8',
            'SWEEP_CACHE_DIR': SWEEP_CACHE_PATH,
            'SWEEP_CHECKPOINT': 'true',
        }
        banner = 'RUN_MODE = full_validation (full 98 questions, full grid, 8 workers)'
    env.update(settings)
    return banner


RUN_MODE = os.environ.get('RUN_MODE', 'quick_iteration').strip().lower()  # or 'full_validation'
DEBUG_MODE = os.environ.get('SWEEP_DEBUG_MODE', 'false').strip().lower() in ('1', 'true', 'yes')

if RUN_MODE not in KNOWN_RUN_MODES:
    # Unknown values keep their historical meaning (full validation) but are no longer silent.
    print(f'WARNING: unrecognized RUN_MODE={RUN_MODE!r}; applying full_validation settings.')
print(configure_run_mode(RUN_MODE, DEBUG_MODE))

print('SWEEP_SAMPLE_SIZE =', os.environ.get('SWEEP_SAMPLE_SIZE'))

In [None]:
# Preflight: create run envelope and clear stale artifacts/logs.
%cd /content/lovli
from pathlib import Path
from datetime import datetime, timezone
import os

# One UTC-timestamped id per notebook session, exported as LOVLI_RUN_ID so that
# subprocesses started later inherit it.
run_id = datetime.now(timezone.utc).strftime('colab_%Y%m%dT%H%M%SZ')
os.environ['LOVLI_RUN_ID'] = run_id
print('LOVLI_RUN_ID =', run_id)

# Artifacts and logs written by the cells below. They are removed up front so a
# step that is skipped or fails cannot leave an older file behind that looks current.
preflight_targets = [
    Path('eval/law_contamination_report.json'),
    Path('eval/retrieval_sweep_results.json'),
    Path('eval/retrieval_sweep_summary.json'),
    Path('eval/logs/validate_reindex.log'),
    Path('eval/logs/analyze_law_contamination.log'),
    Path('eval/logs/retrieval_sweep_quick.log'),
    Path('eval/logs/retrieval_sweep_full.log'),
    Path('eval/logs/regression_gates.log'),
    Path('eval/logs/regression_gates_v1.log'),
    Path('eval/logs/regression_gates_v2.log'),
]
for target in preflight_targets:
    if target.exists():
        target.unlink()
        print('removed', target)
    else:
        print('missing', target)

## 3. Mount Drive, Extract Data, Merge Catalog, Validate Reindex

In [None]:
%cd /content/lovli

import os
from pathlib import Path
from google.colab import drive
import json
import subprocess


def run_to_log(cmd: str, log_path: Path, cwd: str = '/content/lovli') -> int:
    """Run a shell command and capture its combined output in a log file.

    Args:
        cmd: Command line, executed through the shell (``shell=True``).
        log_path: File that receives stdout and stderr; parent directories are
            created and an existing file is overwritten.
        cwd: Working directory for the command. Defaults to the repository
            checkout used throughout this notebook.

    Returns:
        The command's exit code. A non-zero code does not raise; callers decide
        how to react.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with open(log_path, 'w', encoding='utf-8') as log_file:
        proc = subprocess.run(
            cmd,
            shell=True,
            cwd=cwd,
            stdout=log_file,
            stderr=subprocess.STDOUT,  # interleave stderr with stdout in the same log
            text=True,
        )
    return proc.returncode


def print_log_matches(log_path: Path, patterns: list[str], limit: int = 50) -> None:
    """Print the most recent log lines that contain any of the given substrings.

    Args:
        log_path: Log file to scan. If it does not exist, a ``log missing`` line
            is printed and nothing else happens.
        patterns: Plain substrings (not regular expressions); a line is kept when
            at least one of them occurs in it.
        limit: Maximum number of matching lines to print, counted from the end
            of the file. A value of zero or less prints only the header.
    """
    if not log_path.exists():
        print(f'log missing: {log_path}')
        return
    lines = log_path.read_text(encoding='utf-8', errors='ignore').splitlines()
    kept = [line for line in lines if any(p in line for p in patterns)]
    print(f'--- {log_path.name} (key lines) ---')
    # kept[-0:] is the whole list, so a non-positive limit needs its own branch.
    tail = kept[-limit:] if limit > 0 else []
    for line in tail:
        print(line)


# Mount Drive for access to the compressed dataset.
drive.mount('/content/drive')

# Skip unchanged prep when possible (set FORCE_REFRESH=true to always run).
# Both switches accept '1', 'true' or 'yes' in any letter case.
truthy_values = ('1', 'true', 'yes')
FORCE_REFRESH = os.environ.get('FORCE_REFRESH', 'false').strip().lower() in truthy_values
SKIP_VALIDATE_REINDEX = os.environ.get('SKIP_VALIDATE_REINDEX', 'false').strip().lower() in truthy_values

# Update this path if your tar or catalogs are moved.
drive_data = Path('/content/drive/MyDrive/Colab Notebooks/Lovli/data')
tar_path = drive_data.joinpath('lovli-data.tar.bz2')
catalog_nl_path = drive_data.joinpath('law_catalog_nl.json')
catalog_sf_path = drive_data.joinpath('law_catalog.json')

# Stop early, with a pointer to the expected location, if any input is missing.
assert tar_path.exists(), f'Data tar not found: {tar_path}'
assert catalog_nl_path.exists(), f'law_catalog_nl.json not found; upload to {drive_data}'
assert catalog_sf_path.exists(), f'law_catalog.json not found; upload to {drive_data}'

# Extract into repo data/ folder (skip if data already present and not forcing).
nl_dir = Path('/content/lovli/data/nl')
sf_dir = Path('/content/lovli/data/sf')


def count_xml_files(directory: Path) -> int:
    """Return how many ``*.xml`` files sit directly in ``directory`` (0 if it does not exist)."""
    if not directory.exists():
        return 0
    return sum(1 for _ in directory.glob('*.xml'))


# Extraction is needed when forced, or when either folder is missing or has no XML files.
need_extract = FORCE_REFRESH or count_xml_files(nl_dir) == 0 or count_xml_files(sf_dir) == 0
if need_extract:
    Path('/content/lovli/data').mkdir(parents=True, exist_ok=True)
    # Argument list instead of a shell string: the Drive path contains a space and
    # must reach tar as a single argument without relying on shell quoting.
    subprocess.run(
        ['tar', '-xjf', str(tar_path), '-C', '/content/lovli', '--exclude=._*'],
        check=True,
    )
    print('Extracted data from tar.')
else:
    print('Skipped extraction (data already present). Set FORCE_REFRESH=true to re-extract.')

nl_count = count_xml_files(nl_dir)
sf_count = count_xml_files(sf_dir)
print({'nl_xml_files': nl_count, 'sf_xml_files': sf_count})
assert nl_count > 0 and sf_count > 0, 'Expected both data/nl and data/sf to contain XML files.'

# Merge pre-built catalogs. A merge runs when it is forced, when the output file
# does not exist yet, or when either source was modified after the output was written.
output_catalog = Path('/content/lovli/data/law_catalog.json')
catalog_sources = (catalog_nl_path, catalog_sf_path)
if FORCE_REFRESH or not output_catalog.exists():
    need_merge = True
else:
    out_mtime = output_catalog.stat().st_mtime
    need_merge = any(source.stat().st_mtime > out_mtime for source in catalog_sources)

if need_merge:
    nl_catalog = json.loads(catalog_nl_path.read_text(encoding='utf-8'))
    sf_catalog = json.loads(catalog_sf_path.read_text(encoding='utf-8'))
    # Keep the first entry seen for each law_id (nl entries come first);
    # entries without a usable law_id are dropped.
    seen = set()
    merged = []
    for entry in nl_catalog + sf_catalog:
        law_id = (entry.get('law_id') or '').strip()
        if law_id and law_id not in seen:
            seen.add(law_id)
            merged.append(entry)
    output_catalog.parent.mkdir(parents=True, exist_ok=True)
    with open(output_catalog, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    nl_with_summary = sum(1 for e in nl_catalog if e.get('summary'))
    sf_with_summary = sum(1 for e in sf_catalog if e.get('summary'))
    print(f'Merged catalog: {len(merged)} entries (nl={len(nl_catalog)}, sf={len(sf_catalog)})')
    print(f'With summaries: nl={nl_with_summary}, sf={sf_with_summary}')
else:
    print('Skipped catalog merge (output newer than sources).')

# Validate metadata + retrieval smoke checks (skip when SKIP_VALIDATE_REINDEX=true).
if SKIP_VALIDATE_REINDEX:
    print('Skipped validate_reindex (SKIP_VALIDATE_REINDEX=true).')
    rc = 0
else:
    import shlex

    # Validate the collection this run is configured for; 'lovli_laws_v3' is only
    # the fallback when QDRANT_COLLECTION_NAME is unset or empty.
    collection_name = os.environ.get('QDRANT_COLLECTION_NAME') or 'lovli_laws_v3'
    validate_log = Path('/content/lovli/eval/logs/validate_reindex.log')
    rc = run_to_log(
        f'python scripts/validate_reindex.py --collection {shlex.quote(collection_name)} --with-smoke',
        validate_log,
    )
    print('validate_reindex exit_code =', rc)
    print_log_matches(
        validate_log,
        patterns=['Collection:', 'total_points=', 'missing_doc_type=', 'smoke query=', 'Validation completed.'],
    )
    # Fail only after the key log lines were shown, so the cause is visible in the cell output.
    assert rc == 0, 'validate_reindex failed; inspect eval/logs/validate_reindex.log'

## 4. Law Contamination Analysis

In [None]:
%cd /content/lovli
from pathlib import Path
import json

# Run the contamination analysis with output captured in a log, then show the key lines.
contam_log = Path('/content/lovli/eval/logs/analyze_law_contamination.log')
contam_cmd = 'python -u scripts/analyze_law_contamination.py --output eval/law_contamination_report.json'
rc = run_to_log(contam_cmd, contam_log)
print('analyze_law_contamination exit_code =', rc)
contam_patterns = ['Processed', 'Saved contamination report', 'Contamination rate=']
print_log_matches(contam_log, patterns=contam_patterns)

# If a report was written, print a few aggregate values (None for absent keys).
report_path = Path('/content/lovli/eval/law_contamination_report.json')
if report_path.exists():
    report = json.loads(report_path.read_text(encoding='utf-8'))
    agg = report.get('aggregate', {})
    print('--- contamination aggregate ---')
    aggregate_fields = (
        'total_questions',
        'contamination_rate',
        'singleton_foreign_rate',
        'unexpected_citation_rate',
        'mean_foreign_score_gap',
    )
    for field_name in aggregate_fields:
        print(f'{field_name}: {agg.get(field_name)}')

# The exit code is checked last so the diagnostics above are always printed.
assert rc == 0, 'analyze_law_contamination failed; inspect eval/logs/analyze_law_contamination.log'

## 5. Full Retrieval Sweep (Colab run)

In [None]:
%cd /content/lovli
import json
from pathlib import Path

# Run the threshold sweep with output captured in a log, then show the key lines.
full_sweep_log = Path('/content/lovli/eval/logs/retrieval_sweep_full.log')
rc = run_to_log('python -u scripts/sweep_retrieval_thresholds.py', full_sweep_log)

print('full sweep exit_code =', rc)
sweep_patterns = [
    'Saved results:',
    'Saved ablation summary:',
    'Promotion gate summary:',
    'Top 5 configurations:',
]
print_log_matches(full_sweep_log, patterns=sweep_patterns)

sweep_path = Path('/content/lovli/eval/retrieval_sweep_results.json')
summary_path = Path('/content/lovli/eval/retrieval_sweep_summary.json')

# First row of the results file, if the file exists and is non-empty.
if sweep_path.exists():
    rows = json.loads(sweep_path.read_text(encoding='utf-8'))
    if rows:
        top = rows[0]
        print('--- full sweep top row ---')
        top_row_fields = (
            'is_profile_default_row',
            'recall_at_k',
            'recall_at_1',
            'recall_at_3',
            'recall_at_5',
            'mrr_at_5',
            'citation_precision',
            'unexpected_citation_rate',
            'source_boundary_mismatch_at_k',
            'law_contamination_rate',
            'law_coherence_filtered_count',
            'promotion_gate_pass',
            'balanced_score',
        )
        for field_name in top_row_fields:
            print(f'{field_name}: {top.get(field_name)}')

if summary_path.exists():
    summary = json.loads(summary_path.read_text(encoding='utf-8'))
    print('--- ablation summary ---')
    summary_fields = ('run_id', 'rows_count', 'promotion_gate_pass_count', 'promotion_gate_total')
    for field_name in summary_fields:
        print(f'{field_name}: {summary.get(field_name)}')

# The exit code is checked last so the diagnostics above are always printed.
assert rc == 0, 'full sweep failed; inspect eval/logs/retrieval_sweep_full.log'

In [None]:
# Artifact metadata compatibility check before regression gates.
%cd /content/lovli
import json
from pathlib import Path

report_path = Path('eval/law_contamination_report.json')
sweep_path = Path('eval/retrieval_sweep_results.json')
summary_path = Path('eval/retrieval_sweep_summary.json')

# All three artifacts must exist before anything is compared.
for artifact_path, missing_message in (
    (report_path, 'Missing contamination report artifact'),
    (sweep_path, 'Missing sweep results artifact'),
    (summary_path, 'Missing sweep summary artifact'),
):
    assert artifact_path.exists(), missing_message

report = json.loads(report_path.read_text(encoding='utf-8'))
rows = json.loads(sweep_path.read_text(encoding='utf-8'))
summary = json.loads(summary_path.read_text(encoding='utf-8'))
assert rows, 'Sweep results are empty'

# The report carries its metadata under 'artifact_metadata'; for the sweep it is
# read from the first result row, and for the summary from its top-level keys.
metadata_fields = ['run_id', 'git_commit', 'questions_sha256', 'question_count']
contam_meta = report.get('artifact_metadata', {})
first_row = rows[0]
sweep_meta = {field: first_row.get(field) for field in metadata_fields}
summary_meta = {field: summary.get(field) for field in metadata_fields}

print('contamination metadata:', contam_meta)
print('sweep metadata:', sweep_meta)
print('summary metadata:', summary_meta)


def _assert_same_metadata(left_meta, right_meta, left_label, right_label, skip_prefix):
    """Compare two metadata dicts field by field as strings.

    A field that is None on either side is reported and skipped; any other
    difference raises AssertionError naming both values.
    """
    for field in metadata_fields:
        left_value = left_meta.get(field)
        right_value = right_meta.get(field)
        if left_value is None or right_value is None:
            print(f'{skip_prefix} for {field}: value missing')
            continue
        assert str(left_value) == str(right_value), (
            f'Artifact mismatch for {field}: {left_label}={left_value} {right_label}={right_value}'
        )


_assert_same_metadata(contam_meta, sweep_meta, 'contamination', 'sweep', 'skip metadata check')
_assert_same_metadata(sweep_meta, summary_meta, 'sweep', 'summary', 'skip summary metadata check')

print('Artifact metadata compatibility check passed.')

In [None]:
# Acceptance checks + focused debug for fallback-stage route misses.
%cd /content/lovli
import json
from pathlib import Path

contam_path = Path('eval/law_contamination_report.json')
sweep_path = Path('eval/retrieval_sweep_results.json')
assert contam_path.exists(), 'Missing eval/law_contamination_report.json'
assert sweep_path.exists(), 'Missing eval/retrieval_sweep_results.json'

contam = json.loads(contam_path.read_text(encoding='utf-8'))
rows = json.loads(sweep_path.read_text(encoding='utf-8'))
assert rows, 'Sweep results are empty'

# Restrict to the rows of the profile named in the report; if the report names
# none, every row is considered. Exactly one row must be flagged as the default.
profile_name = (contam.get('trust_profile_name') or '').strip()
if profile_name:
    profile_rows = [row for row in rows if row.get('trust_profile_name') == profile_name]
else:
    profile_rows = rows
assert profile_rows, f'No sweep rows found for profile={profile_name!r}'
default_rows = [row for row in profile_rows if row.get('is_profile_default_row')]
assert len(default_rows) == 1, f'Expected one profile default row, found {len(default_rows)}'
selected = default_rows[0]

# The aggregate section must expose the fallback-stage metrics printed below.
agg = contam.get('aggregate', {})
required_aggregate_keys = (
    'fallback_stage_counts',
    'route_miss_by_fallback_stage',
    'route_miss_rate_by_stage',
    'fallback_recovery_rate_by_stage',
    'route_miss_count_by_mode_stage',
)
for required_key in required_aggregate_keys:
    assert required_key in agg, f'Missing aggregate metric: {required_key}'

print('=== Gate-selected default row ===')
selected_row_fields = (
    'is_profile_default_row',
    'retrieval_k_initial',
    'retrieval_k',
    'reranker_confidence_threshold',
    'reranker_min_doc_score',
    'law_routing_fallback_unfiltered',
    'recall_at_k',
    'citation_precision',
    'unexpected_citation_rate',
    'false_positive_gating_rate',
    'balanced_score',
    'routing_uncertain_count',
    'fallback_stage1_accepted_count',
    'fallback_stage2_unfiltered_count',
)
for field_name in selected_row_fields:
    print(f'{field_name}: {selected.get(field_name)}')

print('\n=== Route miss by fallback stage ===')
print('counts:', agg.get('route_miss_by_fallback_stage'))
print('rates :', agg.get('route_miss_rate_by_stage'))

print('\n=== Fallback recovery by stage ===')
print(agg.get('fallback_recovery_rate_by_stage'))

print('\n=== Top route-miss law pair confusions ===')
# At most the first ten entries; a missing or empty list prints nothing.
top_confusions = agg.get('top_route_miss_law_pair_confusions') or []
for confusion in top_confusions[:10]:
    print(confusion)

In [None]:
# Controlled rerun envelope: contamination -> sweep -> gates (v1,v2).
%cd /content/lovli
import os
import json
from pathlib import Path
from datetime import datetime, timezone

# Reuse the session run id if one was exported, otherwise mint a new UTC-timestamped one.
run_id = os.environ.get('LOVLI_RUN_ID') or datetime.now(timezone.utc).strftime('colab_%Y%m%dT%H%M%SZ')
os.environ['LOVLI_RUN_ID'] = run_id
# A blank or whitespace-only TRUST_PROFILE falls back to balanced_v1.
profile_for_gates = (os.environ.get('TRUST_PROFILE') or 'balanced_v1').strip() or 'balanced_v1'
os.environ['TRUST_PROFILE'] = profile_for_gates
print('Controlled run envelope:', run_id)
print('Gate profile:', profile_for_gates)

logs_dir = Path('eval/logs')
logs_dir.mkdir(parents=True, exist_ok=True)

contam_log = logs_dir / 'analyze_law_contamination.log'
sweep_log = logs_dir / 'retrieval_sweep_full.log'
gates_v1_log = logs_dir / 'regression_gates_v1.log'
gates_v2_log = logs_dir / 'regression_gates_v2.log'

# Shared prefix of the gate command; the tier flag is appended per invocation.
base_gate_cmd = (
    'python -u scripts/check_regression_gates.py '
    '--contamination-report eval/law_contamination_report.json '
    '--sweep-results eval/retrieval_sweep_results.json '
    '--baseline eval/baselines/production_trust_baseline_v1.json '
    f'--profile "{profile_for_gates}" '
)

# Run the steps in order: contamination, sweep, then both gate tiers.
rc_contam = run_to_log(
    'python -u scripts/analyze_law_contamination.py --output eval/law_contamination_report.json',
    contam_log,
)
rc_sweep = run_to_log('python -u scripts/sweep_retrieval_thresholds.py', sweep_log)
rc_v1 = run_to_log(base_gate_cmd + '--gate-tier v1', gates_v1_log)
rc_v2 = run_to_log(base_gate_cmd + '--gate-tier v2', gates_v2_log)

print('exit codes:', {
    'contamination': rc_contam,
    'sweep': rc_sweep,
    'gates_v1': rc_v1,
    'gates_v2': rc_v2,
})

for path in [contam_log, sweep_log, gates_v1_log, gates_v2_log]:
    assert path.exists(), f'Missing log: {path}'

print_log_matches(contam_log, patterns=['Run metadata:', 'Saved contamination report', 'Gate summary:'])
print_log_matches(sweep_log, patterns=['Run metadata:', 'Parity debug:', 'Parity divergence counts:', 'Saved results:'])
print_log_matches(gates_v1_log, patterns=['Artifact metadata:', 'Gate checks passed', 'Regression gates failed', 'All regression gates passed'])
print_log_matches(gates_v2_log, patterns=['Artifact metadata:', 'Gate checks passed', 'Regression gates failed', 'All regression gates passed'])

# Stop here if an artifact-producing step failed: the JSON files read below could
# otherwise be left over from an earlier cell and be mistaken for this run's output.
assert rc_contam == 0, 'analyze_law_contamination failed; inspect eval/logs/analyze_law_contamination.log'
assert rc_sweep == 0, 'full sweep failed; inspect eval/logs/retrieval_sweep_full.log'
if rc_v1 != 0 or rc_v2 != 0:
    # Gate failures are reported, not raised, so the assessment below is still printed.
    print('WARNING: regression gates returned a non-zero exit code; see the gate logs above.')

report = json.loads(Path('eval/law_contamination_report.json').read_text(encoding='utf-8'))
rows = json.loads(Path('eval/retrieval_sweep_results.json').read_text(encoding='utf-8'))
assert rows, 'Sweep results are empty'
# Pick the single sweep row flagged as the default for the profile named in the report.
profile = report.get('trust_profile_name')
default_rows = [r for r in rows if r.get('trust_profile_name') == profile and bool(r.get('is_profile_default_row'))]
assert len(default_rows) == 1, f'Expected one default row for profile={profile}, got {len(default_rows)}'
default_row = default_rows[0]

print('\n=== Controlled run assessment (profile default row) ===')
for key in ['recall_at_k', 'citation_precision', 'unexpected_citation_rate', 'false_positive_gating_rate', 'balanced_score', 'fallback_stage1_accepted_count', 'fallback_stage2_unfiltered_count']:
    print(f'{key}: {default_row.get(key)}')

agg = report.get('aggregate', {})
for key in ['route_miss_expected_law_rate', 'dominant_law_mismatch_rate', 'fallback_recovery_rate', 'route_miss_by_fallback_stage', 'route_miss_rate_by_stage']:
    print(f'{key}: {agg.get(key)}')

In [None]:
# Must-pass regression gates against versioned baseline.
# Note: controlled rerun cell above already runs v1/v2; keep this as a standalone quick v1 check.
%cd /content/lovli
from pathlib import Path

# Resolve the gate profile; a blank or whitespace-only value means balanced_v1.
profile_for_gates = (os.environ.get('TRUST_PROFILE') or '').strip()
if not profile_for_gates:
    profile_for_gates = 'balanced_v1'
os.environ['TRUST_PROFILE'] = profile_for_gates
print('Gate profile:', profile_for_gates)

gates_log = Path('/content/lovli/eval/logs/regression_gates.log')
# Assemble the v1 gate command from its individual arguments.
gate_cmd = ' '.join([
    'python -u scripts/check_regression_gates.py',
    '--contamination-report eval/law_contamination_report.json',
    '--sweep-results eval/retrieval_sweep_results.json',
    '--baseline eval/baselines/production_trust_baseline_v1.json',
    f'--profile "{profile_for_gates}"',
    '--gate-tier v1',
])
rc = run_to_log(gate_cmd, gates_log)
print('regression gates exit_code =', rc)
gate_patterns = [
    'Artifact metadata:',
    'Gate sweep row selected',
    '[PASS]',
    '[FAIL]',
    'Gate checks passed',
    'All regression gates passed',
]
print_log_matches(gates_log, patterns=gate_patterns, limit=120)
# Fail the cell after the per-gate lines were shown.
assert rc == 0, 'regression gates failed; inspect eval/logs/regression_gates.log'

## 6. Artifact Overview and Quick Metric Check

Run acceptance targets (balanced objective):
- `recall_at_k` should improve materially vs previous baseline (~0.146)
- `citation_precision` should increase from previous baseline (~0.073)
- `unexpected_citation_rate` should decrease
- `law_coherence_filtered_count` should be non-zero on full sweep
- `missing_doc_type` must remain `0`

In [None]:
%cd /content/lovli
!ls -lah eval

import json
from pathlib import Path

def _print_fields(title, record, fields):
    """Print ``title`` followed by one indented ``name: value`` line per field (None if absent)."""
    print(title)
    for field in fields:
        print(f'  {field}: {record.get(field)}')


# Presence check for the catalog and the three evaluation artifacts.
artifacts = [
    Path('data/law_catalog.json'),
    Path('eval/law_contamination_report.json'),
    Path('eval/retrieval_sweep_results.json'),
    Path('eval/retrieval_sweep_summary.json'),
]
for p in artifacts:
    status = 'exists' if p.exists() else 'missing'
    print(f'{p}:', status)

# Each section below is printed only when its artifact exists.
report_path = Path('eval/law_contamination_report.json')
if report_path.exists():
    report = json.loads(report_path.read_text(encoding='utf-8'))
    agg = report.get('aggregate', {})
    _print_fields(
        '\nContamination aggregate:',
        agg,
        (
            'total_questions',
            'contamination_rate',
            'singleton_foreign_rate',
            'unexpected_citation_rate',
            'mean_foreign_score_gap',
        ),
    )

sweep_path = Path('eval/retrieval_sweep_results.json')
if sweep_path.exists():
    rows = json.loads(sweep_path.read_text(encoding='utf-8'))
    if rows:
        top = rows[0]
        _print_fields(
            '\nTop sweep row:',
            top,
            (
                'recall_at_k',
                'recall_at_1',
                'recall_at_3',
                'recall_at_5',
                'mrr_at_5',
                'f1_at_k',
                'citation_precision',
                'unexpected_citation_rate',
                'source_boundary_mismatch_at_k',
                'law_contamination_rate',
                'law_coherence_filtered_count',
                'promotion_gate_pass',
            ),
        )

summary_path = Path('eval/retrieval_sweep_summary.json')
if summary_path.exists():
    summary = json.loads(summary_path.read_text(encoding='utf-8'))
    _print_fields(
        '\nSweep summary:',
        summary,
        (
            'run_id',
            'rows_count',
            'promotion_gate_pass_count',
            'promotion_gate_total',
        ),
    )

In [None]:
# Optional: download key artifacts from Colab runtime.
%cd /content/lovli
from pathlib import Path
from google.colab import files

# Files offered for download; entries that do not exist are reported and skipped.
download_targets = [
    Path('eval') / 'law_contamination_report.json',
    Path('eval') / 'retrieval_sweep_results.json',
    Path('eval') / 'retrieval_sweep_summary.json',
    Path('eval') / 'logs' / 'retrieval_sweep_full.log',
]

for target in download_targets:
    if not target.exists():
        print('missing', target)
        continue
    print('downloading', target)
    files.download(str(target))