# Research Notebook: Why LB Is Bad and What To Fix

This notebook is for **diagnostics-first** iterations on anti-fraud ranking.

Focus:
- validation mismatch (weekly vs last-day)
- label strategy / unlabeled impact
- leakage risk in graph-risk / sequence features
- representation gap between offline AP and leaderboard behavior


## Experiment Matrix (6 runs)

1. Base (time + amount)
2. Base + sequence
3. Base + graph-risk
4. Base + sequence + graph-risk
5. (red+yellow only) Base + sequence + graph-risk
6. (red+yellow only) Base + sequence + count-only graph features (no target-mean risk)


In [None]:
from pathlib import Path
import subprocess
import json
import pandas as pd

# Project root; per-run outputs are looked up under artifacts/runs/<run-name>.
ROOT = Path('/workspace/competition')
RUNS_DIR = ROOT.joinpath('artifacts', 'runs')
RUNS_DIR.mkdir(parents=True, exist_ok=True)

# FAST_MODE caps the row counts passed to the training script.
# Set it to False for full runs (no caps are passed at all).
FAST_MODE = True
if FAST_MODE:
    MAX_LABELED = 20000
    MAX_UNLABELED = 20000
    MAX_TEST = 50000
else:
    MAX_LABELED = MAX_UNLABELED = MAX_TEST = None

# Command prefix shared by every experiment; run_exp appends the
# run name, the per-experiment flags and the row caps.
BASE_CMD = ['python', 'scripts/train_baseline.py']
BASE_CMD += ['--config', 'conf/pipeline.yaml']
BASE_CMD += ['--device', 'cuda']

def run_exp(name, extra_args):
    """Run one training experiment as a subprocess and echo its output.

    The command is BASE_CMD plus ``--run-name name``, the experiment's
    ``extra_args`` and one ``--max-*-rows`` flag for every row cap that is
    not None. The process runs with ROOT as its working directory and its
    stdout is printed once it finishes.

    Args:
        name: Run name passed to the script via ``--run-name``.
        extra_args: List of additional command-line arguments.

    Raises:
        RuntimeError: If the process exits with a non-zero return code;
            its stderr is printed before raising.
    """
    row_caps = (
        ('--max-labeled-rows', MAX_LABELED),
        ('--max-unlabeled-rows', MAX_UNLABELED),
        ('--max-test-rows', MAX_TEST),
    )
    cmd = BASE_CMD + ['--run-name', name] + extra_args
    for flag, cap in row_caps:
        if cap is not None:
            cmd.extend([flag, str(cap)])

    print('RUN:', ' '.join(cmd))
    result = subprocess.run(cmd, cwd=ROOT, text=True, capture_output=True)
    print(result.stdout)
    if result.returncode == 0:
        return
    print(result.stderr)
    raise RuntimeError(f'Run failed: {name}')

def load_summary(name):
    """Load the parsed ``summary.json`` of a run.

    Args:
        name: Run name; the file is looked up at
            ``RUNS_DIR / name / 'summary.json'``.

    Returns:
        The decoded JSON content, or None if the file does not exist.
    """
    summary_path = RUNS_DIR / name / 'summary.json'
    if not summary_path.exists():
        return None
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale encoding, which is what read_text() uses by default.
    return json.loads(summary_path.read_text(encoding='utf-8'))


In [None]:
# Flag groups reused across runs. Each entry below concatenates them into a
# fresh list, so no two experiments share the same argument list object.
_NO_SEQUENCE = ['--disable-sequence']
_NO_PROFILE = ['--disable-pretrain-profile']
_GRAPH_OFF = ['--graph-risk-mode', 'off']
_GRAPH_FULL = ['--graph-risk-mode', 'full']
_GRAPH_COUNT = ['--graph-risk-mode', 'count']
_NO_UNLABELED = ['--use-unlabeled', 'false']

# Each entry is (run_name, feature_desc, extra_args).
experiments = [
    ('res_01_base', 'base',
     _NO_SEQUENCE + _NO_PROFILE + _GRAPH_OFF),
    ('res_02_base_seq', 'base+sequence',
     _NO_PROFILE + _GRAPH_OFF),
    ('res_03_base_graph', 'base+graph',
     _NO_SEQUENCE + _NO_PROFILE + _GRAPH_FULL),
    ('res_04_base_seq_graph', 'base+sequence+graph',
     _NO_PROFILE + _GRAPH_FULL),
    ('res_05_ry_only', 'red+yellow only + sequence+graph',
     _NO_PROFILE + _GRAPH_FULL + _NO_UNLABELED),
    ('res_06_ry_count_only', 'red+yellow only + sequence + graph-count-only',
     _NO_PROFILE + _GRAPH_COUNT + _NO_UNLABELED),
]
experiments

In [None]:
# WARNING: this can take time.
# Uncomment to run all experiments:

# for name, _, extra in experiments:
#     run_exp(name, extra)


In [None]:
# Build the ablation table: one row per experiment whose summary.json exists.
rows = []
for name, feature_desc, _ in experiments:
    s = load_summary(name)
    if s is None:
        # No summary for this run yet; leave it out of the table.
        continue
    rows.append({
        'run': name,
        'features': feature_desc,
        'train_rows': s.get('train_rows'),
        'include_unlabeled': s.get('include_unlabeled'),
        'weight_unlabeled_ratio': s.get('weight_unlabeled_ratio'),
        'AP_labeled': s.get('cv_ap_labeled_mean'),
        'AP_proxy_week': s.get('cv_ap_proxy_mean'),
        'AP_lastday': s.get('cv_ap_proxy_lastday_mean'),
        'graph_risk_mode': s.get('graph_risk_mode'),
        'sequence': s.get('use_sequence'),
    })

# Declare the columns explicitly: with no finished runs `rows` is empty, and a
# column-less frame would make sort_values('AP_lastday') raise KeyError.
columns = [
    'run', 'features', 'train_rows', 'include_unlabeled',
    'weight_unlabeled_ratio', 'AP_labeled', 'AP_proxy_week', 'AP_lastday',
    'graph_risk_mode', 'sequence',
]
# Best last-day AP first.
df = pd.DataFrame(rows, columns=columns).sort_values('AP_lastday', ascending=False)
df


In [None]:
# Export the ablation table as CSV so it can be shared and compared.
out = ROOT / 'artifacts' / 'research_ablation_table.csv'
if 'df' not in globals() or len(df) == 0:
    # Nothing to export: the table cell was not run or no run has a summary.
    print('no rows yet')
else:
    df.to_csv(out, index=False)
    print('saved:', out)


## Leakage / Mismatch Checklist

- If `AP_proxy_week` is high but `AP_lastday` is low, the weekly CV split still does not match the LB-like (last-day) setup.
- If the `graph` runs collapse relative to the sequence-only run, inspect the target-based graph-risk features for leakage or overfitting.
- If turning off unlabeled rows improves `AP_lastday`, the current green sampling is introducing bias.
- If train-like AP is fine but LB is poor, switch optimization target to last-day slice.


In [None]:
# Show the per-fold metrics of a single run, if its fold_metrics.csv exists.
RUN = 'res_04_base_seq_graph'
fold_path = RUNS_DIR.joinpath(RUN, 'fold_metrics.csv')
if not fold_path.exists():
    print('run not found:', RUN)
else:
    display(pd.read_csv(fold_path))
