# Lovli - Retrieval Threshold Sweep (Colab)

This notebook runs retrieval-only tuning in Colab using the production sweep script.

## What it does
- Loads the project and dependencies
- Uses `scripts/sweep_retrieval_thresholds.py` as the single source of truth
- Runs a compact 5-dimension sweep:
  - `retrieval_k_initial`
  - `reranker_confidence_threshold`
  - `reranker_min_doc_score`
  - `reranker_ambiguity_min_gap`
  - `reranker_ambiguity_top_score_ceiling`
- Saves results to `eval/retrieval_sweep_results.json`

## Selection objective
- Leakage-first: improve off-topic cleanliness first, then recall/coverage/precision.

## Expected outputs
- Ranked sweep table in notebook output
- JSON file with all combinations and metrics

In [None]:
# If running in Colab, clone your repo first (skip if already cloned).
import os
if not os.path.exists("/content/lovli"):
    !git clone https://github.com/AndreasRamsli/lovli.git
%cd /content/lovli

# Satisfy google-colab's requests pin to avoid dependency conflicts
!pip install -q "requests==2.32.4"
!pip install -q -e .

In [None]:
import json
import itertools
import logging
import math
import os
import sys
from pathlib import Path

import pandas as pd
import torch

# Turn off tracing/telemetry so local experiment output stays readable.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "false",
        "LANGSMITH_TRACING": "false",
        "HF_HUB_DISABLE_TELEMETRY": "1",
    }
)


def _contains_lovli_src(base):
    """Return True when *base* holds the ``src/lovli`` package directory."""
    return (base / "src" / "lovli").exists()


# Locate the repo root: prefer the Colab clone at /content/lovli, otherwise try
# the current working directory and then its parent (first match wins).
ROOT_DIR = Path("/content/lovli")
if not _contains_lovli_src(ROOT_DIR):
    ROOT_DIR = next(
        (place for place in (Path.cwd(), Path.cwd().parent) if _contains_lovli_src(place)),
        ROOT_DIR,
    )
if not _contains_lovli_src(ROOT_DIR):
    raise FileNotFoundError(
        "lovli package not found. Run the setup cell above first (clone + pip install), "
        "then Runtime > Restart runtime, then run from this cell."
    )

# Make the src-layout package importable straight from the checkout.
src_entry = str(ROOT_DIR / "src")
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

from lovli.chain import LegalRAGChain
from lovli.config import get_settings

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Report the accelerator so it is obvious whether a GPU runtime is attached.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Paste credentials here for quick Colab runs.
OPENROUTER_API_KEY = ""  # e.g. "sk-or-v1-..." (optional for sweep, but keep non-empty)
QDRANT_URL = ""          # e.g. "https://<cluster>.cloud.qdrant.io:6333"
QDRANT_API_KEY = ""      # e.g. "qdrant_..."

# OpenRouter key resolution order: pasted value, then an existing non-empty
# environment value, then a placeholder so the variable is never empty.
os.environ["OPENROUTER_API_KEY"] = (
    OPENROUTER_API_KEY.strip()
    or os.environ.get("OPENROUTER_API_KEY")
    or "sweep-placeholder-not-used"
)

# Pasted Qdrant values override the environment; blank ones leave it untouched.
for env_name, pasted_value in (("QDRANT_URL", QDRANT_URL), ("QDRANT_API_KEY", QDRANT_API_KEY)):
    cleaned_value = pasted_value.strip()
    if cleaned_value:
        os.environ[env_name] = cleaned_value

# Both Qdrant settings must be present in the process environment to continue.
if not all(os.environ.get(env_name) for env_name in ("QDRANT_URL", "QDRANT_API_KEY")):
    raise ValueError(
        "Missing Qdrant credentials. Paste QDRANT_URL and QDRANT_API_KEY in this cell or set them in .env."
    )

# Evaluation questions (input) and sweep results (output) under <repo>/eval.
QUESTIONS_PATH = ROOT_DIR.joinpath("eval", "questions.jsonl")
OUT_PATH = ROOT_DIR.joinpath("eval", "retrieval_sweep_results.json")

# Optional: set to an int like 20 for a quick dry run
SAMPLE_SIZE = None

In [None]:
# Reuse production sweep helpers so notebook and script stay in parity.
# The `scripts` directory sits at the repository root rather than under `src/`,
# and only `ROOT_DIR / "src"` was added to sys.path earlier. Put the root itself
# on the path so this import does not depend on the kernel's working directory.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from scripts.sweep_retrieval_thresholds import (
    apply_combo_to_chain,
    evaluate_combo,
    load_questions,
    precompute_candidates,
)

print("Loaded sweep helpers from scripts/sweep_retrieval_thresholds.py")

In [None]:
# Preferred Colab execution path: run production sweep script directly.
import subprocess
import sys

# "0" leaves the index scan enabled, i.e. index validation is included in this run.
os.environ["SWEEP_SKIP_INDEX_SCAN"] = "0"

# Forward the optional sample size to the script through the environment;
# clear any stale value when a full run is requested.
if SAMPLE_SIZE:
    os.environ["SWEEP_SAMPLE_SIZE"] = str(SAMPLE_SIZE)
else:
    os.environ.pop("SWEEP_SAMPLE_SIZE", None)

result = subprocess.run(
    # sys.executable is the kernel's own interpreter, i.e. the environment the
    # project was pip-installed into; a bare "python" may resolve to another one.
    [sys.executable, "scripts/sweep_retrieval_thresholds.py"],
    # Resolve the relative script path against the repo root instead of
    # whatever the kernel's current working directory happens to be.
    cwd=str(ROOT_DIR),
    capture_output=True,
    text=True,
)
# Print the captured output before checking the exit status so a failing run
# still shows its logs.
print("RETURN CODE:", result.returncode)
print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)
result.check_returncode()  # raises CalledProcessError on a non-zero exit

with open(OUT_PATH, "r", encoding="utf-8") as f:
    rows = json.load(f)

print(f"Saved: {OUT_PATH}")
print("Top 5 configurations:")
for row in rows[:5]:
    print(row)

# Last expression of the cell: the notebook renders the first ten rows.
df = pd.DataFrame(rows)
df.head(10)

## Notes

- For a quick pass, set `SAMPLE_SIZE = 20` first.
- For final tuning, set `SAMPLE_SIZE = None`.
- The execution cell runs the production script directly to avoid notebook drift.
- `SWEEP_SKIP_INDEX_SCAN=0` is set in-notebook to include index validation.
- Copy the best row values into your `.env` or `Settings` defaults after validation.