# VLTL-Bench Evaluation Notebook

This notebook evaluates NL-to-LTL translation on the VLTL-Bench datasets.




In [1]:

import os, json, pathlib, importlib
from pprint import pprint
import pandas as pd

# Path where this notebook lives
NOTEBOOK_DIR = pathlib.Path.cwd()


In [None]:

# ---- User parameters ----
DATASET_DIR = pathlib.Path("path/to/VLTL_Bench")

# Number of examples to evaluate per dataset
N_EXAMPLES   = 500
# Frameworks to test
FRAMEWORKS   = ["NL2TL"]   # "nl2spec", "NL2TL"


In [3]:

# ---- Locate dataset files ----
if not DATASET_DIR.exists():
    raise FileNotFoundError(DATASET_DIR)

dataset_files = sorted(p for p in DATASET_DIR.glob("*.jsonl"))
print(f"Found {len(dataset_files)} datasets:")
for p in dataset_files:
    print("  -", p.name)


Found 3 datasets:
  - search_and_rescue.jsonl
  - traffic_light.jsonl
  - warehouse.jsonl


In [None]:
import os
import json
import pandas as pd
import difflib


# Paths (adjust these as needed)
eval_root = "lifting_eval"
test_root = "VLTL-Bench/test"

# Gather model and dataset names
models = sorted([d for d in os.listdir(eval_root) if os.path.isdir(os.path.join(eval_root, d))])
datasets = sorted([os.path.splitext(f)[0] for f in os.listdir(test_root) if f.endswith(".jsonl")])

# Initialize DataFrame for scores
sim_df = pd.DataFrame(index=models, columns=datasets, dtype=float)

# Compute average for first 500 entries, up to you
for model in models:
    for ds in datasets:
        eval_file = os.path.join(eval_root, model, f"{ds}.jsonl")
        test_file = os.path.join(test_root, f"{ds}.jsonl")
        preds, golds = [], []
        
        # Load predicted lifted sentences
        with open(eval_file, 'r') as f:
            for i, line in enumerate(f):
                if i >= 500: break
                data = json.loads(line)
                preds.append(" ".join(data.get("grounded_sentence", [])))
        
        # Load gold lifted sentences
        with open(test_file, 'r') as f:
            for i, line in enumerate(f):
                if i >= 500: break
                data = json.loads(line)
                golds.append(" ".join(data.get("lifted_sentence", [])))
        
        # Calculate sequence matcher ratio for each pair
        ratios = [
            difflib.SequenceMatcher(None, p, g).ratio()
            for p, g in zip(preds, golds)
        ]
        
        # Store mean ratio
        sim_df.at[model, ds] = sum(ratios) / len(ratios) if ratios else None

# Display the IoU table
display(sim_df)

Unnamed: 0,search_and_rescue,traffic_light,warehouse
gpt-3.5-turbo,0.652687,0.593536,0.679585
gpt-4.1-mini,0.944213,0.966296,0.931708
gpt-4o-mini,0.667212,0.631292,0.689837


In [None]:
import os
import json
import pandas as pd
import difflib
from IPython.display import display

# ─── ADJUST THIS ──────────────────────────────────────────────────────────────
root = "path/to/translation/eval"
frameworks = ["nl2ltl", "nl2spec"]
max_entries = 500
# ────────────────────────────────────────────────────────────────────────────────

for fw in frameworks:
    fw_dir = os.path.join(root, fw)
    if not os.path.isdir(fw_dir):
        raise FileNotFoundError(f"Framework folder not found: {fw_dir}")
    
    # 1) find eval‐types
    eval_types = sorted([
        d for d in os.listdir(fw_dir)
        if os.path.isdir(os.path.join(fw_dir, d))
    ])
    
    # 2) find translation models under the first eval‐type
    first_et = eval_types[0]
    sample_et_dir = os.path.join(fw_dir, first_et)
    trans_models = sorted([
        d for d in os.listdir(sample_et_dir)
        if os.path.isdir(os.path.join(sample_et_dir, d))
    ])
    
    # 3) find dataset names from the first translation-model directory
    sample_ds_dir = os.path.join(sample_et_dir, trans_models[0])
    datasets = sorted([
        os.path.splitext(f)[0]
        for f in os.listdir(sample_ds_dir)
        if f.endswith(".jsonl")
    ])
    
    # 4) build MultiIndex and empty DataFrames
    index = pd.MultiIndex.from_product(
        [eval_types, trans_models],
        names=["eval_type", "trans_model"]
    )
    acc_df = pd.DataFrame(index=index, columns=datasets, dtype=float)
    sim_df = pd.DataFrame(index=index, columns=datasets, dtype=float)
    
    # 5) fill in metrics
    for et in eval_types:
        for tm in trans_models:
            tm_dir = os.path.join(fw_dir, et, tm)
            for ds in datasets:
                path = os.path.join(tm_dir, f"{ds}.jsonl")
                if not os.path.isfile(path):
                    continue
                preds, targets = [], []
                with open(path) as f:
                    for i, line in enumerate(f):
                        if i >= max_entries:
                            break
                        ent = json.loads(line)

                        if "masked_tl" in ent:
                            tgt = " ".join(ent["masked_tl"])
                        elif et=="raw_nl":
                            tgt = " ".join(ent.get("tl", []))
                        pred = ent.get("prediction", "").strip()
                        # strip leading “1.” or “2.” if present
                        parts = pred.split(" ", 1)
                        if parts[0].rstrip(".").isdigit() and len(parts) > 1:
                            pred = parts[1]
                        preds.append(pred)
                        targets.append(tgt)
                if not targets:
                    continue
                # binary accuracy
                acc_df.at[(et, tm), ds] = sum(p == t for p, t in zip(preds, targets)) / len(targets)
                # average sequence‐matching ratio
                ratios = [
                    difflib.SequenceMatcher(None, p, t).ratio()
                    for p, t in zip(preds, targets)
                ]
                sim_df.at[(et, tm), ds] = sum(ratios) / len(ratios)
    
    # 6) display
    # print(f"\n\n✅ {fw.upper()} Translation — Binary Accuracy")
    # display(acc_df)
    print(f"✅ {fw.upper()} Lifted Translation Accuracy")
    display(sim_df)


✅ NL2LTL Lifted Translation Accuracy


Unnamed: 0_level_0,Unnamed: 1_level_0,GLTL,cleanup_world,conformal,navi,search_and_rescue,traffic_light,warehouse
eval_type,trans_model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gt_masked_nl,nl2ltl_gpt-3.5-turbo,0.731009,0.753702,0.609977,0.607692,0.57157,0.539246,0.579585
gt_masked_nl,nl2ltl_gpt-4.1-mini,0.577988,,0.576665,0.541025,0.449712,0.834634,0.455536
gt_masked_nl,nl2ltl_gpt-4o-mini,0.791746,0.797404,0.721603,0.648592,0.639463,0.634919,0.611368
llm_masked_nl,nl2ltl_gpt-3.5-turbo,0.298048,0.300768,0.504065,0.378565,0.690259,0.807079,0.771055
llm_masked_nl,nl2ltl_gpt-4.1-mini,0.555296,0.633278,0.673554,0.609555,0.864507,0.86499,0.827534
llm_masked_nl,nl2ltl_gpt-4o-mini,0.676144,0.688268,0.696453,0.641954,0.691281,0.675532,0.664818
raw_nl,nl2ltl_gpt-3.5-turbo,0.733297,0.248033,0.383111,0.296605,0.581645,0.525105,0.571979
raw_nl,nl2ltl_gpt-4.1-mini,0.671283,0.589388,0.595104,0.642422,0.844912,0.836368,0.837575
raw_nl,nl2ltl_gpt-4o-mini,0.794853,0.796485,0.417352,0.646165,0.677505,0.63766,0.613745


✅ NL2SPEC Lifted Translation Accuracy


Unnamed: 0_level_0,Unnamed: 1_level_0,GLTL,cleanup_world,conformal,navi,search_and_rescue,traffic_light,warehouse
eval_type,trans_model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gt_masked_nl,nl2spec_gpt-3.5-turbo,0.203417,0.237886,0.188802,0.345011,0.276003,0.273147,0.28027
gt_masked_nl,nl2spec_gpt-4.1-mini,0.362687,0.292539,0.393168,0.408544,0.496406,0.501013,0.513144
gt_masked_nl,nl2spec_gpt-4o-mini,0.37057,0.33177,0.292036,0.286237,0.290943,0.28682,0.278027
llm_masked_nl,nl2spec_gpt-3.5-turbo,0.391836,0.407691,0.270221,0.374474,0.230367,0.205939,0.250685
llm_masked_nl,nl2spec_gpt-4.1-mini,0.393381,0.396317,0.433627,0.446286,0.504459,0.522979,0.484946
llm_masked_nl,nl2spec_gpt-4o-mini,0.423226,0.416372,0.370187,0.396023,0.325893,0.313839,0.324999
raw_nl,nl2spec_gpt-3.5-turbo,0.214798,0.314115,0.255639,0.262233,0.23214,0.256174,0.283846
raw_nl,nl2spec_gpt-4.1-mini,0.358538,0.29108,0.376967,0.407149,0.496769,0.500626,0.524416
raw_nl,nl2spec_gpt-4o-mini,0.369935,0.333859,0.291975,0.286616,0.277219,0.272202,0.277302


In [None]:
import os
import json
import pandas as pd
import difflib
from IPython.display import display

# ─── CONFIG ──────────────────────────────────────────────────────────────
root = "path/to/nl2tl eval"
eval_types = ["LLM_masked_nl", "gt_lifting", "raw_nl"]
max_entries = 500
# ─────────────────────────────────────────────────────────────────────────

# discover dataset names from the first eval_type
datasets = sorted([
    os.path.splitext(f)[0]
    for f in os.listdir(os.path.join(root, eval_types[0]))
    if f.endswith(".jsonl")
])

# initialize DataFrames
acc_df = pd.DataFrame(index=eval_types, columns=datasets, dtype=float)
sim_df = pd.DataFrame(index=eval_types, columns=datasets, dtype=float)

def best_substring_similarity(prediction: str, target: str) -> float:
    """
    Return the highest SequenceMatcher ratio between `target`
    and any substring of `prediction` of length len(target).
    If prediction is shorter than target, compare whole strings.
    """
    sm = difflib.SequenceMatcher
    t_len, p_len = len(target), len(prediction)
    if p_len < t_len:
        return sm(None, prediction, target).ratio()
    best = 0.0
    for i in range(p_len - t_len + 1):
        sub = prediction[i : i + t_len]
        best = max(best, sm(None, sub, target).ratio())
    return best

# compute metrics
for et in eval_types:
    et_dir = os.path.join(root, et)
    for ds in datasets:
        file_path = os.path.join(et_dir, f"{ds}.jsonl")
        if not os.path.isfile(file_path):
            continue

        preds, targets = [], []
        with open(file_path) as f:
            for i, line in enumerate(f):
                if i >= max_entries:
                    break
                ent = json.loads(line)
                # target = grounded_sentence if present, else raw sentence
                if "masked_tl" in ent:
                    tgt = " ".join(ent["masked_tl"])
                elif et=="raw_nl":
                    tgt = " ".join(ent.get("tl", []))
                # clean prediction
                pred = ent.get("prediction", "").strip()
                parts = pred.split(" ", 1)
                if parts[0].rstrip(".").isdigit() and len(parts) > 1:
                    pred = parts[1]
                preds.append(pred)
                targets.append(tgt)

        if not targets:
            continue

        # binary accuracy
        acc_df.at[et, ds] = sum(p == t for p, t in zip(preds, targets)) / len(targets)
        # substring-based similarity
        sim_df.at[et, ds] = sum(
            best_substring_similarity(p, t)
            for p, t in zip(preds, targets)
        ) / len(targets)


print("✅ NL2TL Translation Accuracy")
display(sim_df)


✅ NL2TL Translation Accuracy


Unnamed: 0,GLTL,cleanup_world,conformal,navi,search_and_rescue,traffic_light,warehouse
bert_masked_nl,0.345215,0.313268,0.320866,0.299742,0.375832,0.373162,0.379877
gt_lifting,0.998685,0.999563,0.928629,0.997828,1.0,1.0,1.0
raw_nl,0.717932,0.709023,0.65708,0.642853,0.628225,0.586153,0.655364


In [21]:
# Jupyter notebook cell: Verification evaluation with parser-error handling

import json
import re
from pathlib import Path
from typing import List, Set, Union
from tqdm import tqdm
from functools import lru_cache
from pyModelChecking.LTL import Parser, AtomicProposition as AP, Not, And, Or, Imply, X, F, G, U
import pandas as pd

# ----------------------------------------------------------------------------
# 1 — Normalisation / implication elimination
# ----------------------------------------------------------------------------
TOKEN_MAP = {
    "globally": "G", "always": "G", "[]": "G",
    "finally": "F", "eventually": "F", "<>": "F",
    "next": "X", "until": "U",
    "not": "not", "¬": "not", "!": "not", 
    "&": "and", "∧": "and",
    "|": "or", "∨": "or", "or": "or",
    "imply": "-->", "implies": "-->", "->": "-->",
    "⇒": "-->",
    "double_implies": "-->"
}
_PARSER = Parser()
_AP_OK = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")

def _normalise_tokens(tokens: List[str]) -> str:
    out = []
    for t in tokens:
        low = t.lower()
        if low in TOKEN_MAP:
            out.append(TOKEN_MAP[low])
        elif t in ("(", ")"):
            out.append(t)
        elif _AP_OK.match(t):
            out.append(t)
        else:
            out.append(f"'{t}'")
    return " ".join(out)

def _elim_impl_tokens(tokens: List[str]) -> List[str]:
    while True:
        depth = [0] * len(tokens)
        d = 0
        for i, tok in enumerate(tokens):
            if tok == "(":
                d += 1
            depth[i] = d
            if tok == ")":
                d -= 1
        for i, tok in enumerate(tokens):
            if tok not in ("->", "-->"):
                continue
            di = depth[i]
            j = i - 1
            while j >= 0 and depth[j] >= di:
                j -= 1
            lhs_start = j + 1
            k = i + 1
            while k < len(tokens) and depth[k] >= di:
                k += 1
            rhs_end = k
            lhs = tokens[lhs_start:i]
            rhs = tokens[i+1:rhs_end]
            new = ["(", "(", "not"] + lhs + [")", "or", "("] + rhs + [")", ")"]
            tokens = tokens[:lhs_start] + new + tokens[rhs_end:]
            break
        else:
            return tokens

@lru_cache(maxsize=16384)
def _parse(formula_str: str):
    return _PARSER(formula_str)

def _eval(ast, trace: List[Set[str]], t: int = 0) -> bool:
    if isinstance(ast, AP):
        return str(ast) in trace[t]
    if isinstance(ast, Not):
        return not _eval(ast.subformula(0), trace, t)
    if isinstance(ast, And):
        return _eval(ast.subformula(0), trace, t) and _eval(ast.subformula(1), trace, t)
    if isinstance(ast, Or):
        return _eval(ast.subformula(0), trace, t) or _eval(ast.subformula(1), trace, t)
    if isinstance(ast, Imply):
        return (not _eval(ast.subformula(0), trace, t)) or _eval(ast.subformula(1), trace, t)
    if isinstance(ast, X):
        return _eval(ast.subformula(0), trace, min(t+1, len(trace)-1))
    if isinstance(ast, F):
        return any(_eval(ast.subformula(0), trace, k) for k in range(t, len(trace)))
    if isinstance(ast, G):
        return all(_eval(ast.subformula(0), trace, k) for k in range(t, len(trace)))
    if isinstance(ast, U):
        φ, ψ = ast.subformula(0), ast.subformula(1)
        for k in range(t, len(trace)):
            if _eval(ψ, trace, k):
                return all(_eval(φ, trace, j) for j in range(t, k))
        return False
    raise NotImplementedError(f"Unsupported AST node: {type(ast)}")

def _tokenise(tokens: Union[List[str], str]) -> List[str]:
    if isinstance(tokens, str):
        return re.findall(r"\w+|[()]", tokens)
    return tokens

# ----------------------------------------------------------------------------
# 2 — Load ground-truth test entries
# ----------------------------------------------------------------------------
test_set_dir = Path("VLTL-Bench/test")
datasets = ["search_and_rescue", "traffic_light", "warehouse"]
test_entries = {}
for ds in datasets:
    m = {}
    with open(test_set_dir/f"{ds}.jsonl") as f:
        for line in f:
            e = json.loads(line)
            m[e["id"]] = e
    test_entries[ds] = m

# ----------------------------------------------------------------------------
# 3 — Run evaluation with parser-error handling
# ----------------------------------------------------------------------------
base_eval_dir = Path("translation_eval")
results = []

for fw_dir in base_eval_dir.iterdir():
    if not fw_dir.is_dir(): 
        continue
    framework = fw_dir.name

    # nl2tl structure
    if framework == "nl2tl":
        # continue
        for lift_dir in fw_dir.iterdir():
            lifting = lift_dir.name
            for ds in datasets:
                file = lift_dir/f"{ds}.jsonl"
                if not file.exists():
                    continue
                total = ok_good = ok_bad = ok_both = 0
                for line in tqdm(file.open(), desc=f"{framework}/{lifting}/{ds}"):
                    e = json.loads(line)
                    gt = test_entries[ds][e["id"]]
                    mapping = {
                        pid: f"{info['action_canon']}({','.join(info.get('args_canon',[]))})"
                        for pid,info in gt["prop_dict"].items()
                    }
                    rev = {v:k for k,v in mapping.items()}
                    to_labels = lambda raw: [{rev.get(ap,ap) for ap in step} for step in raw]
                    good, bad = to_labels(gt["good_trace"]), to_labels(gt["bad_trace"])
                    phi = e["prediction"]
                    if type(phi) == List: phi = "".join(phi)
                    

                    # ---- build a clean LTL string ----
                    tokens   = _tokenise(phi)                    # list of word‐tokens
                    norm_str  = _normalise_tokens(tokens)        # e.g. "globally ( prop_1 implies … )"
                    toks      = norm_str.split()                 # back to list
                    elim      = _elim_impl_tokens(toks)          # impl‐elim
                    f_str     = " ".join(elim)                   # final formula string
                    try:
                        ast       = _parse(f_str)                    # parse AST
                                                # ---- evaluate ----
                        good_sat = _eval(ast, good)
                        bad_sat  = _eval(ast, bad)

                        if good_sat:
                            ok_good += 1
                        if not bad_sat:
                            ok_bad += 1
                        if good_sat and not bad_sat:
                            ok_both += 1
                    except Exception:
                        bad_parse +=1 

                    total += 1
                    # print(bad_parse)
                results.append((
                    framework, lifting, model, ds, total,
                    ok_good/total, ok_bad/total, ok_both/total
                ))

    else:
        for lift_dir in fw_dir.iterdir():
            lifting = lift_dir.name
            for model_dir in lift_dir.iterdir():
                model = model_dir.name
                if '4.1-mini' not in model:
                    continue
                print(model)
                for ds in datasets:
                    file = model_dir / f"{ds}.jsonl"
                    if not file.exists():
                        continue

                    total = ok_good = ok_bad = ok_both = bad_parse = 0
                    for line in tqdm(file.open(), desc=f"{framework}/{lifting}/{model}/{ds}"):
                        e = json.loads(line)
                        gt = test_entries[ds][e["id"]]

                        # rebuild prop->atom mapping
                        mapping = {
                            pid: f"{info['action_canon']}({','.join(info.get('args_canon', []))})"
                            for pid, info in gt["prop_dict"].items()
                        }
                        rev_map = {atom: pid for pid, atom in mapping.items()}
                        to_labels = lambda raw: [{rev_map.get(ap, ap) for ap in step} for step in raw]
                        good, bad = to_labels(gt["good_trace"]), to_labels(gt["bad_trace"])

                        # strip any ChatGPT prefixes/suffixes
                        phi = e["prediction"]
                        if phi.startswith('LTL:'):
                            phi = phi[4:]
                        if phi.startswith('3. *FINAL:* '):
                            phi = phi[12:]
                        for suffix in ('*FINISH*', 'FINISH'):
                            if phi.endswith(suffix):
                                phi = phi[: -len(suffix)]

                        # ---- build a clean LTL string ----
                        tokens   = _tokenise(phi)                    # list of word‐tokens
                        norm_str  = _normalise_tokens(tokens)        # e.g. "globally ( prop_1 implies … )"
                        toks      = norm_str.split()                 # back to list
                        elim      = _elim_impl_tokens(toks)          # impl‐elim
                        f_str     = " ".join(elim)                   # final formula string
                        try:
                            ast       = _parse(f_str)                    # parse AST
                                                    # ---- evaluate ----
                            good_sat = _eval(ast, good)
                            bad_sat  = _eval(ast, bad)

                            if good_sat:
                                ok_good += 1
                            if not bad_sat:
                                ok_bad += 1
                            if good_sat and not bad_sat:
                                ok_both += 1
                        except Exception:
                            bad_parse +=1 

                        total += 1
                        # print(bad_parse)
                    results.append((
                        framework, lifting, model, ds, total,
                        ok_good/total, ok_bad/total, ok_both/total
                    ))


                        # results.append((framework, lifting, model, ds, total,
                        #                 ok_good/total, ok_bad/total, ok_both/total))

# Summarize
columns = ["framework","lifting","model","dataset","total",
           "ok_good(%)","ok_bad(%)","ok_both(%)"]
df = pd.DataFrame(results, columns=columns)
df


nl2ltl_gpt-4.1-mini


nl2ltl/gt_masked_nl/nl2ltl_gpt-4.1-mini/search_and_rescue: 500it [00:00, 28858.96it/s]
nl2ltl/gt_masked_nl/nl2ltl_gpt-4.1-mini/traffic_light: 500it [00:00, 35161.16it/s]
nl2ltl/gt_masked_nl/nl2ltl_gpt-4.1-mini/warehouse: 500it [00:00, 32273.31it/s]


nl2ltl_gpt-4.1-mini


nl2ltl/raw_nl/nl2ltl_gpt-4.1-mini/search_and_rescue: 500it [00:00, 41596.95it/s]
nl2ltl/raw_nl/nl2ltl_gpt-4.1-mini/traffic_light: 500it [00:00, 44155.22it/s]
nl2ltl/raw_nl/nl2ltl_gpt-4.1-mini/warehouse: 500it [00:00, 43201.94it/s]


nl2ltl_gpt-4.1-mini


nl2ltl/llm_masked_nl/nl2ltl_gpt-4.1-mini/search_and_rescue: 500it [00:00, 33527.07it/s]
nl2ltl/llm_masked_nl/nl2ltl_gpt-4.1-mini/traffic_light: 500it [00:00, 39667.70it/s]
nl2ltl/llm_masked_nl/nl2ltl_gpt-4.1-mini/warehouse: 500it [00:00, 41457.16it/s]
nl2tl/raw_nl/search_and_rescue: 500it [00:00, 25163.51it/s]
nl2tl/raw_nl/traffic_light: 500it [00:00, 26138.92it/s]
nl2tl/raw_nl/warehouse: 500it [00:00, 24994.06it/s]
nl2tl/gt_lifting/search_and_rescue: 500it [00:00, 44330.69it/s]
nl2tl/gt_lifting/traffic_light: 500it [00:00, 52791.74it/s]
nl2tl/gt_lifting/warehouse: 500it [00:00, 46853.26it/s]


nl2spec_gpt-4.1-mini


nl2spec/gt_masked_nl/nl2spec_gpt-4.1-mini/search_and_rescue: 500it [00:00, 17978.77it/s]
nl2spec/gt_masked_nl/nl2spec_gpt-4.1-mini/traffic_light: 500it [00:00, 21163.05it/s]
nl2spec/gt_masked_nl/nl2spec_gpt-4.1-mini/warehouse: 500it [00:00, 23947.47it/s]


nl2spec_gpt-4.1-mini


nl2spec/raw_nl/nl2spec_gpt-4.1-mini/search_and_rescue: 500it [00:00, 22009.26it/s]
nl2spec/raw_nl/nl2spec_gpt-4.1-mini/traffic_light: 500it [00:00, 22718.58it/s]
nl2spec/raw_nl/nl2spec_gpt-4.1-mini/warehouse: 500it [00:00, 25197.98it/s]


nl2spec_gpt-4.1-mini


nl2spec/llm_masked_nl/nl2spec_gpt-4.1-mini/search_and_rescue: 500it [00:00, 25899.08it/s]
nl2spec/llm_masked_nl/nl2spec_gpt-4.1-mini/traffic_light: 500it [00:00, 28145.91it/s]
nl2spec/llm_masked_nl/nl2spec_gpt-4.1-mini/warehouse: 500it [00:00, 27425.94it/s]


Unnamed: 0,framework,lifting,model,dataset,total,ok_good(%),ok_bad(%),ok_both(%)
0,nl2ltl,gt_masked_nl,nl2ltl_gpt-4.1-mini,search_and_rescue,500,0.106,0.32,0.074
1,nl2ltl,gt_masked_nl,nl2ltl_gpt-4.1-mini,traffic_light,500,0.618,0.592,0.366
2,nl2ltl,gt_masked_nl,nl2ltl_gpt-4.1-mini,warehouse,500,0.124,0.362,0.098
3,nl2ltl,raw_nl,nl2ltl_gpt-4.1-mini,search_and_rescue,500,0.616,0.614,0.354
4,nl2ltl,raw_nl,nl2ltl_gpt-4.1-mini,traffic_light,500,0.646,0.602,0.384
5,nl2ltl,raw_nl,nl2ltl_gpt-4.1-mini,warehouse,500,0.524,0.586,0.262
6,nl2ltl,llm_masked_nl,nl2ltl_gpt-4.1-mini,search_and_rescue,500,0.496,0.588,0.318
7,nl2ltl,llm_masked_nl,nl2ltl_gpt-4.1-mini,traffic_light,500,0.532,0.598,0.362
8,nl2ltl,llm_masked_nl,nl2ltl_gpt-4.1-mini,warehouse,500,0.448,0.566,0.264
9,nl2tl,raw_nl,nl2ltl_gpt-3.5-turbo,search_and_rescue,500,0.114,0.114,0.114


In [24]:
def are_strings_similar(str1, str2, max_diff):
    """
    Checks if two strings are the same within a maximum difference 
    in the number of characters.

    Args:
        str1: The first string.
        str2: The second string.
        max_diff: The maximum allowed difference in characters.

    Returns:
        True if the strings are similar within the max_diff, False otherwise.
    """
    if abs(len(str1) - len(str2)) > max_diff:
        return False

    diff_count = 0
    min_len = min(len(str1), len(str2))

    for i in range(min_len):
        if str1[i] != str2[i]:
            diff_count += 1
    
    diff_count += abs(len(str1) - len(str2))

    return diff_count <= max_diff

# Example usage
string1 = "apple"
string2 = "aplle"
max_difference = 1
result = are_strings_similar(string1, string2, max_difference)
print(f"Strings '{string1}' and '{string2}' are similar: {result}")

string3 = "banana"
string4 = "bananas"
max_difference = 1
result = are_strings_similar(string3, string4, max_difference)
print(f"Strings '{string3}' and '{string4}' are similar: {result}")

string5 = "grape"
string6 = "fruit"
max_difference = 2
result = are_strings_similar(string5, string6, max_difference)
print(f"Strings '{string5}' and '{string6}' are similar: {result}")

Strings 'apple' and 'aplle' are similar: True
Strings 'banana' and 'bananas' are similar: True
Strings 'grape' and 'fruit' are similar: False


In [26]:
# Jupyter evaluation segment: exact‐match and prop‐level accuracy with fence‐stripping

import json
from pathlib import Path
import pandas as pd

# Adjust these paths if needed
RESULTS_DIR = Path("grounding_eval")
TEST_DIR    = Path("VLTL-Bench/test")

# 1. Load ground‑truth prop_dicts
test_data = {}
for ds_file in TEST_DIR.glob("*.jsonl"):
    ds_name = ds_file.stem
    entries = [json.loads(line) for line in ds_file.open("r")]
    test_data[ds_name] = {entry["id"]: entry["prop_dict"] for entry in entries}

# 2. Parse model outputs and strip code fences
records = []
for model_dir in RESULTS_DIR.iterdir():
    if not model_dir.is_dir():
        continue
    model_name = model_dir.name
    for result_file in model_dir.glob("*.jsonl"):
        stem = result_file.stem              # e.g. "search_and_rescue_base"
        dataset, prompt_type = stem.rsplit("_", 1)
        responses = [json.loads(line) for line in result_file.open("r")]

        for resp in responses:
            # extract test-entry ID from custom_id: "dataset-model-entryid"
            cid = resp["custom_id"]
            entry_id = int(cid.split("-")[-1])

            # raw assistant content
            raw = resp["response"]["body"]["choices"][0]["message"]["content"]

            # strip fences and prefixes
            clean = raw.strip()
            if clean.startswith("```"):
                lines = clean.splitlines()
                # drop leading fence line
                lines = lines[1:]
                # drop trailing fence if present
                if lines and lines[-1].strip().startswith("```"):
                    lines = lines[:-1]
                clean = "\n".join(lines)
            # remove any "prop_dict:" prefix before JSON
            if clean.lstrip().startswith("prop_dict"):
                idx = clean.find("{")
                clean = clean[idx:]

            # parse JSON
            try:
                pred_dict = json.loads(clean)
            except json.JSONDecodeError:
                pred_dict = None

            # ground truth
            gt_dict = test_data.get(dataset, {}).get(entry_id, {})

            # exact‐match?
            exact = (pred_dict == gt_dict)

            # prop‐level correctness
            total_props   = len(gt_dict)
            correct_props = 0
            if isinstance(pred_dict, dict):
                for key, val in gt_dict.items():
                    if pred_dict.get(key) == val:
                        correct_props += 1

            records.append({
                "model":         model_name,
                "dataset":       dataset,
                "prompt":        prompt_type,
                "id":            entry_id,
                "exact_match":   exact,
                "correct_props": correct_props,
                "total_props":   total_props
            })

# 3. Build DataFrame and compute accuracies
df = pd.DataFrame(records)

# a) Entry‐level exact match accuracy
exact_acc = (
    df
    .groupby(["model", "prompt", "dataset"])["exact_match"]
    .mean()
    .reset_index()
    .rename(columns={"exact_match": "exact_match_accuracy"})
)

# b) Prop‐level accuracy (micro average across all props)
prop_acc = (
    df
    .groupby(["model", "prompt", "dataset"])
    .sum()[["correct_props", "total_props"]]
    .assign(prop_accuracy=lambda x: x["correct_props"] / x["total_props"])
    .reset_index()[["model", "prompt", "dataset", "prop_accuracy"]]
)

# Merge both metrics
accuracy = exact_acc.merge(prop_acc, on=["model", "prompt", "dataset"])

accuracy


Unnamed: 0,model,prompt,dataset,exact_match_accuracy,prop_accuracy
0,3_5_turbo,base,search_and_rescue,0.342,0.569524
1,3_5_turbo,base,traffic_light,0.514,0.695441
2,3_5_turbo,base,warehouse,0.074,0.182785
3,3_5_turbo,scenario,search_and_rescue,0.636,0.766667
4,3_5_turbo,scenario,traffic_light,0.208,0.372454
5,3_5_turbo,scenario,warehouse,0.05,0.136364
6,4_1_mini,base,search_and_rescue,0.604,0.773333
7,4_1_mini,base,traffic_light,0.458,0.674103
8,4_1_mini,base,warehouse,0.078,0.237911
9,4_1_mini,scenario,search_and_rescue,0.452,0.686667
