In [1]:
from google.colab import drive

# Google Drive 마운트
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip -q install -U prometheus-eval vllm transformers accelerate pandas tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.2/438.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.0/180.0 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import os, pandas as pd
from tqdm import tqdm

# ⚠️ 네 드라이브 경로 맞추기
BASE = "/content/drive/MyDrive/404DNF"

# 평가 대상 CSV들 (열: String, Type, predicate 필요)
FILES = {
    "template":   f"{BASE}/template_predicate.csv",
    "contextual": f"{BASE}/contextual_predicate.csv",
    "paraphrase": f"{BASE}/paraphrase_predicate.csv",
}

# Ground Truth: 최소 [String, Type, Predicate] (대문자 주의)
GT_CSV = f"{BASE}/predicate_GT.csv"

for k, v in FILES.items():
    print(k, os.path.exists(v), v)
print("GT:", os.path.exists(GT_CSV), GT_CSV)

template True /content/drive/MyDrive/404DNF/template_predicate.csv
contextual True /content/drive/MyDrive/404DNF/contextual_predicate.csv
paraphrase True /content/drive/MyDrive/404DNF/paraphrase_predicate.csv
GT: True /content/drive/MyDrive/404DNF/predicate_GT.csv


In [4]:
import pandas as pd

csv_path = "/content/drive/MyDrive/404DNF/template_predicate.csv"  # 네 csv 경로
df = pd.read_csv(csv_path)

# Type 별 개수
print("=== Type 분포 ===")
print(df["Type"].value_counts())

# Predicate 별 개수
print("\n=== Predicate 분포 ===")
print(df["predicate"].value_counts())

# Type-Predicate 교차 분포 (피벗 형태)
print("\n=== Type-Predicate 교차표 ===")
print(df.groupby(["Type", "predicate"]).size())

=== Type 분포 ===
Type
Not Dark Pattern    1600
Social Proof         400
Urgency              400
Misdirection         400
Scarcity             400
Name: count, dtype: int64

=== Predicate 분포 ===
predicate
Activity Notifications              361
Low-stock Messages                  360
Limited-time Messages               240
Countdown Timers                    160
Pressured Selling                   157
Confirmshaming                      137
Trick Questions                     106
High-demand Messages                 40
Testimonials of Uncertain Origin     39
Name: count, dtype: int64

=== Type-Predicate 교차표 ===
Type          predicate                       
Misdirection  Confirmshaming                      137
              Pressured Selling                   157
              Trick Questions                     106
Scarcity      High-demand Messages                 40
              Low-stock Messages                  360
Social Proof  Activity Notifications              361
           

In [5]:
DEFINITIONS = {
    "Urgency": {
        "Definition": (
            "Urgency dark patterns pressure users by limiting available time, which reduces their ability to "
            "carefully evaluate information and may cause stress or anxiety. This pressure can be exploited to push "
            "users into actions not in their best interest."
        ),
        "Predicates": {
            "Countdown Timers": "A visible timer showing that a deal or discount will expire soon.",
            "Limited-time Messages": "Claims that a deal or sale will end 'soon' without providing a clear deadline.",
        },
    },
    "Misdirection": {
        "Definition": (
            "Misdirection manipulates user attention by distracting or confusing them. It emphasizes certain options "
            "while hiding or downplaying others, leading users to make unintended choices."
        ),
        "Predicates": {
            "Confirmshaming": "Uses shame or emotional wording to discourage a certain choice.",
            "Trick Questions": "Uses confusing or ambiguous wording to steer users toward unintended choices.",
            "Pressured Selling": "Preselects or pressures users to accept more expensive product options or add-ons.",
        },
    },
    "Social Proof": {
        "Definition": (
            "Social Proof exploits social cues to influence behavior. It creates the perception that others are "
            "already acting, pressuring users to conform and undermining independent decision-making."
        ),
        "Predicates": {
            "Activity Notifications": "Real or simulated messages like '5 people just bought this' to induce quick action; often exaggerated/fabricated.",
            "Testimonials of Uncertain Origin": "Reviews/ratings/endorsements without reliable sources, designed to build false trust.",
        },
    },
    "Scarcity": {
        "Definition": (
            "Scarcity creates a false or exaggerated sense of limited availability, exploiting fear of missing out "
            "(FOMO) to push rushed decisions."
        ),
        "Predicates": {
            "Low-stock Messages": "Warnings like 'Only 2 left in stock', often exaggerated or fabricated.",
            "High-demand Messages": "Claims like '50 people are viewing this now', creating artificial competition/urgency.",
        },
    },
    "Not Dark Pattern": {
        "Definition": "Content that does not represent any dark pattern in this taxonomy.",
        "Predicates": {"None": "Always used for this Type."},
    },
}

# (단수/표기차) 정규화 맵 — GT 라벨이 정의 키와 다를 때 매칭 보정
PRED_NORMALIZE = {
    "Low-stock Message": "Low-stock Messages",
    "Activity Notification": "Activity Notifications",
    "Countdown Timer": "Countdown Timers",
    "Limited-time Message": "Limited-time Messages",
    "High-demand Message": "High-demand Messages",
    "Testimonial of Uncertain Origin": "Testimonials of Uncertain Origin",
}

# predicate -> type 역매핑
P2T = {}
for _t, spec in DEFINITIONS.items():
    for _p in spec.get("Predicates", {}).keys():
        P2T[_p] = _t

In [6]:
import pandas as pd

def norm_pred(p):
    if isinstance(p, str):
        return PRED_NORMALIZE.get(p, p)
    return p

# GT 불러오기: 기본은 String, Type, Predicate (대문자 시작) 가정하지만,
# 혹시 다르게 와도 자동으로 찾아서 매핑
gt_raw = pd.read_csv(GT_CSV)
gt_cols_lower = {c.strip().lower(): c for c in gt_raw.columns}

def require_col(lower_name, candidates=("string","type","predicate")):
    # lower_name 은 "string"/"type"/"predicate" 중 하나
    # gt_cols_lower 에서 해당 키를 찾되, 유사 후보도 탐색
    if lower_name in gt_cols_lower:
        return gt_cols_lower[lower_name]
    # 후보군으로도 탐색
    cand_map = {
        "string":    ["string", "text", "sentence"],
        "type":      ["type", "label_type", "class", "category"],
        "predicate": ["predicate", "label", "pred", "subtype"]
    }
    for k in cand_map[lower_name]:
        if k in gt_cols_lower:
            return gt_cols_lower[k]
    raise AssertionError(f"GT: required column for '{lower_name}' not found. got columns={list(gt_raw.columns)}")

gt_col_String    = require_col("string")
gt_col_Type      = require_col("type")
gt_col_Predicate = require_col("predicate")

# 표기 차이 정규화
gt_raw[gt_col_Predicate] = gt_raw[gt_col_Predicate].map(norm_pred)

# 같은 String에 여러 라벨이 있으면 '최빈값'으로 정리
pred_mode = gt_raw.groupby(gt_col_String)[gt_col_Predicate].agg(lambda s: s.mode().iat[0])
type_mode = gt_raw.groupby(gt_col_String)[gt_col_Type].agg(lambda s: s.mode().iat[0])

gt = pd.concat([pred_mode, type_mode], axis=1).reset_index()
gt.columns = ["String", "Predicate", "Type"]   # 표준화해두면 이후 일관됨

# 빠른 조회용 딕셔너리 (문장 → GT 라벨/타입)
GT_PRED_BY_STRING = dict(zip(gt["String"], gt["Predicate"]))
GT_TYPE_BY_STRING = dict(zip(gt["String"], gt["Type"]))

def get_predicate_definition(_type, _pred):
    # 정의에서 못 찾으면 정규화 한번 더 시도
    d = DEFINITIONS.get(_type, {}).get("Predicates", {}).get(_pred)
    if d is None:
        d = DEFINITIONS.get(_type, {}).get("Predicates", {}).get(norm_pred(_pred), "")
    return d

def build_reference_answer_for_string(text, fallback_type, fallback_pred):
    """
    String(문장)으로 GT 라벨/타입을 찾아 Reference Answer(자연어) 생성.
    없으면 현재 행의 Type/predicate로 fallback.
    """
    gt_pred = GT_PRED_BY_STRING.get(text, fallback_pred)
    gt_type = GT_TYPE_BY_STRING.get(text, P2T.get(norm_pred(gt_pred), fallback_type))
    pred_def = get_predicate_definition(gt_type, gt_pred)

    ref = (
        "Reference answer (score 5):\n"
        f"- Correct predicate: {gt_pred}\n"
        f"- Reason: This text aligns with the predicate definition: {pred_def}\n"
        f"- Therefore, label = {gt_pred}"
    )
    return ref, pred_def, gt_type, gt_pred

print("GT rows:", len(gt), "| sample:")
display(gt.head(3))

GT rows: 1135 | sample:


Unnamed: 0,String,Predicate,Type
0,"""We'd love to send you emails with offers and ...",Trick Questions,Misdirection
1,"""Your purchase entitles you to the following s...",Pressured Selling,Misdirection
2,#6 Bestseller in Rugs This item was viewed 92 ...,Activity Notifications,Social Proof


In [7]:

from prometheus_eval.vllm import VLLM
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import ABSOLUTE_PROMPT

judge = PrometheusEval(
    model=VLLM(
        model="prometheus-eval/prometheus-7b-v2.0",
        dtype="float16",
        gpu_memory_utilization=0.70,
        tensor_parallel_size=1,
        enforce_eager=True,
        trust_remote_code=True,
    ),
    absolute_grade_template=ABSOLUTE_PROMPT
)
print("Judge ready.")

INFO 10-30 08:50:57 [__init__.py:216] Automatically detected platform cuda.
INFO 10-30 08:51:07 [utils.py:233] non-default args: {'trust_remote_code': True, 'dtype': 'float16', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'enforce_eager': True, 'model': 'prometheus-eval/prometheus-7b-v2.0'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

INFO 10-30 08:51:28 [model.py:547] Resolved architecture: MistralForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 10-30 08:51:28 [model.py:1510] Using max model len 32768
INFO 10-30 08:51:32 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 10-30 08:51:32 [__init__.py:381] Cudagraph is disabled under eager mode


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

INFO 10-30 08:52:42 [llm.py:306] Supported_tasks: ['generate']
Judge ready.


In [8]:
def build_rubric(_type, pred):
    return (
        "Criteria: Predicate validity.\n"
        f"Type: {_type}\n"
        f"Type definition: {DEFINITIONS.get(_type,{}).get('Definition','(none)')}\n"
        f"Predicate: {pred}\n"
        f"Predicate definition: {DEFINITIONS.get(_type,{}).get('Predicates',{}).get(pred,'(none)')}\n"
        "Score 1: clearly invalid/no evidence\n"
        "Score 3: ambiguous/partial\n"
        "Score 5: strongly valid with clear evidence"
    )

def make_instruction(_type, text):
    return f"Decide if the text fits a specific predicate within the given Type.\n[Type] {_type}\n[Text] {text}"

def make_response(pred):
    return f"Model-chosen predicate: {pred}\nEvidence: (observable cues)"

def judge_batch_with_retry(instructions, responses, references, rubrics, max_depth=4):
    n = len(instructions)
    try:
        fbs, scs = judge.absolute_grade(
            instructions=instructions,
            responses=responses,
            reference_answers=references,
            rubric=rubrics,
            params={}
        )
        return (fbs, scs), []
    except Exception as e:
        if n == 1 or max_depth == 0:
            return (["[ERROR] " + str(e)], [None]), [0]
        mid = n // 2
        (f1, s1), e1 = judge_batch_with_retry(instructions[:mid], responses[:mid], references[:mid], rubrics[:mid], max_depth-1)
        (f2, s2), e2 = judge_batch_with_retry(instructions[mid:], responses[mid:], references[mid:], rubrics[mid:], max_depth-1)
        return (f1+f2, s1+s2), (e1 + [mid+i for i in e2])

In [9]:
def eval_and_merge_one(
    name: str,
    csv_path: str,
    out_suffix: str = "_with_prom.csv",
    batch_size: int = 64,
    skip_nd: bool = True,
    fix_nd: bool  = False,
    max_text_len: int = 512
):
    import pandas as pd, numpy as np, os
    from tqdm import tqdm

    assert os.path.exists(csv_path), f"{csv_path} not found"
    df_orig = pd.read_csv(csv_path)

    # ---- (1) 컬럼명 대소문자/공백 정규화 → 내부 처리 표준: string/type/predicate ----
    df = df_orig.copy()
    df.columns = [c.strip() for c in df.columns]          # 공백 제거
    lower_map = {c.lower(): c for c in df.columns}         # lower -> original

    def pick_col(candidates):
        for c in candidates:
            if c in lower_map:
                return lower_map[c]
        return None

    col_string = pick_col(["string","text","sentence"])
    col_type   = pick_col(["type","label_type","class","category"])
    col_pred   = pick_col(["predicate","label","pred","subtype"])

    need_missing = [n for n,v in {"String":col_string,"Type":col_type,"Predicate":col_pred}.items() if v is None]
    if need_missing:
        raise AssertionError(f"{name}: need columns {need_missing} in {list(df.columns)}")

    # ND 처리 (표기 다양성 대비: 소문자로 비교)
    if skip_nd:
        work = df[df[col_type].astype(str).str.lower() != "not dark pattern"].copy()
    else:
        work = df.copy()
        if fix_nd:
            bad = (work[col_type].astype(str).str.lower() == "not dark pattern") & (work[col_pred] != "None")
            if bad.any():
                work.loc[bad, col_pred] = "None"

    if max_text_len:
        work[col_string] = work[col_string].astype(str).str.slice(0, max_text_len)

    # 유니크 키 (type, string, predicate) 기준으로 한 번만 채점
    key_cols = [col_type, col_string, col_pred]
    work["_key"] = work[key_cols].astype(str).agg("||".join, axis=1)
    uniq_keys = work["_key"].unique().tolist()

    instrs, resps, refs, rubrics, keys = [], [], [], [], []
    for k in uniq_keys:
        t, x, p = k.split("||", 2)
        # GT 기반 Reference Answer 생성 (없으면 현재 라벨로 fallback)
        ref_answer, gt_pred_def, gt_type, gt_pred = build_reference_answer_for_string(x, t, p)
        instrs.append(make_instruction(t, x))
        resps.append(make_response(p))
        refs.append(f"Predicate definition: {gt_pred_def}\n{ref_answer}")
        rubrics.append(build_rubric(t, p))
        keys.append(k)

    scores_map, fb_map, bad_keys = {}, {}, []
    for i in tqdm(range(0, len(instrs), batch_size), desc=f"{name} (batched)"):
        ib, rb, refb, rubb, kb = instrs[i:i+batch_size], resps[i:i+batch_size], refs[i:i+batch_size], rubrics[i:i+batch_size], keys[i:i+batch_size]
        (fbs, scs), err_idx = judge_batch_with_retry(ib, rb, refb, rubb, max_depth=4)
        for j,(fb,sc) in enumerate(zip(fbs, scs)):
            scores_map[kb[j]] = sc
            fb_map[kb[j]] = fb
        bad_keys.extend([kb[j] for j in err_idx])

    # 유니크 결과를 work에 붙임
    work["prom_score_consistency"] = work["_key"].map(scores_map)
    work["prom_feedback"]          = work["_key"].map(fb_map)
    work["prom_ok"]                = ~work["_key"].isin(bad_keys)
    work.drop(columns=["_key"], inplace=True)

    # ---- (2) 원본 df와 병합: 원본 컬럼명 보존
    work_4merge = work[[col_string, col_type, col_pred, "prom_score_consistency", "prom_feedback", "prom_ok"]].copy()
    out_df = df_orig.merge(
        work_4merge,
        left_on=[col_string, col_type, col_pred],
        right_on=[col_string, col_type, col_pred],
        how="left"
    )

    # 저장: 원본과 같은 폴더에 *_with_prom.csv
    out_path = os.path.join(
        os.path.dirname(csv_path),
        f"{os.path.splitext(os.path.basename(csv_path))[0]}{out_suffix}"
    )
    out_df.to_csv(out_path, index=False, encoding="utf-8")

    # 요약 출력
    m = out_df["prom_score_consistency"]
    mean_score = pd.to_numeric(m, errors="coerce").mean()
    ok = out_df["prom_ok"].fillna(False).sum()
    total = len(out_df)
    print(f"[OK] {name}: total={total} | ok={ok} | mean={mean_score:.3f} → {out_path}")

    return out_path

In [10]:
outputs = {}
for name, path in FILES.items():
    outputs[name] = eval_and_merge_one(
        name, path,
        out_suffix="_with_prom.csv",
        batch_size=64,
        skip_nd=True,
        max_text_len=512
    )
outputs

template (batched):   0%|          | 0/25 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10075.27it/s]
template (batched):   4%|▍         | 1/25 [00:10<04:01, 10.06s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10769.30it/s]
template (batched):   8%|▊         | 2/25 [00:19<03:40,  9.60s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 11059.01it/s]
template (batched):  12%|█▏        | 3/25 [00:29<03:36,  9.86s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10485.76it/s]
template (batched):  16%|█▌        | 4/25 [00:39<03:26,  9.82s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9315.50it/s]
template (batched):  20%|██        | 5/25 [00:50<03:26, 10.34s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10533.90it/s]
template (batched):  24%|██▍       | 6/25 [01:00<03:17, 10.38s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10148.79it/s]
template (batched):  28%|██▊       | 7/25 [01:10<02:59,  9.96s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10242.50it/s]
template (batched):  32%|███▏      | 8/25 [01:19<02:44,  9.66s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 11460.83it/s]
template (batched):  36%|███▌      | 9/25 [01:27<02:30,  9.40s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9566.14it/s]
template (batched):  40%|████      | 10/25 [01:39<02:30, 10.04s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10041.73it/s]
template (batched):  44%|████▍     | 11/25 [01:49<02:19,  9.94s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9691.16it/s]
template (batched):  48%|████▊     | 12/25 [02:01<02:18, 10.64s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 11116.26it/s]
template (batched):  52%|█████▏    | 13/25 [02:10<02:01, 10.10s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10238.98it/s]
template (batched):  56%|█████▌    | 14/25 [02:19<01:47,  9.74s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9223.64it/s]
template (batched):  60%|██████    | 15/25 [02:28<01:35,  9.56s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10827.07it/s]
template (batched):  64%|██████▍   | 16/25 [02:37<01:25,  9.45s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9598.29it/s]
template (batched):  68%|██████▊   | 17/25 [02:46<01:15,  9.40s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10786.17it/s]
template (batched):  72%|███████▏  | 18/25 [02:55<01:04,  9.15s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 8492.64it/s]
template (batched):  76%|███████▌  | 19/25 [03:05<00:56,  9.45s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10687.83it/s]
template (batched):  80%|████████  | 20/25 [03:15<00:48,  9.60s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9204.97it/s]
template (batched):  84%|████████▍ | 21/25 [03:24<00:37,  9.31s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9922.58it/s]
template (batched):  88%|████████▊ | 22/25 [03:36<00:30, 10.13s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9990.15it/s]
template (batched):  92%|█████████▏| 23/25 [03:48<00:21, 10.87s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9609.98it/s]
template (batched):  96%|█████████▌| 24/25 [03:57<00:10, 10.27s/it]

Adding requests:   0%|          | 0/53 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/53 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 53/53 instances.



Finalizing: 100%|██████████| 53/53 [00:00<00:00, 7937.80it/s]
template (batched): 100%|██████████| 25/25 [04:07<00:00,  9.88s/it]

[OK] template: total=3222 | ok=1621 | mean=4.244 → /content/drive/MyDrive/404DNF/template_predicate_with_prom.csv



  ok = out_df["prom_ok"].fillna(False).sum()
contextual (batched):   0%|          | 0/23 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 8974.77it/s]
contextual (batched):   4%|▍         | 1/23 [00:09<03:26,  9.38s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10625.22it/s]
contextual (batched):   9%|▊         | 2/23 [00:18<03:07,  8.95s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10140.74it/s]
contextual (batched):  13%|█▎        | 3/23 [00:26<02:54,  8.71s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9036.10it/s]
contextual (batched):  17%|█▋        | 4/23 [00:36<02:52,  9.07s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9924.04it/s]
contextual (batched):  22%|██▏       | 5/23 [00:45<02:42,  9.04s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 7200.52it/s]
contextual (batched):  26%|██▌       | 6/23 [00:53<02:32,  8.97s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9399.00it/s]
contextual (batched):  30%|███       | 7/23 [01:03<02:29,  9.32s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10389.58it/s]
contextual (batched):  35%|███▍      | 8/23 [01:12<02:16,  9.12s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9540.98it/s]
contextual (batched):  39%|███▉      | 9/23 [01:22<02:11,  9.41s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 11380.65it/s]
contextual (batched):  43%|████▎     | 10/23 [01:30<01:54,  8.84s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10570.41it/s]
contextual (batched):  48%|████▊     | 11/23 [01:38<01:44,  8.73s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10202.79it/s]
contextual (batched):  52%|█████▏    | 12/23 [01:47<01:34,  8.61s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10245.63it/s]
contextual (batched):  57%|█████▋    | 13/23 [01:57<01:30,  9.06s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 5684.66it/s]
contextual (batched):  61%|██████    | 14/23 [02:06<01:21,  9.05s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 5985.72it/s]
contextual (batched):  65%|██████▌   | 15/23 [02:15<01:14,  9.29s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 7500.29it/s]
contextual (batched):  70%|██████▉   | 16/23 [02:24<01:03,  9.10s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10834.50it/s]
contextual (batched):  74%|███████▍  | 17/23 [02:33<00:54,  9.02s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 5683.10it/s]
contextual (batched):  78%|███████▊  | 18/23 [02:41<00:43,  8.78s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9789.05it/s]
contextual (batched):  83%|████████▎ | 19/23 [02:51<00:35,  8.97s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 5688.40it/s]
contextual (batched):  87%|████████▋ | 20/23 [02:59<00:26,  8.80s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9893.68it/s]
contextual (batched):  91%|█████████▏| 21/23 [03:10<00:19,  9.54s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9691.86it/s]
contextual (batched):  96%|█████████▌| 22/23 [03:21<00:09,  9.89s/it]

Adding requests:   0%|          | 0/60 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/60 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 60/60 instances.



Finalizing: 100%|██████████| 60/60 [00:00<00:00, 8946.26it/s]
contextual (batched): 100%|██████████| 23/23 [03:30<00:00,  9.15s/it]

[OK] contextual: total=3496 | ok=1894 | mean=4.221 → /content/drive/MyDrive/404DNF/contextual_predicate_with_prom.csv



  ok = out_df["prom_ok"].fillna(False).sum()
paraphrase (batched):   0%|          | 0/18 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10770.59it/s]
paraphrase (batched):   6%|▌         | 1/18 [00:09<02:45,  9.73s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10079.81it/s]
paraphrase (batched):  11%|█         | 2/18 [00:18<02:29,  9.32s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 7541.80it/s]
paraphrase (batched):  17%|█▋        | 3/18 [00:28<02:21,  9.40s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 8983.78it/s]
paraphrase (batched):  22%|██▏       | 4/18 [00:37<02:13,  9.52s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9143.52it/s]
paraphrase (batched):  28%|██▊       | 5/18 [00:46<02:01,  9.31s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10539.69it/s]
paraphrase (batched):  33%|███▎      | 6/18 [00:55<01:50,  9.19s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9136.98it/s]
paraphrase (batched):  39%|███▉      | 7/18 [01:06<01:45,  9.63s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 8626.65it/s]
paraphrase (batched):  44%|████▍     | 8/18 [01:14<01:32,  9.29s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 8841.46it/s]
paraphrase (batched):  50%|█████     | 9/18 [01:24<01:25,  9.49s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9217.93it/s]
paraphrase (batched):  56%|█████▌    | 10/18 [01:34<01:17,  9.66s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9425.40it/s]
paraphrase (batched):  61%|██████    | 11/18 [01:45<01:09,  9.93s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9169.44it/s]
paraphrase (batched):  67%|██████▋   | 12/18 [01:55<00:59,  9.97s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 5705.32it/s]
paraphrase (batched):  72%|███████▏  | 13/18 [02:04<00:47,  9.57s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9325.21it/s]
paraphrase (batched):  78%|███████▊  | 14/18 [02:12<00:36,  9.24s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 9215.40it/s]
paraphrase (batched):  83%|████████▎ | 15/18 [02:22<00:27,  9.28s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10770.16it/s]
paraphrase (batched):  89%|████████▉ | 16/18 [02:30<00:18,  9.04s/it]

Adding requests:   0%|          | 0/64 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 64/64 instances.



Finalizing: 100%|██████████| 64/64 [00:00<00:00, 10757.64it/s]
paraphrase (batched):  94%|█████████▍| 17/18 [02:39<00:08,  8.97s/it]

Adding requests:   0%|          | 0/29 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/29 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 29/29 instances.



Finalizing: 100%|██████████| 29/29 [00:00<00:00, 7680.42it/s]
paraphrase (batched): 100%|██████████| 18/18 [02:47<00:00,  9.30s/it]

[OK] paraphrase: total=4582 | ok=2980 | mean=4.141 → /content/drive/MyDrive/404DNF/paraphrase_predicate_with_prom.csv



  ok = out_df["prom_ok"].fillna(False).sum()


{'template': '/content/drive/MyDrive/404DNF/template_predicate_with_prom.csv',
 'contextual': '/content/drive/MyDrive/404DNF/contextual_predicate_with_prom.csv',
 'paraphrase': '/content/drive/MyDrive/404DNF/paraphrase_predicate_with_prom.csv'}

In [11]:
for name, path in outputs.items():
    print("\n===", name, "===")
    print(path)
    display(pd.read_csv(path).head(3))


=== template ===
/content/drive/MyDrive/404DNF/template_predicate_with_prom.csv


Unnamed: 0,String,Type,label,predicate,prom_score_consistency,prom_feedback,prom_ok
0,Seen on product page 79 clicks today,Social Proof,1,Activity Notifications,3.0,"The given text (""Seen on product page 79 click...",True
1,Flash Sale ends in SHOP NOW,Urgency,1,Limited-time Messages,3.0,Upon reviewing the given text and the predicte...,True
2,Flash Sale ends in SHOP NOW,Urgency,1,Limited-time Messages,3.0,Upon reviewing the given text and the predicte...,True



=== contextual ===
/content/drive/MyDrive/404DNF/contextual_predicate_with_prom.csv


Unnamed: 0,String,Type,label,predicate,prom_score_consistency,prom_feedback,prom_ok
0,19 people buy this product per day,Social Proof,1,Activity Notifications,3.0,"Upon examining the response, it is evident tha...",True
1,Flash Sale ends in SHOP NOW,Urgency,1,Limited-time Messages,3.0,"The response selected the predicate ""Limited-t...",True
2,Flash Sale ends in SHOP NOW,Urgency,1,Limited-time Messages,3.0,"The response selected the predicate ""Limited-t...",True



=== paraphrase ===
/content/drive/MyDrive/404DNF/paraphrase_predicate_with_prom.csv


Unnamed: 0,String,Type,label,predicate,prom_score_consistency,prom_feedback,prom_ok
0,1 person is looking at this item.,Social Proof,1,Activity Notifications,5.0,"Upon review, the provided text demonstrates a ...",True
1,1 person is looking at this item.,Social Proof,1,Activity Notifications,5.0,"Upon review, the provided text demonstrates a ...",True
2,Flash Sale ends in SHOP NOW,Urgency,1,Limited-time Messages,3.0,The response accurately identifies the text's ...,True


#

In [12]:
import os, pandas as pd
from tqdm import tqdm

# 유연한 컬럼 매핑(대소문자/공백/동의어 방어)
def _pick_cols(df):
    cols = [c.strip() for c in df.columns]
    low = {c.lower(): c for c in cols}
    def get(cands):
        for k in cands:
            if k in low: return low[k]
        return None
    col_string = get(["string","text","sentence"])
    col_type   = get(["type","label_type","class","category"])
    col_pred   = get(["predicate","label","pred","subtype"])
    miss = [n for n,v in {"String":col_string,"Type":col_type,"Predicate":col_pred}.items() if v is None]
    if miss:
        raise AssertionError(f"need columns {miss}, got={list(df.columns)}")
    return col_string, col_type, col_pred

def _eval_unique(keys, build_payload_fn, batch_size=32, max_depth=5):
    # keys: list of (t, x, p) 문자열 튜플
    instrs, resps, refs, rubrics = [], [], [], []
    for t,x,p in keys:
        ref_answer, gt_pred_def, _, _ = build_reference_answer_for_string(x, t, p)
        instrs.append(make_instruction(t, x))
        resps.append(make_response(p))
        refs.append(f"Predicate definition: {gt_pred_def}\n{ref_answer}")
        rubrics.append(build_rubric(t, p))

    scores_map, fb_map, bad_idx = {}, {}, []

    for i in tqdm(range(0, len(instrs), batch_size), desc=f"retry(batched={batch_size})"):
        ib = instrs[i:i+batch_size]
        rb = resps[i:i+batch_size]
        refb = refs[i:i+batch_size]
        rubb = rubrics[i:i+batch_size]
        (fbs, scs), err_rel = judge_batch_with_retry(ib, rb, refb, rubb, max_depth=max_depth)
        for j,(fb,sc) in enumerate(zip(fbs, scs)):
            k = keys[i+j]
            scores_map[k] = sc
            fb_map[k] = fb
        bad_idx.extend([i+j for j in err_rel])
    bad_keys = [keys[j] for j in bad_idx]
    return scores_map, fb_map, bad_keys

def retry_fill_failed(
    in_out_csv_path: str,
    out_suffix: str = "_retry1.csv",
    batch_schedule=(32, 16, 8),   # 점점 작게 시도
    skip_nd: bool = True,
    max_depth: int = 5
):
    assert os.path.exists(in_out_csv_path), in_out_csv_path
    df = pd.read_csv(in_out_csv_path)

    # 필수 열 찾기 (원본 컬럼명 유지)
    col_string, col_type, col_pred = _pick_cols(df)

    # 재시도 대상: prom_ok != True (False or NaN) 또는 점수 NaN
    mask_fail = (df.get("prom_ok", False) != True) | (pd.to_numeric(df.get("prom_score_consistency", None), errors="coerce").isna())
    work = df[mask_fail].copy()
    if len(work) == 0:
        print("[INFO] 재평가할 실패 행이 없습니다.")
        return in_out_csv_path

    # ND는 건너뛰기(원한다면 False로 꺼서 ND도 평가 가능)
    if skip_nd:
        work = work[work[col_type].astype(str).str.lower() != "not dark pattern"]

    if len(work) == 0:
        print("[INFO] 실패 행은 있었지만 ND만 남아 재평가할 항목이 없습니다.")
        return in_out_csv_path

    # 유니크 (Type, String, Predicate) 기준으로 중복 제거
    uniq = work[[col_type, col_string, col_pred]].astype(str).drop_duplicates()
    keys = list(map(tuple, uniq.values.tolist()))  # [(t,x,p), ...]

    # 스케줄대로 점점 작은 배치로 재시도
    scores_map, fb_map, bad_keys_total = {}, {}, set()
    remaining = keys
    for bs in batch_schedule:
        if not remaining:
            break
        s_map, f_map, bad = _eval_unique(remaining, build_reference_answer_for_string, batch_size=bs, max_depth=max_depth)
        # 성공분/실패분 분리
        for k,v in s_map.items():
            if v is not None: scores_map[k] = v
        for k,v in f_map.items():
            fb_map[k] = v
        bad_keys_total.update(bad)
        # 다음 라운드에서는 점수 못 받은 것만 다시
        remaining = [k for k in remaining if (k not in scores_map) or (scores_map[k] is None)]

    # 결과 반영
    # 키열을 문자열로 만들어 map
    def _krow(r): return (str(r[col_type]), str(r[col_string]), str(r[col_pred]))
    sel = df.index[mask_fail]
    new_scores = []
    new_fb = []
    new_ok = []
    for idx in sel:
        k = _krow(df.loc[idx])
        sc = scores_map.get(k, pd.NA)
        fb = fb_map.get(k, pd.NA)
        new_scores.append(sc)
        new_fb.append(fb)
        new_ok.append(pd.notna(sc))

    df.loc[sel, "prom_score_consistency"] = new_scores
    df.loc[sel, "prom_feedback"]          = new_fb
    df.loc[sel, "prom_ok"]                = new_ok

    # 저장 경로: *_retry1.csv
    out_path = os.path.splitext(in_out_csv_path)[0] + out_suffix
    df.to_csv(out_path, index=False, encoding="utf-8")

    # 요약
    total = len(df)
    ok = df["prom_ok"].fillna(False).sum()
    mean_score = pd.to_numeric(df["prom_score_consistency"], errors="coerce").mean()
    print(f"[RETRY-OK] total={total} | ok={ok} | mean={mean_score:.3f} → {out_path}")

    # 남은 실패 개수도 알려주기
    remain = ((df["prom_ok"] != True) | pd.to_numeric(df["prom_score_consistency"], errors="coerce").isna()).sum()
    print(f"[RETRY] 남은 실패 건수: {remain}")

    return out_path

In [13]:
# outputs 딕셔너리에 앞서 평가된 파일 경로들이 들어있다고 가정
retry_outputs = {}
for name, merged_path in outputs.items():
    print(f"\n== RETRY: {name} ==")
    retry_outputs[name] = retry_fill_failed(
        in_out_csv_path=merged_path,
        out_suffix="_retry1.csv",   # 저장될 파일명 접미사
        batch_schedule=(32,16,8),   # 점점 줄여가며 재시도
        skip_nd=True,               # ND 샘플은 생략 (False로 바꾸면 ND도 재평가)
        max_depth=6                 # 분할 재시도 깊이
    )


== RETRY: template ==


retry(batched=32):   0%|          | 0/1 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.



Finalizing: 100%|██████████| 1/1 [00:00<00:00, 3912.60it/s]
retry(batched=32): 100%|██████████| 1/1 [00:04<00:00,  4.22s/it]
  df.loc[sel, "prom_score_consistency"] = new_scores
  ok = df["prom_ok"].fillna(False).sum()


[RETRY-OK] total=3222 | ok=1622 | mean=4.244 → /content/drive/MyDrive/404DNF/template_predicate_with_prom_retry1.csv
[RETRY] 남은 실패 건수: 1600

== RETRY: contextual ==


retry(batched=32):   0%|          | 0/1 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.



Finalizing: 100%|██████████| 1/1 [00:00<00:00, 3751.61it/s]
retry(batched=32): 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]

[RETRY-OK] total=3496 | ok=1896 | mean=4.222 → /content/drive/MyDrive/404DNF/contextual_predicate_with_prom_retry1.csv
[RETRY] 남은 실패 건수: 1600

== RETRY: paraphrase ==



  df.loc[sel, "prom_score_consistency"] = new_scores
  ok = df["prom_ok"].fillna(False).sum()
retry(batched=32):   0%|          | 0/1 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.



Finalizing: 100%|██████████| 1/1 [00:00<00:00, 3524.63it/s]
retry(batched=32): 100%|██████████| 1/1 [00:04<00:00,  4.77s/it]
  df.loc[sel, "prom_score_consistency"] = new_scores
  ok = df["prom_ok"].fillna(False).sum()


[RETRY-OK] total=4582 | ok=2982 | mean=4.141 → /content/drive/MyDrive/404DNF/paraphrase_predicate_with_prom_retry1.csv
[RETRY] 남은 실패 건수: 1600


In [14]:
import pandas as pd

path = "/content/drive/MyDrive/404DNF/template_predicate_with_prom_retry1.csv"  # 파일 바꿔가며 확인
df = pd.read_csv(path)

# Type/Predicate 컬럼명 자동 탐색
def pick(df):
    low = {c.lower(): c for c in df.columns}
    g = lambda ks: next((low[k] for k in ks if k in low), None)
    col_type = g(["type","label_type","class","category"])
    col_pred = g(["predicate","label","pred","subtype"])
    return col_type, col_pred

col_type, col_pred = pick(df)

# 1) ND(=Not Dark Pattern) 총 개수
nd_mask = df[col_type].astype(str).str.lower().eq("not dark pattern")
nd_count = int(nd_mask.sum())

# 2) 실패 집계(현재 로직): prom_ok != True 또는 점수 NaN
fail_mask = (df.get("prom_ok", False) != True) | pd.to_numeric(df.get("prom_score_consistency"), errors="coerce").isna()
fail_count = int(fail_mask.sum())

# 3) 실패 중 ND가 차지하는 개수
nd_as_fail = int((nd_mask & fail_mask).sum())

# 4) Non-ND 실패(=진짜 모델 실패) 개수
non_nd_fail = fail_count - nd_as_fail

print(f"[{path.split('/')[-1]}]")
print("ND rows               :", nd_count)
print("Failures (current def):", fail_count)
print(" ┗ ND counted as fail :", nd_as_fail)
print(" ┗ Non-ND failures    :", non_nd_fail)

[template_predicate_with_prom_retry1.csv]
ND rows               : 1600
Failures (current def): 1600
 ┗ ND counted as fail : 1600
 ┗ Non-ND failures    : 0


In [15]:
import pandas as pd

path = "/content/drive/MyDrive/404DNF/paraphrase_predicate_with_prom_retry1.csv"  # 파일 바꿔가며 확인
df = pd.read_csv(path)

# Type/Predicate 컬럼명 자동 탐색
def pick(df):
    low = {c.lower(): c for c in df.columns}
    g = lambda ks: next((low[k] for k in ks if k in low), None)
    col_type = g(["type","label_type","class","category"])
    col_pred = g(["predicate","label","pred","subtype"])
    return col_type, col_pred

col_type, col_pred = pick(df)

# 1) ND(=Not Dark Pattern) 총 개수
nd_mask = df[col_type].astype(str).str.lower().eq("not dark pattern")
nd_count = int(nd_mask.sum())

# 2) 실패 집계(현재 로직): prom_ok != True 또는 점수 NaN
fail_mask = (df.get("prom_ok", False) != True) | pd.to_numeric(df.get("prom_score_consistency"), errors="coerce").isna()
fail_count = int(fail_mask.sum())

# 3) 실패 중 ND가 차지하는 개수
nd_as_fail = int((nd_mask & fail_mask).sum())

# 4) Non-ND 실패(=진짜 모델 실패) 개수
non_nd_fail = fail_count - nd_as_fail

print(f"[{path.split('/')[-1]}]")
print("ND rows               :", nd_count)
print("Failures (current def):", fail_count)
print(" ┗ ND counted as fail :", nd_as_fail)
print(" ┗ Non-ND failures    :", non_nd_fail)

[paraphrase_predicate_with_prom_retry1.csv]
ND rows               : 1600
Failures (current def): 1600
 ┗ ND counted as fail : 1600
 ┗ Non-ND failures    : 0


In [16]:
import pandas as pd

path = "/content/drive/MyDrive/404DNF/contextual_predicate_with_prom_retry1.csv"  # 파일 바꿔가며 확인
df = pd.read_csv(path)

# Type/Predicate 컬럼명 자동 탐색
def pick(df):
    low = {c.lower(): c for c in df.columns}
    g = lambda ks: next((low[k] for k in ks if k in low), None)
    col_type = g(["type","label_type","class","category"])
    col_pred = g(["predicate","label","pred","subtype"])
    return col_type, col_pred

col_type, col_pred = pick(df)

# 1) ND(=Not Dark Pattern) 총 개수
nd_mask = df[col_type].astype(str).str.lower().eq("not dark pattern")
nd_count = int(nd_mask.sum())

# 2) 실패 집계(현재 로직): prom_ok != True 또는 점수 NaN
fail_mask = (df.get("prom_ok", False) != True) | pd.to_numeric(df.get("prom_score_consistency"), errors="coerce").isna()
fail_count = int(fail_mask.sum())

# 3) 실패 중 ND가 차지하는 개수
nd_as_fail = int((nd_mask & fail_mask).sum())

# 4) Non-ND 실패(=진짜 모델 실패) 개수
non_nd_fail = fail_count - nd_as_fail

print(f"[{path.split('/')[-1]}]")
print("ND rows               :", nd_count)
print("Failures (current def):", fail_count)
print(" ┗ ND counted as fail :", nd_as_fail)
print(" ┗ Non-ND failures    :", non_nd_fail)

[contextual_predicate_with_prom_retry1.csv]
ND rows               : 1600
Failures (current def): 1600
 ┗ ND counted as fail : 1600
 ┗ Non-ND failures    : 0


# 1 평가 점수 몇개인지

In [19]:
import pandas as pd

# CSV 경로
csv_path = "/content/drive/MyDrive/404DNF/template_predicate_with_prom_retry1.csv"

df = pd.read_csv(csv_path)

# 점수 숫자 변환
scores = pd.to_numeric(df["prom_score_consistency"], errors="coerce")

# 1점인 행만 필터링
df_1 = df[scores == 1]

# 1점 개수 및 비율
count_1 = len(df_1)
ratio_1 = count_1 / len(df) * 100

print(f"총 행 수: {len(df)}")
print(f"1점 개수: {count_1} ({ratio_1:.2f}%)")

# Type별 집계
type_counts = df_1["Type"].value_counts()
print("\n[1점 행의 Type 분포]")
print(type_counts)

# Predicate별 집계
pred_counts = df_1["predicate"].value_counts()
print("\n[1점 행의 Predicate 분포]")
print(pred_counts)

# Type + Predicate 조합별 집계
combo_counts = df_1.groupby(["Type", "predicate"]).size().sort_values(ascending=False)
print("\n[1점 행의 Type-Predicate 조합 분포]")
print(combo_counts)

총 행 수: 3222
1점 개수: 137 (4.25%)

[1점 행의 Type 분포]
Type
Social Proof    64
Misdirection    59
Scarcity        11
Urgency          3
Name: count, dtype: int64

[1점 행의 Predicate 분포]
predicate
Activity Notifications              60
Confirmshaming                      29
Pressured Selling                   16
Trick Questions                     14
High-demand Messages                 8
Testimonials of Uncertain Origin     4
Low-stock Messages                   3
Countdown Timers                     2
Limited-time Messages                1
Name: count, dtype: int64

[1점 행의 Type-Predicate 조합 분포]
Type          predicate                       
Social Proof  Activity Notifications              60
Misdirection  Confirmshaming                      29
              Pressured Selling                   16
              Trick Questions                     14
Scarcity      High-demand Messages                 8
Social Proof  Testimonials of Uncertain Origin     4
Scarcity      Low-stock Messages         