template.csv predicate mapping

In [14]:
# === Dark Pattern predicate labeling (Notebook one-shot; keeps ALL rows) ===
# Prereq: pip install openai python-dotenv pandas
# Uses: .env -> OPENAI_API_KEY
import os, re, json, time
import pandas as pd
from dotenv import load_dotenv

# -------- Settings --------
load_dotenv()  # read OPENAI_API_KEY from .env
INPUT_CSV = "../data/processed/template.csv"   # <- change if needed
MODEL     = "gpt-4o"                        # default model
OUT_PATH  = os.path.join(os.path.dirname(INPUT_CSV), "../data/processed/template_merged_output.csv")

# Include long type definitions in prompts? (True=better quality, more tokens; False=lean)
INCLUDE_DEFINITIONS = True

# -------- Taxonomy (refined) --------
CANONICAL_TYPES = ["Urgency","Misdirection","Social Proof","Scarcity","Not Dark Pattern"]

PREDICATES = {
    "Urgency":       ["Countdown Timers", "Limited-time Messages"],
    "Misdirection":  ["Confirmshaming", "Trick Questions", "Pressured Selling"],
    "Social Proof":  ["Activity Notifications", "Testimonials of Uncertain Origin"],
    "Scarcity":      ["Low-stock Messages", "High-demand Messages"],
    "Not Dark Pattern": ["None"],
}

DEFINITIONS = {
    "Urgency": {
        "Definition": (
            "Urgency dark patterns pressure users by limiting available time, which reduces their ability to "
            "carefully evaluate information and may cause stress or anxiety. This pressure can be exploited to push "
            "users into actions not in their best interest."
        ),
        "Predicates": {
            "Countdown Timers": "A visible timer showing that a deal or discount will expire soon.",
            "Limited-time Messages": "Claims that a deal or sale will end 'soon' without providing a clear deadline.",
        },
    },
    "Misdirection": {
        "Definition": (
            "Misdirection manipulates user attention by distracting or confusing them. It emphasizes certain options "
            "while hiding or downplaying others, leading users to make unintended choices."
        ),
        "Predicates": {
            "Confirmshaming": "Uses shame or emotional wording to discourage a certain choice.",
            "Trick Questions": "Uses confusing or ambiguous wording to steer users toward unintended choices.",
            "Pressured Selling": "Preselects or pressures users to accept more expensive product options or add-ons.",
        },
    },
    "Social Proof": {
        "Definition": (
            "Social Proof exploits social cues to influence behavior. It creates the perception that others are "
            "already acting, pressuring users to conform and undermining independent decision-making."
        ),
        "Predicates": {
            "Activity Notifications": "Real or simulated messages like '5 people just bought this' to induce quick action; often exaggerated/fabricated.",
            "Testimonials of Uncertain Origin": "Reviews/ratings/endorsements without reliable sources, designed to build false trust.",
        },
    },
    "Scarcity": {
        "Definition": (
            "Scarcity creates a false or exaggerated sense of limited availability, exploiting fear of missing out "
            "(FOMO) to push rushed decisions."
        ),
        "Predicates": {
            "Low-stock Messages": "Warnings like 'Only 2 left in stock', often exaggerated or fabricated.",
            "High-demand Messages": "Claims like '50 people are viewing this now', creating artificial competition/urgency.",
        },
    },
    "Not Dark Pattern": {
        "Definition": "Content that does not represent any dark pattern in this taxonomy.",
        "Predicates": {"None": "Always used for this Type."},
    },
}

# -------- Prompt builders --------
SYSTEM_PROMPT = (
    "ROLE: Dark-Pattern Expert Annotator.\n"
    "Return ONLY a compact JSON object with keys: predicate, confidence, rationale.\n"
    "Pick exactly ONE predicate from the allowed list for the given Type.\n"
    "If Type is 'Not Dark Pattern', predicate must be 'None'. Rationale <= 200 chars."
)

def build_user_prompt(type_name: str, text: str) -> str:
    lines = []
    if INCLUDE_DEFINITIONS:
        info = DEFINITIONS.get(type_name, {})
        if "Definition" in info:
            lines.append(f"- Definition: {info['Definition']}")
        preds = info.get("Predicates", {})
        if preds:
            lines.append("- Predicate definitions:")
            for k, v in preds.items():
                lines.append(f"  * {k}: {v}")
    allowed_list = "\n".join(f"- {p}" for p in PREDICATES[type_name])
    return f"""Assign exactly ONE predicate for the text, constrained by the Type.

Type: {type_name}

{'\n'.join(lines) if lines else ''}

Allowed predicates (choose exactly ONE; return the exact string):
{allowed_list}

Text:
\"\"\"{text}\"\"\"

Return JSON only:
{{"predicate": "...", "confidence": 0.0-1.0, "rationale": "..."}}"""

# -------- OpenAI call with robust fallbacks (older SDKs ok) --------
def _coerce_result(data: dict, allowed: list) -> dict:
    pred = str(data.get("predicate", "")).strip()
    if pred == "Pressured Seliing":  # typo guard
        pred = "Pressured Selling"
    if pred not in allowed:
        pred = "None" if "None" in allowed else allowed[0]

    conf = data.get("confidence", None)
    try:
        conf = float(conf)
    except Exception:
        m = re.search(r"confidence\s*[:=]\s*([01](?:\.\d+)?)", json.dumps(data), flags=re.I)
        conf = float(m.group(1)) if m else 0.5
    conf = min(1.0, max(0.0, conf))

    rat = str(data.get("rationale", "")).strip() or "Auto-filled rationale."
    return {"predicate": pred, "confidence": conf, "rationale": rat[:300]}

def call_openai(type_name: str, text: str, model: str) -> str:
    """Return a predicate constrained to the allowed list for the given type."""
    from openai import OpenAI
    client = OpenAI()

    allowed = PREDICATES[type_name]
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_user_prompt(type_name, text)},
    ]

    json_schema = {
        "type": "object",
        "properties": {
            "predicate": {"type": "string", "enum": allowed},
            "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
            "rationale": {"type": "string", "maxLength": 200},
        },
        "required": ["predicate", "confidence", "rationale"],
        "additionalProperties": False,
    }

    backoff, max_backoff = 1.0, 30.0
    while True:
        try:
            # 1) tools (newer function-calling)
            try:
                resp = client.chat.completions.create(
                    model=model, messages=messages, temperature=0.0,
                    tools=[{"type":"function","function":{"name":"set_predicate","description":"label","parameters":json_schema}}],
                    tool_choice={"type":"function","function":{"name":"set_predicate"}},
                )
                msg = resp.choices[0].message
                if getattr(msg, "tool_calls", None):
                    args = json.loads(msg.tool_calls[0].function.arguments)
                    return _coerce_result(args, allowed)["predicate"]
            except TypeError:
                pass  # fallback

            # 2) functions (older function-calling)
            try:
                resp = client.chat.completions.create(
                    model=model, messages=messages, temperature=0.0,
                    functions=[{"name":"set_predicate","description":"label","parameters":json_schema}],
                    function_call={"name":"set_predicate"},
                )
                msg = resp.choices[0].message
                if getattr(msg, "function_call", None):
                    args = json.loads(msg.function_call.arguments)
                    return _coerce_result(args, allowed)["predicate"]
            except TypeError:
                pass  # fallback

            # 3) plain completion → parse JSON
            resp = client.chat.completions.create(model=model, messages=messages, temperature=0.0)
            text_out = resp.choices[0].message.content or ""
            text_out = re.sub(r"^```(?:json)?\s*|\s*```$", "", text_out, flags=re.I|re.M).strip()
            try:
                data = json.loads(text_out)
            except Exception:
                m = re.search(r"predicate\s*[:=]\s*([^\n\r]+)", text_out, flags=re.I)
                data = {"predicate": (m.group(1).strip() if m else ""), "confidence": 0.5, "rationale": text_out[:200]}
            return _coerce_result(data, allowed)["predicate"]

        except Exception as e:
            if any(s in str(e).lower() for s in ["rate","timeout","overloaded","temporarily","503","502"]):
                time.sleep(backoff); backoff = min(max_backoff, backoff*2); continue
            raise

# -------- Main (keeps ALL rows; ND is fixed to 'None'; cache reuse) --------
df = pd.read_csv(INPUT_CSV)
df = df[df["Type"].isin(CANONICAL_TYPES)].copy()  # safety
df = df.reset_index(drop=True)

cache = {}  # (Type, String) -> predicate
preds = []

for _, row in df.iterrows():
    t = str(row["Type"]).strip()
    s = str(row["String"]).strip()

    if t == "Not Dark Pattern":
        preds.append("None")  # no API call
        continue

    key = (t, s)
    if key in cache:
        preds.append(cache[key])
        continue

    pred = call_openai(t, s, MODEL)
    if pred not in PREDICATES[t]:
        pred = PREDICATES[t][0]  # hard guard (never changes Type)
    cache[key] = pred
    preds.append(pred)

out = df.copy()
out["predicate"] = preds
out = out[["String","Type","label","predicate"]]

os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
out.to_csv(OUT_PATH, index=False, encoding="utf-8")

print(f"[OK] saved → {OUT_PATH}")
print("Counts by Type:\n", out["Type"].value_counts())
print("\nCrosstab (Type × predicate):\n", pd.crosstab(out["Type"], out["predicate"]))


[OK] saved → ../data/processed/../data/processed/template_merged_output.csv
Counts by Type:
 Type
Not Dark Pattern    1000
Social Proof         500
Urgency              500
Scarcity             500
Misdirection         500
Name: count, dtype: int64

Crosstab (Type × predicate):
 predicate         Activity Notifications  Confirmshaming  Countdown Timers  \
Type                                                                         
Misdirection                           0             298                 0   
Not Dark Pattern                       0               0                 0   
Scarcity                               0               0                 0   
Social Proof                         499               0                 0   
Urgency                                0               0               288   

predicate         High-demand Messages  Limited-time Messages  \
Type                                                            
Misdirection                         0     

In [16]:
import pandas as pd

df = pd.read_csv("../data/processed/template_merged_output.csv")

# predicate 값별 개수
counts = df["predicate"].value_counts()

print(counts)


predicate
Activity Notifications              499
Low-stock Messages                  462
Confirmshaming                      298
Countdown Timers                    288
Limited-time Messages               212
Pressured Selling                   140
Trick Questions                      62
High-demand Messages                 38
Testimonials of Uncertain Origin      1
Name: count, dtype: int64


contextual.csv predicate mapping

In [17]:
# === Dark Pattern predicate labeling (Notebook one-shot; keeps ALL rows) ===
# Prereq: pip install openai python-dotenv pandas
# Uses: .env -> OPENAI_API_KEY
import os, re, json, time
import pandas as pd
from dotenv import load_dotenv

# -------- Settings --------
load_dotenv()  # read OPENAI_API_KEY from .env
INPUT_CSV = "../data/processed/contextual.csv"   # <- input 변경
MODEL     = "gpt-4o"                             # default model
OUT_PATH  = os.path.join(os.path.dirname(INPUT_CSV), "contextual_predicate.csv")  # <- output 변경

# Include long type definitions in prompts? (True=better quality, more tokens; False=lean)
INCLUDE_DEFINITIONS = True

# -------- Taxonomy (refined) --------
CANONICAL_TYPES = ["Urgency","Misdirection","Social Proof","Scarcity","Not Dark Pattern"]

PREDICATES = {
    "Urgency":       ["Countdown Timers", "Limited-time Messages"],
    "Misdirection":  ["Confirmshaming", "Trick Questions", "Pressured Selling"],
    "Social Proof":  ["Activity Notifications", "Testimonials of Uncertain Origin"],
    "Scarcity":      ["Low-stock Messages", "High-demand Messages"],
    "Not Dark Pattern": ["None"],
}

DEFINITIONS = {
    "Urgency": {
        "Definition": (
            "Urgency dark patterns pressure users by limiting available time, which reduces their ability to "
            "carefully evaluate information and may cause stress or anxiety. This pressure can be exploited to push "
            "users into actions not in their best interest."
        ),
        "Predicates": {
            "Countdown Timers": "A visible timer showing that a deal or discount will expire soon.",
            "Limited-time Messages": "Claims that a deal or sale will end 'soon' without providing a clear deadline.",
        },
    },
    "Misdirection": {
        "Definition": (
            "Misdirection manipulates user attention by distracting or confusing them. It emphasizes certain options "
            "while hiding or downplaying others, leading users to make unintended choices."
        ),
        "Predicates": {
            "Confirmshaming": "Uses shame or emotional wording to discourage a certain choice.",
            "Trick Questions": "Uses confusing or ambiguous wording to steer users toward unintended choices.",
            "Pressured Selling": "Preselects or pressures users to accept more expensive product options or add-ons.",
        },
    },
    "Social Proof": {
        "Definition": (
            "Social Proof exploits social cues to influence behavior. It creates the perception that others are "
            "already acting, pressuring users to conform and undermining independent decision-making."
        ),
        "Predicates": {
            "Activity Notifications": "Real or simulated messages like '5 people just bought this' to induce quick action; often exaggerated/fabricated.",
            "Testimonials of Uncertain Origin": "Reviews/ratings/endorsements without reliable sources, designed to build false trust.",
        },
    },
    "Scarcity": {
        "Definition": (
            "Scarcity creates a false or exaggerated sense of limited availability, exploiting fear of missing out "
            "(FOMO) to push rushed decisions."
        ),
        "Predicates": {
            "Low-stock Messages": "Warnings like 'Only 2 left in stock', often exaggerated or fabricated.",
            "High-demand Messages": "Claims like '50 people are viewing this now', creating artificial competition/urgency.",
        },
    },
    "Not Dark Pattern": {
        "Definition": "Content that does not represent any dark pattern in this taxonomy.",
        "Predicates": {"None": "Always used for this Type."},
    },
}

# -------- Prompt builders --------
SYSTEM_PROMPT = (
    "ROLE: Dark-Pattern Expert Annotator.\n"
    "Return ONLY a compact JSON object with keys: predicate, confidence, rationale.\n"
    "Pick exactly ONE predicate from the allowed list for the given Type.\n"
    "If Type is 'Not Dark Pattern', predicate must be 'None'. Rationale <= 200 chars."
)

def build_user_prompt(type_name: str, text: str) -> str:
    lines = []
    if INCLUDE_DEFINITIONS:
        info = DEFINITIONS.get(type_name, {})
        if "Definition" in info:
            lines.append(f"- Definition: {info['Definition']}")
        preds = info.get("Predicates", {})
        if preds:
            lines.append("- Predicate definitions:")
            for k, v in preds.items():
                lines.append(f"  * {k}: {v}")
    allowed_list = "\n".join(f"- {p}" for p in PREDICATES[type_name])
    return f"""Assign exactly ONE predicate for the text, constrained by the Type.

Type: {type_name}

{'\n'.join(lines) if lines else ''}

Allowed predicates (choose exactly ONE; return the exact string):
{allowed_list}

Text:
\"\"\"{text}\"\"\"

Return JSON only:
{{"predicate": "...", "confidence": 0.0-1.0, "rationale": "..."}}"""

# -------- OpenAI call with robust fallbacks (older SDKs ok) --------
def _coerce_result(data: dict, allowed: list) -> dict:
    pred = str(data.get("predicate", "")).strip()
    if pred == "Pressured Seliing":  # typo guard
        pred = "Pressured Selling"
    if pred not in allowed:
        pred = "None" if "None" in allowed else allowed[0]

    conf = data.get("confidence", None)
    try:
        conf = float(conf)
    except Exception:
        m = re.search(r"confidence\s*[:=]\s*([01](?:\.\d+)?)", json.dumps(data), flags=re.I)
        conf = float(m.group(1)) if m else 0.5
    conf = min(1.0, max(0.0, conf))

    rat = str(data.get("rationale", "")).strip() or "Auto-filled rationale."
    return {"predicate": pred, "confidence": conf, "rationale": rat[:300]}

def call_openai(type_name: str, text: str, model: str) -> str:
    """Return a predicate constrained to the allowed list for the given type."""
    from openai import OpenAI
    client = OpenAI()

    allowed = PREDICATES[type_name]
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_user_prompt(type_name, text)},
    ]

    json_schema = {
        "type": "object",
        "properties": {
            "predicate": {"type": "string", "enum": allowed},
            "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
            "rationale": {"type": "string", "maxLength": 200},
        },
        "required": ["predicate", "confidence", "rationale"],
        "additionalProperties": False,
    }

    backoff, max_backoff = 1.0, 30.0
    while True:
        try:
            # 1) tools (newer function-calling)
            try:
                resp = client.chat.completions.create(
                    model=model, messages=messages, temperature=0.0,
                    tools=[{"type":"function","function":{"name":"set_predicate","description":"label","parameters":json_schema}}],
                    tool_choice={"type":"function","function":{"name":"set_predicate"}},
                )
                msg = resp.choices[0].message
                if getattr(msg, "tool_calls", None):
                    args = json.loads(msg.tool_calls[0].function.arguments)
                    return _coerce_result(args, allowed)["predicate"]
            except TypeError:
                pass  # fallback

            # 2) functions (older function-calling)
            try:
                resp = client.chat.completions.create(
                    model=model, messages=messages, temperature=0.0,
                    functions=[{"name":"set_predicate","description":"label","parameters":json_schema}],
                    function_call={"name":"set_predicate"},
                )
                msg = resp.choices[0].message
                if getattr(msg, "function_call", None):
                    args = json.loads(msg.function_call.arguments)
                    return _coerce_result(args, allowed)["predicate"]
            except TypeError:
                pass  # fallback

            # 3) plain completion → parse JSON
            resp = client.chat.completions.create(model=model, messages=messages, temperature=0.0)
            text_out = resp.choices[0].message.content or ""
            text_out = re.sub(r"^```(?:json)?\s*|\s*```$", "", text_out, flags=re.I|re.M).strip()
            try:
                data = json.loads(text_out)
            except Exception:
                m = re.search(r"predicate\s*[:=]\s*([^\n\r]+)", text_out, flags=re.I)
                data = {"predicate": (m.group(1).strip() if m else ""), "confidence": 0.5, "rationale": text_out[:200]}
            return _coerce_result(data, allowed)["predicate"]

        except Exception as e:
            if any(s in str(e).lower() for s in ["rate","timeout","overloaded","temporarily","503","502"]):
                time.sleep(backoff); backoff = min(max_backoff, backoff*2); continue
            raise

# -------- Main (keeps ALL rows; ND is fixed to 'None'; cache reuse) --------
df = pd.read_csv(INPUT_CSV)
df = df[df["Type"].isin(CANONICAL_TYPES)].copy()  # safety
df = df.reset_index(drop=True)

cache = {}  # (Type, String) -> predicate
preds = []

for _, row in df.iterrows():
    t = str(row["Type"]).strip()
    s = str(row["String"]).strip()

    if t == "Not Dark Pattern":
        preds.append("None")  # no API call
        continue

    key = (t, s)
    if key in cache:
        preds.append(cache[key])
        continue

    pred = call_openai(t, s, MODEL)
    if pred not in PREDICATES[t]:
        pred = PREDICATES[t][0]  # hard guard (never changes Type)
    cache[key] = pred
    preds.append(pred)

# ✅ 최종 CSV에는 'predicate' 한 컬럼만 저장
out = pd.DataFrame({"predicate": preds})

os.makedirs(os.path.dirname(OUT_PATH) or ".", exist_ok=True)
out.to_csv(OUT_PATH, index=False, encoding="utf-8")

print(f"[OK] saved → {OUT_PATH}")
print("Counts by predicate:\n", out["predicate"].value_counts(dropna=False))


[OK] saved → ../data/processed/contextual_predicate.csv
Counts by predicate:
 predicate
None                                1000
Activity Notifications               499
Low-stock Messages                   463
Confirmshaming                       339
Countdown Timers                     296
Limited-time Messages                204
Pressured Selling                    114
Trick Questions                       47
High-demand Messages                  37
Testimonials of Uncertain Origin       1
Name: count, dtype: int64


paraphrase predicate mapping

In [19]:
# === Dark Pattern predicate labeling (Notebook one-shot; keeps ALL rows) ===
# Prereq: pip install openai python-dotenv pandas
# Uses: .env -> OPENAI_API_KEY
import os, re, json, time
import pandas as pd
from dotenv import load_dotenv

# -------- Settings --------
load_dotenv()  # read OPENAI_API_KEY from .env
INPUT_CSV = "../data/processed/paraphrase.csv"   # <- input 변경
MODEL     = "gpt-4o"                             # default model
OUT_PATH  = os.path.join(os.path.dirname(INPUT_CSV), "paraphrase_predicate.csv")  # <- output 변경

# Include long type definitions in prompts? (True=better quality, more tokens; False=lean)
INCLUDE_DEFINITIONS = True

# -------- Taxonomy (refined) --------
CANONICAL_TYPES = ["Urgency","Misdirection","Social Proof","Scarcity","Not Dark Pattern"]

PREDICATES = {
    "Urgency":       ["Countdown Timers", "Limited-time Messages"],
    "Misdirection":  ["Confirmshaming", "Trick Questions", "Pressured Selling"],
    "Social Proof":  ["Activity Notifications", "Testimonials of Uncertain Origin"],
    "Scarcity":      ["Low-stock Messages", "High-demand Messages"],
    "Not Dark Pattern": ["None"],
}

DEFINITIONS = {
    "Urgency": {
        "Definition": (
            "Urgency dark patterns pressure users by limiting available time, which reduces their ability to "
            "carefully evaluate information and may cause stress or anxiety. This pressure can be exploited to push "
            "users into actions not in their best interest."
        ),
        "Predicates": {
            "Countdown Timers": "A visible timer showing that a deal or discount will expire soon.",
            "Limited-time Messages": "Claims that a deal or sale will end 'soon' without providing a clear deadline.",
        },
    },
    "Misdirection": {
        "Definition": (
            "Misdirection manipulates user attention by distracting or confusing them. It emphasizes certain options "
            "while hiding or downplaying others, leading users to make unintended choices."
        ),
        "Predicates": {
            "Confirmshaming": "Uses shame or emotional wording to discourage a certain choice.",
            "Trick Questions": "Uses confusing or ambiguous wording to steer users toward unintended choices.",
            "Pressured Selling": "Preselects or pressures users to accept more expensive product options or add-ons.",
        },
    },
    "Social Proof": {
        "Definition": (
            "Social Proof exploits social cues to influence behavior. It creates the perception that others are "
            "already acting, pressuring users to conform and undermining independent decision-making."
        ),
        "Predicates": {
            "Activity Notifications": "Real or simulated messages like '5 people just bought this' to induce quick action; often exaggerated/fabricated.",
            "Testimonials of Uncertain Origin": "Reviews/ratings/endorsements without reliable sources, designed to build false trust.",
        },
    },
    "Scarcity": {
        "Definition": (
            "Scarcity creates a false or exaggerated sense of limited availability, exploiting fear of missing out "
            "(FOMO) to push rushed decisions."
        ),
        "Predicates": {
            "Low-stock Messages": "Warnings like 'Only 2 left in stock', often exaggerated or fabricated.",
            "High-demand Messages": "Claims like '50 people are viewing this now', creating artificial competition/urgency.",
        },
    },
    "Not Dark Pattern": {
        "Definition": "Content that does not represent any dark pattern in this taxonomy.",
        "Predicates": {"None": "Always used for this Type."},
    },
}

# -------- Prompt builders --------
SYSTEM_PROMPT = (
    "ROLE: Dark-Pattern Expert Annotator.\n"
    "Return ONLY a compact JSON object with keys: predicate, confidence, rationale.\n"
    "Pick exactly ONE predicate from the allowed list for the given Type.\n"
    "If Type is 'Not Dark Pattern', predicate must be 'None'. Rationale <= 200 chars."
)

def build_user_prompt(type_name: str, text: str) -> str:
    lines = []
    if INCLUDE_DEFINITIONS:
        info = DEFINITIONS.get(type_name, {})
        if "Definition" in info:
            lines.append(f"- Definition: {info['Definition']}")
        preds = info.get("Predicates", {})
        if preds:
            lines.append("- Predicate definitions:")
            for k, v in preds.items():
                lines.append(f"  * {k}: {v}")
    allowed_list = "\n".join(f"- {p}" for p in PREDICATES[type_name])
    return f"""Assign exactly ONE predicate for the text, constrained by the Type.

Type: {type_name}

{'\n'.join(lines) if lines else ''}

Allowed predicates (choose exactly ONE; return the exact string):
{allowed_list}

Text:
\"\"\"{text}\"\"\"

Return JSON only:
{{"predicate": "...", "confidence": 0.0-1.0, "rationale": "..."}}"""

# -------- OpenAI call with robust fallbacks (older SDKs ok) --------
def _coerce_result(data: dict, allowed: list) -> dict:
    pred = str(data.get("predicate", "")).strip()
    if pred == "Pressured Seliing":  # typo guard
        pred = "Pressured Selling"
    if pred not in allowed:
        pred = "None" if "None" in allowed else allowed[0]

    conf = data.get("confidence", None)
    try:
        conf = float(conf)
    except Exception:
        m = re.search(r"confidence\s*[:=]\s*([01](?:\.\d+)?)", json.dumps(data), flags=re.I)
        conf = float(m.group(1)) if m else 0.5
    conf = min(1.0, max(0.0, conf))

    rat = str(data.get("rationale", "")).strip() or "Auto-filled rationale."
    return {"predicate": pred, "confidence": conf, "rationale": rat[:300]}

def call_openai(type_name: str, text: str, model: str) -> str:
    """Return a predicate constrained to the allowed list for the given type."""
    from openai import OpenAI
    client = OpenAI()

    allowed = PREDICATES[type_name]
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_user_prompt(type_name, text)},
    ]

    json_schema = {
        "type": "object",
        "properties": {
            "predicate": {"type": "string", "enum": allowed},
            "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
            "rationale": {"type": "string", "maxLength": 200},
        },
        "required": ["predicate", "confidence", "rationale"],
        "additionalProperties": False,
    }

    backoff, max_backoff = 1.0, 30.0
    while True:
        try:
            # 1) tools (newer function-calling)
            try:
                resp = client.chat.completions.create(
                    model=model, messages=messages, temperature=0.0,
                    tools=[{"type":"function","function":{"name":"set_predicate","description":"label","parameters":json_schema}}],
                    tool_choice={"type":"function","function":{"name":"set_predicate"}},
                )
                msg = resp.choices[0].message
                if getattr(msg, "tool_calls", None):
                    args = json.loads(msg.tool_calls[0].function.arguments)
                    return _coerce_result(args, allowed)["predicate"]
            except TypeError:
                pass  # fallback

            # 2) functions (older function-calling)
            try:
                resp = client.chat.completions.create(
                    model=model, messages=messages, temperature=0.0,
                    functions=[{"name":"set_predicate","description":"label","parameters":json_schema}],
                    function_call={"name":"set_predicate"},
                )
                msg = resp.choices[0].message
                if getattr(msg, "function_call", None):
                    args = json.loads(msg.function_call.arguments)
                    return _coerce_result(args, allowed)["predicate"]
            except TypeError:
                pass  # fallback

            # 3) plain completion → parse JSON
            resp = client.chat.completions.create(model=model, messages=messages, temperature=0.0)
            text_out = resp.choices[0].message.content or ""
            text_out = re.sub(r"^```(?:json)?\s*|\s*```$", "", text_out, flags=re.I|re.M).strip()
            try:
                data = json.loads(text_out)
            except Exception:
                m = re.search(r"predicate\s*[:=]\s*([^\n\r]+)", text_out, flags=re.I)
                data = {"predicate": (m.group(1).strip() if m else ""), "confidence": 0.5, "rationale": text_out[:200]}
            return _coerce_result(data, allowed)["predicate"]

        except Exception as e:
            if any(s in str(e).lower() for s in ["rate","timeout","overloaded","temporarily","503","502"]):
                time.sleep(backoff); backoff = min(max_backoff, backoff*2); continue
            raise

# -------- Main (keeps ALL rows; ND is fixed to 'None'; cache reuse) --------
df = pd.read_csv(INPUT_CSV)
df = df[df["Type"].isin(CANONICAL_TYPES)].copy()  # safety
df = df.reset_index(drop=True)

cache = {}  # (Type, String) -> predicate
preds = []

for _, row in df.iterrows():
    t = str(row["Type"]).strip()
    s = str(row["String"]).strip()

    if t == "Not Dark Pattern":
        preds.append("None")  # no API call
        continue

    key = (t, s)
    if key in cache:
        preds.append(cache[key])
        continue

    pred = call_openai(t, s, MODEL)
    if pred not in PREDICATES[t]:
        pred = PREDICATES[t][0]  # hard guard (never changes Type)
    cache[key] = pred
    preds.append(pred)

# ✅ 최종 CSV에는 'predicate' 한 컬럼만 저장
out = pd.DataFrame({"predicate": preds})

os.makedirs(os.path.dirname(OUT_PATH) or ".", exist_ok=True)
out.to_csv(OUT_PATH, index=False, encoding="utf-8")

print(f"[OK] saved → {OUT_PATH}")
print("Counts by predicate:\n", out["predicate"].value_counts(dropna=False))


[OK] saved → ../data/processed/paraphrase_predicate.csv
Counts by predicate:
 predicate
None                                1000
Activity Notifications               499
Low-stock Messages                   462
Confirmshaming                       341
Countdown Timers                     280
Limited-time Messages                220
Pressured Selling                    103
Trick Questions                       56
High-demand Messages                  38
Testimonials of Uncertain Origin       1
Name: count, dtype: int64
