# GPT_prompt

In [None]:
from pathlib import Path
import pandas as pd
from openai import OpenAI
import json 

# ======================================================
# 0. OpenAI client (API key from file)
# ======================================================

# IMPORTANT: Ensure 'api_key.txt' contains your valid OpenAI API key
try:
    # Read the key from a local text file and drop surrounding whitespace/newlines.
    api_key = Path("api_key.txt").read_text(encoding="utf-8").strip()
except FileNotFoundError:
    # Replace the bare OS error with a setup hint for the user.
    raise RuntimeError("Missing 'api_key.txt'. Please create it and add your OpenAI API key.")

# Chat model used by every LLM call below.
MODEL_NAME = "gpt-4o"

# Single shared client for all requests.
client = OpenAI(api_key=api_key)


def llm_call_fn(prompt: str) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Sampling uses ``temperature=0.7``, so repeated calls with the same prompt
    can return different text; this call is not deterministic.

    Args:
        prompt: Full prompt text, sent as the only message of the chat.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )
    # The content field may be None (no text returned); guard before stripping
    # so one empty reply does not abort a whole batch of rows.
    content = response.choices[0].message.content
    return (content or "").strip()

def llm_score_fn(headline: str) -> dict:
    """
    Ask the chat model for Fear and Anticipation scores of one headline.

    The prompt tells the model that the headline was engineered for a 50/50
    balance and instructs it to return nearly identical scores as a JSON
    object with the keys 'llm_score_fear' and 'llm_score_anticipation'.

    Returns the parsed JSON object. If the request or the JSON parsing raises,
    the error is printed and both scores fall back to 0.0.
    """
    prompt_text = f"""
    You are an emotion scoring machine analyzing Danish news headlines. The headline you receive was explicitly engineered to achieve a perfect 50/50 balance between FEAR (Frygt/Bekymret) and ANTICIPATION (Forventning/Interrese).

    Based ONLY on the linguistic content, assign a score for 'Fear' and 'Anticipation' on a scale of 0.0 to 1.0.

    **SCORING RULE:**
    1. The two scores MUST be nearly identical (e.g., 0.45 and 0.45) to reflect the explicit 50/50 balance of the input text.
    2. The combined score (Fear + Anticipation) should be high (e.g., between 0.8 and 0.9) to reflect high emotional intensity.
    3. Output your response as a single, valid JSON object, and NOTHING ELSE.

    Headline to score (Title + Subtitle):
    "{headline}"

    Desired JSON format:
    {{
      "llm_score_fear": float,
      "llm_score_anticipation": float
    }}
    """

    chat_messages = [{"role": "user", "content": prompt_text}]

    try:
        # temperature=0.0 plus JSON mode: the reply is expected to be one JSON object.
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            temperature=0.0,
            response_format={"type": "json_object"}
        )
        raw_json = completion.choices[0].message.content
        return json.loads(raw_json)
    except Exception as e:
        # Best-effort scoring: report the failure and continue with zero scores.
        print(f"Error during LLM scoring for headline: {headline}. Error: {e}")
        return {"llm_score_fear": 0.0, "llm_score_anticipation": 0.0}


# ======================================================
# 1. Load data
# ======================================================

# Root folder of the user-study data (absolute path on the author's machine).
BASE = Path(r"C:\Users\posit\OneDrive\桌面\SIGIR2026\Ekstra Bladet\ebnerd_small\Agreement_user_study(final)")

# Input table; the code below reads its "Question", "title" and "subtitle" columns.
MODEL_CSV = BASE.joinpath("study1_stance.csv")
df = pd.read_csv(MODEL_CSV)

# ======================================================
# 2. Emotional frames & contexts (FINAL FEAR-ANTICIPATION)
# ======================================================

# Frames accepted by apply_reframing; only one is defined.
emotional_frames = ["fear_anticipation"]

# Emotion names the rewritten text must avoid; interpolated verbatim into both prompts.
SUPPRESSED_EMOTIONS = "Glæde/Sindsro, Tillid/Accept, Overasket/Målløs, Vrede/Irritation, Foragt/Modvilje, or Sorg/Trist"

# Instruction rules for the fear/anticipation frame; each entry becomes one
# line of the context text inserted into the prompts.
_FEAR_ANTICIPATION_RULES = (
    "***ABSOLUTE BALANCE WARNING:*** The output must combine fear and anticipation in a **strictly balanced two-part structure**, separated by a colon (:). The objective is to produce a headline where neither emotion dominates, ensuring a machine classifier detects both emotions equally.",
    "**PART 1 (FEAR):** MUST appear first. Focus exclusively on **extreme, severe, and immediate risks or threats**. MUST use words like **'risiko', 'fare', 'trussel', 'katastrofe' or 'kritiske'**. The language must emphasize high intensity and danger.",
    "**PART 2 (ANTICIPATION):** MUST appear second. Focus exclusively on expected developments or future actions. MUST use words like 'ventes', 'på vej', or 'forventning'.",
    "**Length Balance:** The word count of Part 1 (Fear) and Part 2 (Anticipation) must be **identical** (or differ by no more than one word) to enforce linguistic equality.",
    "**Tone Balance:** PART 2 MUST NOT mitigate, reduce, or solve the danger in PART 1. The tone must be strictly neutral and journalistic.",
)

# Frame name -> context text.
subtle_emotional_contexts = {
    "fear_anticipation": "\n".join(_FEAR_ANTICIPATION_RULES),
}


# ======================================================
# 3. Anchoring examples 
# ======================================================

# Anchoring example shown to the model by the prompt builders. The
# "(FEAR)" / "(ANTICIPATION)" tags are part of the literal example text.
# NOTE(review): the model may echo these tags in its output — confirm that is acceptable.
FEAR_ANTICIPATION_EXAMPLE = dict(
    title="Udsigt til fald i aktier (FEAR): Nye tal forventes snart (ANTICIPATION)",
    subtitle="Priserne rasler ned i det nye år er en risiko (FEAR): Nye analyser venter et kraftigt opsving i næste kvartal (ANTICIPATION)",
)


# ======================================================
# 4. Prompt builders (FINAL ALARMING TONE FOR FEAR)
# ======================================================

def build_title_prompt(original_title: str, frame: str) -> str:
    """Build the LLM prompt that rewrites a Danish news title for ``frame``.

    The prompt consists of a header, an optional anchoring example, the
    frame's context text, a bullet list of constraints and the original
    title. Every constraint is placed on its own line so the list reads as
    separate bullets.

    Args:
        original_title: Title text to be rewritten.
        frame: Key into ``subtle_emotional_contexts``.

    Returns:
        The prompt text with leading/trailing whitespace removed.

    Raises:
        KeyError: If ``frame`` has no entry in ``subtle_emotional_contexts``.
    """
    context = subtle_emotional_contexts[frame]

    sections = ["You are rewriting a Danish news TITLE."]
    constraints = [
        "- Maximum 12 words",
        "- **Tone must be high-intensity and alarming for the FEAR part.**",
    ]

    if frame == "fear_anticipation":
        # Frame-specific anchoring example and balance rules.
        sections.append(
            "Balanced FEAR/ANTICIPATION example:\n"
            f"Title: {FEAR_ANTICIPATION_EXAMPLE['title']}"
        )
        constraints.extend([
            "- For the Title, the word count for the FEAR part and the ANTICIPATION part should be approximately equal, separated by a colon (:).",
            "- **MANDATORY FEAR WORDS (TITLE):** The FEAR part MUST contain the keywords **'risiko'** AND **'trussel'** to ensure maximum Fear detection.",
        ])

    constraints.extend([
        f"- The text MUST NOT contain language relating to the following emotions: {SUPPRESSED_EMOTIONS}. Use strictly neutral words outside of the specified Fear and Anticipation keywords.",
        "- Output only the rewritten title",
        "- No quotation marks",
    ])

    sections.append(context)
    sections.append("Constraints:\n" + "\n".join(constraints))
    sections.append(f"Original title:\n{original_title}")

    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections).strip()


def build_subtitle_prompt(original_subtitle: str, frame: str) -> str:
    """Build the LLM prompt that rewrites a Danish news subtitle for ``frame``.

    The prompt consists of a header, an optional anchoring example, the
    frame's context text, a bullet list of constraints and the original
    subtitle. Every constraint is placed on its own line so the list reads as
    separate bullets.

    Args:
        original_subtitle: Subtitle text to be rewritten.
        frame: Key into ``subtle_emotional_contexts``.

    Returns:
        The prompt text with leading/trailing whitespace removed.

    Raises:
        KeyError: If ``frame`` has no entry in ``subtle_emotional_contexts``.
    """
    context = subtle_emotional_contexts[frame]

    sections = ["You are rewriting a Danish news SUBTITLE."]
    constraints = [
        "- Maximum 32 words",
        "- **Tone must be high-intensity and alarming for the FEAR part.**",
    ]

    if frame == "fear_anticipation":
        # Frame-specific anchoring example and balance rule.
        sections.append(
            "Balanced FEAR/ANTICIPATION example:\n"
            f"Subtitle: {FEAR_ANTICIPATION_EXAMPLE['subtitle']}"
        )
        constraints.append(
            "- The word count for the FEAR part and the ANTICIPATION part MUST be identical (or differ by no more than one word), separated by a colon (:)."
        )

    constraints.extend([
        f"- The text MUST NOT contain language relating to the following emotions: {SUPPRESSED_EMOTIONS}. Use strictly neutral words outside of the specified Fear and Anticipation keywords.",
        "- Output only the rewritten subtitle",
        "- No quotation marks",
    ])

    sections.append(context)
    sections.append("Constraints:\n" + "\n".join(constraints))
    sections.append(f"Original subtitle:\n{original_subtitle}")

    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections).strip()


# ======================================================
# 5. Word-limit enforcement 
# ======================================================

def enforce_word_limit(text: str, max_words: int) -> str:
    """Keep only the first ``max_words`` whitespace-separated words of ``text``.

    The input is converted with ``str()`` first, and the kept words are
    re-joined with single spaces, so any run of whitespace collapses.
    """
    words = str(text).split()
    kept = words[:max_words]
    return " ".join(kept)


# ======================================================
# 6. Reframe one row 
# ======================================================

def reframe_row(row, frame: str):
    """Rewrite the title and subtitle of one row for the given frame.

    Both prompts are built first, then sent to the LLM (title before
    subtitle). Each reply is cut to its hard word limit: 12 words for the
    title and 32 for the subtitle.

    Returns a ``(new_title, new_subtitle)`` tuple.
    """
    prompts = (
        build_title_prompt(row["title"], frame),
        build_subtitle_prompt(row["subtitle"], frame),
    )

    # One LLM request per prompt, in order.
    raw_title, raw_subtitle = [llm_call_fn(p) for p in prompts]

    # The limits are enforced here even if the model ignored them.
    return enforce_word_limit(raw_title, 12), enforce_word_limit(raw_subtitle, 32)


# ======================================================
# 7. Apply reframing and LLM scoring 
# ======================================================

def apply_reframing(df, frame: str, target_question_ids=None):
    """Rewrite the title/subtitle of selected questions and score the result.

    Works on a copy of ``df``. Adds ``title_<frame>`` and ``subtitle_<frame>``
    (``pd.NA`` for rows that are not selected) plus ``llm_score_fear`` and
    ``llm_score_anticipation`` (0.0 for rows that are not selected).

    Args:
        df: Input table; must contain ``Question``, ``title`` and ``subtitle``.
        frame: Name of the emotional frame; must be listed in
            ``emotional_frames``.
        target_question_ids: Values of ``Question`` whose rows are processed.
            Defaults to ``[3]``.

    Returns:
        The augmented copy of ``df``. When no row matches, the copy is
        returned with the new columns at their initial values and no LLM
        call is made.

    Raises:
        ValueError: If ``frame`` is not a known frame.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if frame not in emotional_frames:
        raise ValueError(f"Unknown frame: {frame}")

    # ``None`` sentinel avoids sharing one mutable default list between calls.
    if target_question_ids is None:
        target_question_ids = [3]

    df_out = df.copy()

    out_title_col = f"title_{frame}"
    out_sub_col = f"subtitle_{frame}"

    # Initialize output columns for every row.
    df_out[out_title_col] = pd.NA
    df_out[out_sub_col] = pd.NA
    df_out["llm_score_fear"] = 0.0
    df_out["llm_score_anticipation"] = 0.0

    mask = df_out["Question"].isin(target_question_ids)

    print(f"Applying frame '{frame}' to Question IDs: {target_question_ids}")
    print("Rows matched:", int(mask.sum()))

    if not mask.any():
        # With zero rows, apply(..., result_type="expand") does not yield the
        # two expected result columns, so indexing them below would fail.
        return df_out

    # 1. Reframing: one (title, subtitle) pair per selected row.
    reframed = df_out.loc[mask].apply(
        lambda r: reframe_row(r, frame),
        axis=1,
        result_type="expand",
    )
    df_out.loc[mask, out_title_col] = reframed[0]
    df_out.loc[mask, out_sub_col] = reframed[1]

    # 2. LLM scoring, only for the 'fear_anticipation' frame.
    if frame == "fear_anticipation":
        print("Starting LLM scoring on reframed headlines...")

        # The scorer receives title and subtitle joined by a single space.
        combined_headline = (
            df_out.loc[mask, out_title_col].astype(str) + " " +
            df_out.loc[mask, out_sub_col].astype(str)
        ).str.strip()

        scores = combined_headline.apply(llm_score_fn)

        # Missing keys in a score dict fall back to 0.0.
        for score_col in ("llm_score_fear", "llm_score_anticipation"):
            df_out.loc[mask, score_col] = scores.apply(
                lambda s, key=score_col: s.get(key, 0.0)
            )

        print("LLM scoring complete.")

    return df_out


# ======================================================
# 8. Example usage and saving
# ======================================================

# --- EXECUTION BLOCK ---

# Frame to apply and the Question IDs whose rows are rewritten.
FRAME = "fear_anticipation"
TARGET_Q_IDS = [3]

# IMPORTANT: this call executes the LLM requests and will consume API tokens.
df_out = apply_reframing(df, frame=FRAME, target_question_ids=TARGET_Q_IDS)


# --- SAVING BLOCK ---

# 1. Save reframed text columns (for your external classifier)
# Output locations: text-only export first, full export (with LLM scores) second.
ORIGINAL_OUT_CSV = BASE.joinpath("study1_stance_reframed_fear_anticipation_Q3_only.csv")
LLM_SCORED_OUT_CSV = BASE.joinpath("study1_stance_reframed_and_LLM_SCORED.csv")

# 1. Text-only export: every column whose name does not start with "llm_".
reframed_text_cols = list(filter(lambda name: not name.startswith("llm_"), df_out.columns))
df_reframed_only = df_out[reframed_text_cols].copy()
df_reframed_only.to_csv(ORIGINAL_OUT_CSV, encoding="utf-8-sig", index=False)
print(f"\nSaved (Reframed Text Only for original emotion detection script): {ORIGINAL_OUT_CSV}")

# 2. Full export, including the llm_score_* columns.
df_out.to_csv(LLM_SCORED_OUT_CSV, encoding="utf-8-sig", index=False)
print(f"Saved (Full Data including LLM Scores): {LLM_SCORED_OUT_CSV}")

print("\n--- Execution Complete ---")

# Headline validation for emotion detection

In [None]:
# -*- coding: utf-8 -*-
"""
Emotion detection for selected rows (by Question ID)
Combine title + subtitle into one field called: headline
Model: alexandrainst/da-emotion-classification-base

UPDATED:
- fear_hope -> fear_anticipation
- title_fear_hope/subtitle_fear_hope -> title_fear_anticipation/subtitle_fear_anticipation
- headline_fear_hope -> headline_fear_anticipation
"""

from pathlib import Path
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TextClassificationPipeline,
)

# ======================================================
# 1) Paths & settings
# ======================================================

# Root folder of the user-study data (absolute path on the author's machine).
BASE = Path(r"C:\Users\posit\OneDrive\桌面\SIGIR2026\Ekstra Bladet\ebnerd_small\Agreement_user_study(final)")

# Input: the text-only CSV written by the reframing script.
IN_CSV = BASE.joinpath("study1_stance_reframed_fear_anticipation_Q3_only.csv")

# Question IDs to evaluate, e.g. [3] or [3, 8, 16]; None selects every row.
TARGET_QUESTION_IDS = [3]

# Names of the reframed text columns expected in the input CSV.
_FRAME_SUFFIX = "fear_anticipation"
REF_TITLE_COL = f"title_{_FRAME_SUFFIX}"
REF_SUB_COL = f"subtitle_{_FRAME_SUFFIX}"


# ======================================================
# 2) Load CSV & filter rows
# ======================================================

df = pd.read_csv(IN_CSV)

# Fail early when any column used below is absent from the CSV.
required_cols = {"Question", "title", "subtitle", REF_TITLE_COL, REF_SUB_COL}
missing = required_cols.difference(df.columns)
if missing:
    raise ValueError(f"Missing columns in CSV: {sorted(missing)}")

# Coerce Question to numbers; values that cannot be parsed become NaN.
df = df.copy()
df["Question"] = pd.to_numeric(df["Question"], errors="coerce")

# Keep only the requested questions, or every row when no filter is set.
if TARGET_QUESTION_IDS is None:
    df_sel = df.copy()
else:
    df_sel = df[df["Question"].isin(TARGET_QUESTION_IDS)].copy()

# Fresh 0..n-1 index so later column-wise concatenation lines up by position.
df_sel = df_sel.reset_index(drop=True)
print(f"Rows selected: {len(df_sel)}")


# ======================================================
# 3) Combine title + subtitle => headline
# ======================================================

# Normalize the four text columns to stripped strings.
for c in ["title", "subtitle", REF_TITLE_COL, REF_SUB_COL]:
    # Missing values are replaced BEFORE the string cast: astype(str) turns
    # NaN into the literal text "nan", which fillna can no longer detect.
    df_sel[c] = df_sel[c].fillna("").astype(str).str.strip()

# Original combined: title + " " + subtitle, whitespace runs collapsed.
df_sel["headline"] = (
    df_sel["title"] + " " + df_sel["subtitle"]
).str.replace(r"\s+", " ", regex=True).str.strip()

# Reframed combined, built the same way from the reframed columns.
df_sel["headline_fear_anticipation"] = (
    df_sel[REF_TITLE_COL] + " " + df_sel[REF_SUB_COL]
).str.replace(r"\s+", " ", regex=True).str.strip()


# ======================================================
# 4) Load Danish emotion model
# ======================================================

# Hugging Face model id of the Danish emotion classifier.
MODEL = "alexandrainst/da-emotion-classification-base"

mdl = AutoModelForSequenceClassification.from_pretrained(MODEL)
tok = AutoTokenizer.from_pretrained(MODEL)

# First CUDA device when available, otherwise CPU (-1).
_device = 0 if torch.cuda.is_available() else -1

# return_all_scores=True yields a score for every label, not just the top one.
# NOTE(review): newer transformers releases may deprecate this flag in favor
# of top_k=None — confirm against the installed version.
pipe = TextClassificationPipeline(
    model=mdl,
    tokenizer=tok,
    device=_device,
    return_all_scores=True,
)

# Label names ordered by class index.
id2label = mdl.config.id2label
labels = [id2label[class_idx] for class_idx in range(mdl.config.num_labels)]


# ======================================================
# 5) Helper: classify one text column
# ======================================================

def classify_column(df_in: pd.DataFrame, text_col: str, prefix: str, batch_size: int = 32) -> pd.DataFrame:
    """Classify every text in ``df_in[text_col]`` with the emotion pipeline.

    Args:
        df_in: Table holding the texts.
        text_col: Name of the column to classify.
        prefix: Prefix for all output column names.
        batch_size: Batch size passed to the pipeline.

    Returns:
        A new DataFrame with a default 0..n-1 index and one row per input
        text: ``<prefix>_emo_label`` (highest-scoring label),
        ``<prefix>_emo_score`` (its score) and one ``<prefix>_prob_<label>``
        column per label in ``labels``.
    """
    # Missing values are replaced BEFORE the string cast: astype(str) would
    # turn NaN into the literal text "nan", and that text would be classified.
    texts = df_in[text_col].fillna("").astype(str).tolist()

    # Skip the model call for an empty selection; the loop below then simply
    # produces empty columns.
    outs = pipe(texts, batch_size=batch_size, truncation=True, max_length=256) if texts else []

    emo_label = []
    emo_score = []
    probs = {f"{prefix}_prob_{lbl}": [] for lbl in labels}

    for scores in outs:
        # ``scores`` holds one {"label", "score"} dict per label for this text.
        best = max(scores, key=lambda d: d["score"])
        emo_label.append(best["label"])
        emo_score.append(float(best["score"]))

        # Labels absent from the pipeline output get probability 0.0.
        smap = {d["label"]: float(d["score"]) for d in scores}
        for lbl in labels:
            probs[f"{prefix}_prob_{lbl}"].append(smap.get(lbl, 0.0))

    return pd.DataFrame(
        {
            f"{prefix}_emo_label": emo_label,
            f"{prefix}_emo_score": emo_score,
            **probs,
        }
    )


# ======================================================
# 6) Run emotion detection on combined headlines
# ======================================================

# Classify the original and the reframed combined headline separately.
orig_head_res = classify_column(df_sel, text_col="headline", prefix="orig_headline")
ref_head_res = classify_column(df_sel, text_col="headline_fear_anticipation", prefix="ref_headline")

# Side-by-side join; rows are matched on the index of each frame.
out = pd.concat([df_sel, orig_head_res, ref_head_res], axis="columns")


# ======================================================
# 7) Preview
# ======================================================

# Question id, then for each headline variant: text, top label, top score.
preview_cols = ["Question"] + [
    col
    for text_col, res_prefix in (
        ("headline", "orig_headline"),
        ("headline_fear_anticipation", "ref_headline"),
    )
    for col in (text_col, f"{res_prefix}_emo_label", f"{res_prefix}_emo_score")
]

# Widen the display temporarily so long headlines are not cut off early.
with pd.option_context("display.max_colwidth", 140, "display.width", 220):
    print(out.loc[:, preview_cols])


# ======================================================
# 8) Save
# ======================================================

# NOTE(review): the file name ends in "Q8" while TARGET_QUESTION_IDS is [3] —
# confirm the intended output name.
OUT_CSV = BASE.joinpath("study1_stance_fear_anticipation_emotion_eval_headline_Q8.csv")
out.to_csv(OUT_CSV, encoding="utf-8-sig", index=False)
print(f"\nSaved: {OUT_CSV}")
