b77 full

In [None]:
import pandas as pd
import openai
import json
import re
import random
from tqdm import tqdm
from typing import List, Dict
from collections import defaultdict

# Load few-shot support file
# The parsed JSON is iterated below as a sequence of records, each of which
# must carry a "true_label" key.
with open("/home/xrspace/Llama/Thesis/reduce/fs_fewshot_samples_gpt4.json", "r", encoding="utf-8") as f:
    FEWSHOT_DB = json.load(f)

# Organize few-shot examples by label
# defaultdict(list) lets every record be appended without pre-creating the
# bucket for its label; build_prompt looks examples up in this mapping.
label_to_example = defaultdict(list)
for entry in FEWSHOT_DB:
    label_to_example[entry["true_label"]].append(entry)

# Set OpenAI API key
# NOTE(review): "key" looks like a redacted placeholder — presumably a real
# key has to be supplied before running; avoid committing it to the notebook.
openai.api_key = "key"

# Define full Banking77 label set (NOTE: stored under the name `clinc150_labels`, although these are Banking77 intents)
clinc150_labels = [
    "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up",
    "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed", "cancel_transfer", "card_acceptance", "card_arrival", "card_delivery_estimate",
    "card_linking", "card_not_working", "card_payment_fee_charged", "card_payment_not_recognised",
    "card_payment_wrong_exchange_rate", "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised",
    "change_pin", "chargeback", "contactless_not_working", "country_support", "declined_card_payment",
    "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised",
    "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate", "exchange_via_app",
    "extra_charge_on_statement", "failed_transfer", "fiat_currency_support", "get_disposable_virtual_card",
    "getting_virtual_card", "getting_spare_card", "getting_new_card", "how_long_to_receive_card",
    "how_long_to_receive_pin", "atm_fees", "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card",
    "passcode_forgotten", "passcode_not_working", "pending_card_payment", "pending_cash_withdrawal",
    "pending_top_up", "pending_transfer", "pin_blocked", "receiving_money", "request_refund", "reverted_card_payment?",
    "supported_cards_and_currencies", "terminate_account", "top_up_by_bank_transfer_charge",
    "top_up_by_card_charge", "top_up_by_cash_or_cheque", "top_up_failed", "top_up_limits", "top_up_reverted",
    "topping_up_by_card", "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account",
    "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity", "verify_my_identity",
    "verify_source_of_funds", "virtual_card_not_working", "visa_or_mastercard", "why_verify_identity",
    "wrong_amount_of_cash_received", "wrong_exchange_rate_for_cash_withdrawal"
]

# Build prompt using dynamic few-shot samples (capped at 40 total examples)
def build_prompt(user_input: str, topn_labels: List[str]) -> str:
    """Assemble the few-shot, role-extraction prompt for a single query.

    Args:
        user_input: The raw user query to classify.
        topn_labels: Candidate intent labels in snake_case. They are listed
            in the prompt with underscores replaced by spaces.

    Returns:
        The full prompt text: sampled few-shot examples, the instructions,
        the candidate options, the query, and the expected JSON answer format.

    Few-shot examples are drawn from the module-level ``label_to_example``
    mapping: up to 3 random examples per candidate label, visited in the
    order given, until 40 examples have been collected in total.
    """
    formatted_options = "\n".join(f"- {label.replace('_', ' ')}" for label in topn_labels)
    MAX_FEWSHOT = 40
    few_shots = []

    for label in topn_labels:
        # Stop as soon as the global cap is reached; later labels get no examples.
        if len(few_shots) >= MAX_FEWSHOT:
            break
        if label not in label_to_example:
            continue

        pool = label_to_example[label]
        picked = random.sample(pool, min(3, len(pool)))  # Up to 3 per label
        # Only keep as many of the sampled examples as still fit under the cap.
        room = MAX_FEWSHOT - len(few_shots)
        for ex in picked[:room]:
            few_shots.append(f"""Example:
Query: "{ex['text']}"
Roles: {ex['entities']}
Reasoning: {ex['explanation']}
Label: {label}
---""")

    few_shot_block = "\n".join(few_shots)
    return f"""You are an intelligent assistant for intent classification in banking queries.
{few_shot_block}
Instructions:
1. Extract the semantic elements from the user query as structured roles:
   - action (what is being done)
   - object (what it’s done to)
   - target (who or what is affected)
   - temporal (time-related references, if any)
   - intent_clue (phrases showing intent)
2. Use the extracted roles to reason about how the query aligns with each of the options below.
3. For each option, explain why it does or does not match the extracted semantics.
4. Finally, choose the most appropriate intent label from the given options.
5. Return final decision in the original snake_case label.
OPTIONS:
{formatted_options}
---
QUERY: {user_input}
Respond in the following format:
```json
{{
  "roles": {{
    "action": "...",
    "object": "...",
    "target": "...",
    "temporal": "...",
    "intent_clue": "..."
  }},
  "reasoning": "...",
  "label": "..."
}}
```"""

# Query GPT model
def query_model(prompt: str) -> str:
    """Send *prompt* to gpt-3.5-turbo and return the first choice's message content.

    The request uses temperature 0 and allows at most 600 completion tokens.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful AI assistant specializing in intent classification.",
    }
    user_message = {"role": "user", "content": prompt}
    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
        temperature=0,
        max_tokens=600,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Parse structured output
def parse_response(response: str) -> Dict[str, str]:
    """Extract roles, reasoning and predicted label from the model's JSON answer.

    Args:
        response: Raw model output. The JSON object may be wrapped in a
            Markdown code fence or surrounded by other text.

    Returns:
        A dict with keys ``entities`` (the ``roles`` object re-serialised as a
        JSON string), ``explanation`` (the ``reasoning`` field) and
        ``predicted_label`` (the ``label`` field, whitespace-stripped).
        When no JSON object can be recovered, the fallback
        ``{"entities": "N/A", "explanation": "N/A", "predicted_label": "Unknown"}``
        is returned. A missing, empty or non-string label becomes ``"Unknown"``,
        so downstream ``.strip()`` calls never see ``None``.
    """
    fallback = {"entities": "N/A", "explanation": "N/A", "predicted_label": "Unknown"}

    # The API can hand back no text at all; treat that like an unparsable answer.
    if not isinstance(response, str):
        print("Parsing error: response is not text")
        return fallback

    # Greedy match from the first "{" to the last "}" so nested objects stay intact.
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if match is None:
        print("Parsing error: no JSON object found in response")
        return fallback

    try:
        parsed_json = json.loads(match.group(0))
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        print("Parsing error:", e)
        return fallback

    roles = parsed_json.get("roles", {})
    explanation = parsed_json.get("reasoning", "N/A")

    # `"label": null` or a non-string value must not leak through as-is.
    label = parsed_json.get("label")
    if isinstance(label, str) and label.strip():
        predicted_label = label.strip()
    else:
        predicted_label = "Unknown"

    return {
        "entities": json.dumps(roles, ensure_ascii=False),
        "explanation": explanation,
        "predicted_label": predicted_label,
    }

# Main evaluation pipeline
def run_ner_cot(input_csv: str, output_json: str):
    """Classify every row of *input_csv* and dump the predictions to *output_json*.

    The CSV must provide ``id``, ``text`` and ``label`` columns. Each row is
    prompted with the full ``clinc150_labels`` list, sent to the model and
    parsed. The collected records are written as indented JSON and returned.
    """
    df = pd.read_csv(input_csv)
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        text, true_label = row["text"], row["label"]
        # Full label set is offered as candidates for every query.
        parsed = parse_response(query_model(build_prompt(text, clinc150_labels)))
        record = {
            "id": int(row["id"]),
            "text": text,
            "true_label": true_label,
            "top_k_predictions": clinc150_labels,
        }
        for field in ("entities", "explanation", "predicted_label"):
            record[field] = parsed[field]
        results.append(record)

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to: {output_json}")
    return results

# Accuracy check
def evaluate_accuracy(results: List[Dict], mistake_file: str = "e2i_macc_b77_mistakes-turbo.json"):
    """Report exact-match accuracy and save the misclassified records.

    Args:
        results: Prediction records carrying ``true_label`` and
            ``predicted_label``.
        mistake_file: Path of the JSON file that receives the mismatching
            records. It is only written when at least one mistake exists.

    Returns:
        The fraction of records whose labels match after whitespace
        stripping, or ``0`` for an empty *results* list.

    Labels that are not strings (e.g. ``None`` from a JSON ``null``) are
    compared by their string form instead of raising on ``.strip()``.
    """
    def _norm(value) -> str:
        # None becomes "" so it can never equal a real label.
        return "" if value is None else str(value).strip()

    total = len(results)
    # Single pass: everything that is not a mistake is correct.
    mistakes = [r for r in results if _norm(r["true_label"]) != _norm(r["predicted_label"])]
    correct = total - len(mistakes)
    accuracy = correct / total if total else 0

    print(f"\n🔍 Accuracy: {accuracy*100:.2f}%")
    print(f"Mistakes: {len(mistakes)} / {total}")
    if mistakes:
        with open(mistake_file, "w", encoding="utf-8") as f:
            json.dump(mistakes, f, indent=2, ensure_ascii=False)
        print(f"Mistakes saved to: {mistake_file}")
    else:
        print("🎉 No mistakes! Perfect predictions.")
    return accuracy

# Entry point
if __name__ == "__main__":
    # The input CSV must provide the "id", "text" and "label" columns that run_ner_cot reads.
    input_csv = "/home/xrspace/Llama/Thesis/E2i/maccot_topk_formatted.csv"
    # All predictions go to output_json; only the misclassified records go to mistake_json.
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fulloption_predict_b77_gpt.json"
    mistake_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fulloption_mistake_b77_gpt.json"
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results, mistake_file=mistake_json)

  0%|          | 0/308 [00:00<?, ?it/s]

100%|██████████| 308/308 [09:03<00:00,  1.76s/it]


✅ Predictions saved to: /home/xrspace/Llama/Thesis/E2i/ablition/fulloption_predict_b77_gpt.json

🔍 Accuracy: 52.92%
Mistakes: 145 / 308
Mistakes saved to: /home/xrspace/Llama/Thesis/E2i/ablition/fulloption_mistake_b77_gpt.json





fs flat

In [6]:
import pandas as pd
import openai
import json
import re
from tqdm import tqdm
from typing import List, Dict
from collections import defaultdict

# Set OpenAI API key
# NOTE(review): "key" looks like a redacted placeholder — presumably a real
# key has to be supplied before running; avoid committing it to the notebook.
openai.api_key = "key"

# Load few-shot examples
# The parsed JSON is iterated below as a sequence of records, each of which
# must carry a "true_label" key.
with open("/home/xrspace/Llama/Thesis/reduce/fs_fewshot_samples_gpt4.json", "r", encoding="utf-8") as f:
    FEWSHOT_DB = json.load(f)

# Group the records by gold label; build_prompt takes the first record of a
# label as its demonstration.
label_to_example = defaultdict(list)
for entry in FEWSHOT_DB:
    label_to_example[entry["true_label"]].append(entry)

# Full Banking77 label set
banking77_labels = [
    "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up",
    "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed", "cancel_transfer", "card_acceptance", "card_arrival", "card_delivery_estimate",
    "card_linking", "card_not_working", "card_payment_fee_charged", "card_payment_not_recognised",
    "card_payment_wrong_exchange_rate", "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised",
    "change_pin", "chargeback", "contactless_not_working", "country_support", "declined_card_payment",
    "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised",
    "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate", "exchange_via_app",
    "extra_charge_on_statement", "failed_transfer", "fiat_currency_support", "get_disposable_virtual_card",
    "getting_virtual_card", "getting_spare_card", "getting_new_card", "how_long_to_receive_card",
    "how_long_to_receive_pin", "atm_fees", "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card",
    "passcode_forgotten", "passcode_not_working", "pending_card_payment", "pending_cash_withdrawal",
    "pending_top_up", "pending_transfer", "pin_blocked", "receiving_money", "request_refund", "reverted_card_payment?",
    "supported_cards_and_currencies", "terminate_account", "top_up_by_bank_transfer_charge",
    "top_up_by_card_charge", "top_up_by_cash_or_cheque", "top_up_failed", "top_up_limits", "top_up_reverted",
    "topping_up_by_card", "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account",
    "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity", "verify_my_identity",
    "verify_source_of_funds", "virtual_card_not_working", "visa_or_mastercard", "why_verify_identity",
    "wrong_amount_of_cash_received", "wrong_exchange_rate_for_cash_withdrawal"
]

# 1. Build prompt using all 77 labels
def build_prompt(user_input: str) -> str:
    """Build the flat few-shot classification prompt for one sentence.

    Args:
        user_input: The sentence to classify.

    Returns:
        A prompt that lists every label of ``banking77_labels`` inside braces,
        shows up to two demonstrations and ends with the open ``LABEL:`` slot
        for *user_input*.

    The demonstrations are the first stored example of the first two labels
    (in ``banking77_labels`` order) that have an entry in ``label_to_example``.
    """
    few_shots = []
    added_labels = set()

    # Use this cell's own label list; the previous reference to
    # `clinc150_labels` only resolved when another cell had defined it.
    for label in banking77_labels:
        if label in label_to_example and label not in added_labels:
            ex = label_to_example[label][0]
            few_shots.append(f"SENTENCE: {ex['text']}\nLABEL: {label}")
            added_labels.add(label)
        if len(few_shots) == 2:
            break

    few_shot_block = "\n\n".join(few_shots)
    options_block = ", ".join(banking77_labels)

    # The triple braces render as a literal "{...}" around the option list.
    prompt = f"""Below is a text classification problem.
Note that you can only select the label in
{{{options_block}}}

{few_shot_block}

SENTENCE: {user_input}
LABEL:"""
    return prompt

# 2. Query GPT model
def query_model(prompt: str) -> str:
    """Ask gpt-3.5-turbo to complete *prompt* and return the first choice's text.

    Runs at temperature 0 with a 150-token completion budget.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant for intent classification."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=0,
        max_tokens=150,
    )
    return completion.choices[0].message.content

# 3. Parse response
def parse_response(response: str) -> Dict[str, str]:
    """Take the first line of the trimmed model output as the predicted label.

    ``entities`` and ``explanation`` are fixed to ``"N/A"`` so the record has
    the same keys as the structured (JSON) variant of this pipeline.
    """
    first_line, _, _ = response.strip().partition("\n")
    parsed = {"entities": "N/A", "explanation": "N/A"}
    parsed["predicted_label"] = first_line
    return parsed

# 4. Run classification on all samples (the 10-sample limit is commented out)
def run_ner_cot(input_csv: str, output_json: str):
    """Classify each CSV row with the flat few-shot prompt and save the results.

    Reads the ``id``, ``text`` and ``label`` columns of *input_csv*, queries
    the model once per row, writes all records to *output_json* as indented
    JSON and returns them.
    """
    frame = pd.read_csv(input_csv)
    results = []
    progress = tqdm(frame.iterrows(), total=len(frame))
    for _, sample in progress:
        text = sample['text']
        true_label = sample['label']
        # Prompt -> model -> parsed label, chained for a single row.
        parsed = parse_response(query_model(build_prompt(text)))
        results.append({
            "id": int(sample['id']),
            "text": text,
            "true_label": true_label,
            "top_k_predictions": banking77_labels,
            "entities": parsed["entities"],
            "explanation": parsed["explanation"],
            "predicted_label": parsed["predicted_label"],
        })

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to {output_json}")
    return results

# 5. Evaluate Accuracy
def evaluate_accuracy(
    results: List[Dict],
    mistake_file: str = "/home/xrspace/Llama/Thesis/E2i/ablition/fs_alloption_mistake_gpt.json",
):
    """Print an accuracy report and save the misclassified records.

    Args:
        results: Prediction records carrying string ``true_label`` and
            ``predicted_label`` values.
        mistake_file: Destination for the mismatching records. Defaults to
            the path this cell previously hard-coded. It is only written when
            at least one mistake exists.

    Returns:
        The fraction of records whose labels match after whitespace
        stripping, or ``0`` for an empty *results* list.
    """
    total = len(results)
    # One pass over the data: whatever is not a mistake is correct.
    mistakes = [r for r in results if r['true_label'].strip() != r['predicted_label'].strip()]
    correct = total - len(mistakes)
    accuracy = correct / total if total else 0

    print("\n📊 Accuracy Report")
    print(f"Total Samples: {total}")
    print(f"Correct Predictions: {correct}")
    print(f"Accuracy: {accuracy*100:.2f}%")

    if mistakes:
        with open(mistake_file, "w", encoding="utf-8") as f:
            json.dump(mistakes, f, indent=2, ensure_ascii=False)
        # Report the path actually written (the old message ended in ".json.json").
        print(f"\n🔍 {len(mistakes)} mistakes saved to {mistake_file}")
    return accuracy

# 6. Entry Point
if __name__ == "__main__":
    # The input CSV must provide the "id", "text" and "label" columns that run_ner_cot reads.
    input_csv = "/home/xrspace/Llama/Thesis/E2i/ablition/maccot_b77_no_topk.csv"
    # All predictions are written here; evaluate_accuracy saves the mistakes to its own path.
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fs_alloption_prediction_gpt.json"
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results)

100%|██████████| 308/308 [02:58<00:00,  1.72it/s]


✅ Predictions saved to /home/xrspace/Llama/Thesis/E2i/ablition/fs_alloption_prediction_gpt.json

📊 Accuracy Report
Total Samples: 308
Correct Predictions: 124
Accuracy: 40.26%

🔍 184 mistakes saved to fs_alloption_mistake_gpt.json.json





zs

In [None]:
import pandas as pd
import openai
import json
import re
from tqdm import tqdm
from typing import List, Dict
from collections import defaultdict

# Set OpenAI API key
# NOTE(review): "key" looks like a redacted placeholder — presumably a real
# key has to be supplied before running; avoid committing it to the notebook.
openai.api_key = "key"

# Load few-shot examples
# The parsed JSON is iterated below as a sequence of records, each of which
# must carry a "true_label" key.
with open("/home/xrspace/Llama/Thesis/reduce/fs_fewshot_samples_gpt4.json", "r", encoding="utf-8") as f:
    FEWSHOT_DB = json.load(f)

# Group the records by gold label so examples can be looked up per label.
label_to_example = defaultdict(list)
for entry in FEWSHOT_DB:
    label_to_example[entry["true_label"]].append(entry)

# Full Banking77 label set
banking77_labels = [
    "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up",
    "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed", "cancel_transfer", "card_acceptance", "card_arrival", "card_delivery_estimate",
    "card_linking", "card_not_working", "card_payment_fee_charged", "card_payment_not_recognised",
    "card_payment_wrong_exchange_rate", "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised",
    "change_pin", "chargeback", "contactless_not_working", "country_support", "declined_card_payment",
    "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised",
    "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate", "exchange_via_app",
    "extra_charge_on_statement", "failed_transfer", "fiat_currency_support", "get_disposable_virtual_card",
    "getting_virtual_card", "getting_spare_card", "getting_new_card", "how_long_to_receive_card",
    "how_long_to_receive_pin", "atm_fees", "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card",
    "passcode_forgotten", "passcode_not_working", "pending_card_payment", "pending_cash_withdrawal",
    "pending_top_up", "pending_transfer", "pin_blocked", "receiving_money", "request_refund", "reverted_card_payment?",
    "supported_cards_and_currencies", "terminate_account", "top_up_by_bank_transfer_charge",
    "top_up_by_card_charge", "top_up_by_cash_or_cheque", "top_up_failed", "top_up_limits", "top_up_reverted",
    "topping_up_by_card", "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account",
    "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity", "verify_my_identity",
    "verify_source_of_funds", "virtual_card_not_working", "visa_or_mastercard", "why_verify_identity",
    "wrong_amount_of_cash_received", "wrong_exchange_rate_for_cash_withdrawal"
]

# 1. Build prompt using all 77 labels
import pandas as pd
import openai
import json
import re
from tqdm import tqdm
from typing import List, Dict
from collections import defaultdict

# Set OpenAI API key
# NOTE(review): this setup (API key, few-shot load, label index) repeats the
# identical statements found earlier in this same cell — looks like an
# accidental paste; confirm and drop one copy.
openai.api_key = "key"

# Load few-shot examples
# The parsed JSON is iterated below as a sequence of records, each of which
# must carry a "true_label" key.
with open("/home/xrspace/Llama/Thesis/reduce/fs_fewshot_samples_gpt4.json", "r", encoding="utf-8") as f:
    FEWSHOT_DB = json.load(f)

# Group the records by gold label; build_prompt takes the first record of a
# label as its demonstration.
label_to_example = defaultdict(list)
for entry in FEWSHOT_DB:
    label_to_example[entry["true_label"]].append(entry)

# Full Banking77 label set
banking77_labels = [
    "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up",
    "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed", "cancel_transfer", "card_acceptance", "card_arrival", "card_delivery_estimate",
    "card_linking", "card_not_working", "card_payment_fee_charged", "card_payment_not_recognised",
    "card_payment_wrong_exchange_rate", "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised",
    "change_pin", "chargeback", "contactless_not_working", "country_support", "declined_card_payment",
    "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised",
    "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate", "exchange_via_app",
    "extra_charge_on_statement", "failed_transfer", "fiat_currency_support", "get_disposable_virtual_card",
    "getting_virtual_card", "getting_spare_card", "getting_new_card", "how_long_to_receive_card",
    "how_long_to_receive_pin", "atm_fees", "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card",
    "passcode_forgotten", "passcode_not_working", "pending_card_payment", "pending_cash_withdrawal",
    "pending_top_up", "pending_transfer", "pin_blocked", "receiving_money", "request_refund", "reverted_card_payment?",
    "supported_cards_and_currencies", "terminate_account", "top_up_by_bank_transfer_charge",
    "top_up_by_card_charge", "top_up_by_cash_or_cheque", "top_up_failed", "top_up_limits", "top_up_reverted",
    "topping_up_by_card", "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account",
    "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity", "verify_my_identity",
    "verify_source_of_funds", "virtual_card_not_working", "visa_or_mastercard", "why_verify_identity",
    "wrong_amount_of_cash_received", "wrong_exchange_rate_for_cash_withdrawal"
]

# 1. Build prompt using all 77 labels
def build_prompt(user_input: str) -> str:
    """Compose the flat few-shot prompt for *user_input*.

    The prompt names every label in ``banking77_labels`` inside braces, then
    shows at most two demonstrations, then leaves ``LABEL:`` open for the
    sentence to classify. A demonstration is the first stored example of a
    label. Labels are scanned in list order and those without an entry in
    ``label_to_example`` are skipped.
    """
    demos = []
    seen = set()

    for label in banking77_labels:
        has_unused_example = label not in seen and label in label_to_example
        if has_unused_example:
            seen.add(label)
            ex = label_to_example[label][0]
            demos.append(f"SENTENCE: {ex['text']}\nLABEL: {label}")
        # Two demonstrations are enough; stop scanning the label list.
        if len(demos) == 2:
            break

    options_block = ", ".join(banking77_labels)
    few_shot_block = "\n\n".join(demos)

    # The triple braces render as a literal "{...}" around the option list.
    return f"""Below is a text classification problem.
Note that you can only select the label in
{{{options_block}}}

{few_shot_block}

SENTENCE: {user_input}
LABEL:"""

# 2. Query GPT model
def query_model(prompt: str) -> str:
    """Return gpt-3.5-turbo's first-choice reply to *prompt*.

    The call is made with temperature 0 and at most 150 completion tokens.
    """
    request = dict(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant for intent classification."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
        max_tokens=150,
    )
    reply = openai.chat.completions.create(**request)
    return reply.choices[0].message.content

# 3. Parse response
def parse_response(response: str) -> Dict[str, str]:
    """Wrap the first line of the model's answer as the predicted label.

    Leading and trailing whitespace is trimmed before the first line is
    taken. ``entities`` and ``explanation`` are always ``"N/A"`` here.
    """
    trimmed = response.strip()
    # Only the text up to the first newline counts as the label.
    predicted = trimmed.split("\n", 1)[0]
    return {"entities": "N/A", "explanation": "N/A", "predicted_label": predicted}

# 4. Run classification on all samples (the 10-sample limit is commented out)
def run_ner_cot(input_csv: str, output_json: str):
    """Run the prompt -> model -> parse loop over every row of *input_csv*.

    The CSV is expected to have ``id``, ``text`` and ``label`` columns. One
    record per row is collected, the list is written to *output_json* as
    indented JSON, and the same list is returned.
    """
    df = pd.read_csv(input_csv)
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        text = row['text']
        true_label = row['label']

        response = query_model(build_prompt(text))
        parsed = parse_response(response)

        # Start from the row's own data, then attach the parsed model output.
        result = {
            "id": int(row['id']),
            "text": text,
            "true_label": true_label,
            "top_k_predictions": banking77_labels,
        }
        result["entities"] = parsed["entities"]
        result["explanation"] = parsed["explanation"]
        result["predicted_label"] = parsed["predicted_label"]
        results.append(result)

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to {output_json}")
    return results

# 5. Evaluate Accuracy
def evaluate_accuracy(
    results: List[Dict],
    mistake_file: str = "/home/xrspace/Llama/Thesis/E2i/ablition/fs_alloption_mistake_gpt.json",
):
    """Print an accuracy report and save the misclassified records.

    Args:
        results: Prediction records carrying string ``true_label`` and
            ``predicted_label`` values.
        mistake_file: Destination for the mismatching records. Defaults to
            the path this function previously hard-coded. It is only written
            when at least one mistake exists.

    Returns:
        The fraction of records whose labels match after whitespace
        stripping, or ``0`` for an empty *results* list.
    """
    total = len(results)
    mistakes = []
    correct = 0
    # Classify each record exactly once instead of comparing twice.
    for r in results:
        if r['true_label'].strip() == r['predicted_label'].strip():
            correct += 1
        else:
            mistakes.append(r)
    accuracy = correct / total if total else 0

    print("\n📊 Accuracy Report")
    print(f"Total Samples: {total}")
    print(f"Correct Predictions: {correct}")
    print(f"Accuracy: {accuracy*100:.2f}%")

    if mistakes:
        with open(mistake_file, "w", encoding="utf-8") as f:
            json.dump(mistakes, f, indent=2, ensure_ascii=False)
        # Report the path actually written (the old message ended in ".json.json").
        print(f"\n🔍 {len(mistakes)} mistakes saved to {mistake_file}")
    return accuracy

# 6. Entry Point
if __name__ == "__main__":
    # The input CSV must provide the "id", "text" and "label" columns that run_ner_cot reads.
    input_csv = "/home/xrspace/Llama/Thesis/E2i/ablition/maccot_b77_no_topk.csv"
    # All predictions are written here; evaluate_accuracy saves the mistakes to its own path.
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fs_alloption_prediction_gpt.json"
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results)

# 2. Query GPT model
def query_model(prompt: str) -> str:
    """Query gpt-3.5-turbo with *prompt* and hand back the first choice's content.

    Uses temperature 0 and a 150-token completion limit.
    """
    system_turn = {"role": "system", "content": "You are a helpful AI assistant for intent classification."}
    user_turn = {"role": "user", "content": prompt}
    result = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_turn, user_turn],
        temperature=0,
        max_tokens=150,
    )
    top_choice = result.choices[0]
    return top_choice.message.content

# 3. Parse response
def parse_response(response: str) -> Dict[str, str]:
    """Return a result record whose label is the first line of *response*.

    The response is stripped of surrounding whitespace first. The
    ``entities`` and ``explanation`` slots are filled with ``"N/A"``.
    """
    lines = response.strip().split("\n")
    record = dict.fromkeys(("entities", "explanation"), "N/A")
    record["predicted_label"] = lines[0]
    return record

# 4. Run classification on all samples (the 10-sample limit is commented out)
def run_ner_cot(input_csv: str, output_json: str):
    """Label every row of *input_csv* with the model and persist the outcome.

    Needs ``id``, ``text`` and ``label`` columns in the CSV. Writes the list
    of per-row records to *output_json* (indented JSON) and returns it.
    """
    table = pd.read_csv(input_csv)
    results = []
    row_iter = tqdm(table.iterrows(), total=len(table))

    for _, entry in row_iter:
        text = entry['text']
        true_label = entry['label']
        prompt = build_prompt(text)
        parsed = parse_response(query_model(prompt))
        results.append(
            {
                "id": int(entry['id']),
                "text": text,
                "true_label": true_label,
                # The whole label set was offered, so it is recorded as the candidates.
                "top_k_predictions": banking77_labels,
                "entities": parsed["entities"],
                "explanation": parsed["explanation"],
                "predicted_label": parsed["predicted_label"],
            }
        )

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to {output_json}")
    return results

# 5. Evaluate Accuracy
def evaluate_accuracy(
    results: List[Dict],
    mistake_file: str = "/home/xrspace/Llama/Thesis/E2i/ablition/fs_alloption_mistake_gpt.json",
):
    """Print an accuracy report and save the misclassified records.

    Args:
        results: Prediction records carrying string ``true_label`` and
            ``predicted_label`` values.
        mistake_file: Destination for the mismatching records. Defaults to
            the path this function previously hard-coded. It is only written
            when at least one mistake exists.

    Returns:
        The fraction of records whose labels match after whitespace
        stripping, or ``0`` for an empty *results* list.
    """
    def _is_hit(record: Dict) -> bool:
        # Exact match after trimming surrounding whitespace on both sides.
        return record['true_label'].strip() == record['predicted_label'].strip()

    total = len(results)
    hits = [_is_hit(r) for r in results]
    correct = sum(hits)
    mistakes = [r for r, hit in zip(results, hits) if not hit]
    accuracy = correct / total if total else 0

    print("\n📊 Accuracy Report")
    print(f"Total Samples: {total}")
    print(f"Correct Predictions: {correct}")
    print(f"Accuracy: {accuracy*100:.2f}%")

    if mistakes:
        with open(mistake_file, "w", encoding="utf-8") as f:
            json.dump(mistakes, f, indent=2, ensure_ascii=False)
        # Report the path actually written (the old message ended in ".json.json").
        print(f"\n🔍 {len(mistakes)} mistakes saved to {mistake_file}")
    return accuracy

# 6. Entry Point
if __name__ == "__main__":
    # NOTE(review): an identical entry point already appears earlier in this
    # same cell, so executing the cell runs the whole pipeline (one model call
    # per CSV row) twice and overwrites the same output files — looks like an
    # accidental paste; confirm and remove one copy.
    input_csv = "/home/xrspace/Llama/Thesis/E2i/ablition/maccot_b77_no_topk.csv"
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fs_alloption_prediction_gpt.json"
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results)

In [None]:
zs

In [7]:
import pandas as pd
import openai
import json
import re
from tqdm import tqdm
from typing import List, Dict

# Set OpenAI API key
# NOTE(review): "key" looks like a redacted placeholder — presumably a real
# key has to be supplied before running; avoid committing it to the notebook.
openai.api_key = "key"

# Full Banking77 label set
banking77_labels = [
    "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up",
    "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed", "cancel_transfer", "card_acceptance", "card_arrival", "card_delivery_estimate",
    "card_linking", "card_not_working", "card_payment_fee_charged", "card_payment_not_recognised",
    "card_payment_wrong_exchange_rate", "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised",
    "change_pin", "chargeback", "contactless_not_working", "country_support", "declined_card_payment",
    "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised",
    "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate", "exchange_via_app",
    "extra_charge_on_statement", "failed_transfer", "fiat_currency_support", "get_disposable_virtual_card",
    "getting_virtual_card", "getting_spare_card", "getting_new_card", "how_long_to_receive_card",
    "how_long_to_receive_pin", "atm_fees", "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card",
    "passcode_forgotten", "passcode_not_working", "pending_card_payment", "pending_cash_withdrawal",
    "pending_top_up", "pending_transfer", "pin_blocked", "receiving_money", "request_refund", "reverted_card_payment?",
    "supported_cards_and_currencies", "terminate_account", "top_up_by_bank_transfer_charge",
    "top_up_by_card_charge", "top_up_by_cash_or_cheque", "top_up_failed", "top_up_limits", "top_up_reverted",
    "topping_up_by_card", "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account",
    "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity", "verify_my_identity",
    "verify_source_of_funds", "virtual_card_not_working", "visa_or_mastercard", "why_verify_identity",
    "wrong_amount_of_cash_received", "wrong_exchange_rate_for_cash_withdrawal"
]

# 1. Build Zero-Shot prompt
def build_prompt(user_input: str) -> str:
    """Build the zero-shot prompt for *user_input*.

    The sentence is quoted, every label of ``banking77_labels`` is listed
    comma-separated as the options, and the prompt ends with an open
    ``CHOICE:`` slot. No demonstrations are included.
    """
    options_block = ", ".join(banking77_labels)
    prompt = f"""Given the sentence: "{user_input}"
Please select the most possible topic from
the following OPTIONS: {options_block}
CHOICE:"""
    return prompt

# 2. Query GPT model
def query_model(prompt: str) -> str:
    """Submit *prompt* to gpt-3.5-turbo and return the first choice's message content.

    Runs at temperature 0 with at most 100 completion tokens.
    """
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant for intent classification."},
        {"role": "user", "content": prompt},
    ]
    answer = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=100,
    )
    return answer.choices[0].message.content

# 3. Parse response
def parse_response(response: str) -> Dict[str, str]:
    """Use the first line of the trimmed reply as the predicted label.

    Zero-shot replies carry no roles or reasoning, so ``entities`` and
    ``explanation`` are set to ``"N/A"``.
    """
    body = response.strip()
    cut = body.find("\n")
    # No newline means the whole reply is the label.
    label = body if cut == -1 else body[:cut]
    return {"entities": "N/A", "explanation": "N/A", "predicted_label": label}

# 4. Run classification
def run_ner_cot(input_csv: str, output_json: str):
    """Run zero-shot classification over all rows of *input_csv*.

    The CSV must provide ``id``, ``text`` and ``label`` columns. Each row's
    text is prompted, sent to the model and parsed. The records are written
    to *output_json* as indented JSON and returned.
    """
    df = pd.read_csv(input_csv)
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        text, true_label = row['text'], row['label']
        parsed = parse_response(query_model(build_prompt(text)))
        # Row metadata first, then the fields produced by the parser.
        result = {
            "id": int(row['id']),
            "text": text,
            "true_label": true_label,
            "top_k_predictions": banking77_labels,
            **{key: parsed[key] for key in ("entities", "explanation", "predicted_label")},
        }
        results.append(result)

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to {output_json}")
    return results

# 5. Evaluate Accuracy
def evaluate_accuracy(results: List[Dict]):
    """Print an accuracy report and save the misclassified records.

    A record counts as correct when its ``true_label`` and
    ``predicted_label`` are equal after whitespace stripping. Mismatching
    records, if any, are written to the zero-shot mistake file.

    Returns:
        ``correct / total``, or ``0`` when *results* is empty.
    """
    mistakes = []
    correct = 0
    for r in results:
        if r['true_label'].strip() == r['predicted_label'].strip():
            correct += 1
        else:
            mistakes.append(r)

    total = len(results)
    accuracy = correct / total if total else 0

    print("\n📊 Accuracy Report")
    print(f"Total Samples: {total}")
    print(f"Correct Predictions: {correct}")
    print(f"Accuracy: {accuracy*100:.2f}%")

    # Nothing is written when every prediction is correct.
    if mistakes:
        with open("/home/xrspace/Llama/Thesis/E2i/ablition/zs_alloption_mistake_gpt.json", "w", encoding="utf-8") as f:
            json.dump(mistakes, f, indent=2, ensure_ascii=False)
        print(f"\n🔍 {len(mistakes)} mistakes saved to zs_alloption_mistake_gpt.json")
    return accuracy

# 6. Entry Point
if __name__ == "__main__":
    # The input CSV must provide the "id", "text" and "label" columns that run_ner_cot reads.
    input_csv = "/home/xrspace/Llama/Thesis/E2i/ablition/maccot_b77_no_topk.csv"
    # All predictions are written here; evaluate_accuracy saves the mistakes to its own path.
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/zs_alloption_prediction_gpt.json"
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results)

100%|██████████| 308/308 [03:07<00:00,  1.64it/s]


✅ Predictions saved to /home/xrspace/Llama/Thesis/E2i/ablition/zs_alloption_prediction_gpt.json

📊 Accuracy Report
Total Samples: 308
Correct Predictions: 79
Accuracy: 25.65%

🔍 229 mistakes saved to zs_alloption_mistake_gpt.json





clinc
full option e2i

In [12]:
import pandas as pd
import openai
import json
import re
import random
from tqdm import tqdm
from typing import List, Dict
from collections import defaultdict

# Load few-shot support file
# The loaded JSON is iterated below as a sequence of entries that each carry a "true_label" key.
with open("/home/xrspace/Llama/Thesis/reduce/clinc/fs_clinc_samples_gpt4.json", "r", encoding="utf-8") as f:
    FEWSHOT_DB = json.load(f)

# Organize few-shot examples by label
# label -> list of entries; build_prompt samples its few-shot examples from these lists.
label_to_example = defaultdict(list)
for entry in FEWSHOT_DB:
    label_to_example[entry["true_label"]].append(entry)

# Set OpenAI API key
# NOTE(review): "key" looks like a redacted placeholder; avoid hard-coding a real secret here.
openai.api_key = "key"

# Define full CLINC150 label set
clinc150_labels = [
    "restaurant_reviews", "nutrition_info", "account_blocked", "oil_change_how", "time",
    "weather", "redeem_rewards", "interest_rate", "gas_type", "accept_reservations",
    "smart_home", "user_name", "report_lost_card", "repeat", "give_weather",
    "credit_limit_change", "change_language", "update_playlist", "sync_device", "schedule_maintenance",
    "what_are_your_hobbies", "book_hotel", "cancel_reservation", "change_accent", "min_payment",
    "pay_bill", "international_visa", "calendar", "exchange_rate", "flip_coin",
    "do_you_have_pets", "balance", "tell_joke", "last_maintenance", "income",
    "vaccines", "reminder_update", "order", "jump_start", "recipe",
    "meal_suggestion", "restaurant_reservation", "reset_password", "change_password", "spending_history",
    "cancel", "user_rating", "where_are_you_from", "are_you_a_bot", "make_reservation",
    "expiration_date", "routing", "insurance_change", "what_is_your_name", "thank_you",
    "shopping_list", "user_feedback", "change_speed", "plug_type", "travel_alert",
    "traffic", "travel_notification", "order_checks", "bill_balance", "improve_credit_score",
    "report_fraud", "spending_limit", "directions", "credit_score", "bill_due",
    "who_made_you", "application_status", "apr", "how_old_are_you", "are_you_real",
    "direct_deposit", "transactions", "transfer", "card_declined", "interest_rate_change",
    "report_lost_phone", "change_user_name", "current_location", "pto_request", "next_song",
    "change_volume", "travel_suggestion", "no", "maybe", "yes",
    "order_status", "confirm_reservation", "cook_time", "calendar_update", "nutrition_plan",
    "pto_request_status", "how_busy", "cancel_subscription", "exchange_currency", "payday",
    "schedule_meeting", "calories", "report_charge", "car_rental", "gas",
    "how_much", "play_music", "book_flight", "weather_umbrella", "book_holiday",
    "uber", "change_organization", "meeting_schedule", "whisper_mode", "what_song",
    "meaning_of_life", "todo_list", "card_arrival", "next_holiday", "change_name",
    "international_fees", "calendar_remove", "taxes", "vaccination_requirements", "passport_renewal",
    "timezone", "reminder", "how_many", "application_deadline", "freeze_account",
    "what_can_i_ask_you", "give_recipe", "start_plan", "goodbye", "what_is_your_gender",
    "how_do_i_make_a_payment", "book_appointment", "damaged_card", "reset_settings", "update_settings",
    "pto_balance", "cancel_appointment", "order_cancel", "cut_off", "report_outage",
    "reward_balance", "update_profile", "add_remove_device", "fun_fact", "oil_change_when",
    "travel_documents", "replacement_card_duration", "new_card", "roll_dice", "who_do_you_work_for",
    "shopping_list_update", "next_event", "change_profile_picture", "redeem_gift_card", "current_weather",
    "flight_status", "new_card_limit", "good_morning", "new_card_expires", "how_long",
    "where_is_my_refund", "definition", "bill_due_date", "who_are_you", "tell_me_about_yourself"
]

# Build prompt using dynamic few-shot samples (capped at 40 total examples)
def build_prompt(user_input: str, topn_labels: List[str]) -> str:
    """Assemble the role-extraction / reasoning prompt for a single query.

    Args:
        user_input: The raw user query to classify.
        topn_labels: Candidate labels. They are listed as options (with
            underscores shown as spaces) and used, in order, to look up
            few-shot examples in ``label_to_example``.

    Returns:
        The complete prompt text, ending with the JSON answer template.
    """
    shot_cap = 40   # ceiling on the total number of few-shot examples
    per_label = 3   # at most this many randomly sampled examples per label
    shot_template = 'Example:\nQuery: "{text}"\nRoles: {roles}\nReasoning: {why}\nLabel: {label}\n---'

    option_lines = "\n".join("- " + name.replace("_", " ") for name in topn_labels)

    shots = []
    for name in topn_labels:
        if len(shots) >= shot_cap:
            break
        if name not in label_to_example:
            continue
        pool = label_to_example[name]
        room = shot_cap - len(shots)
        # Sample first, then truncate, so the cap is never exceeded.
        for demo in random.sample(pool, min(per_label, len(pool)))[:room]:
            shots.append(
                shot_template.format(
                    text=demo['text'],
                    roles=demo['entities'],
                    why=demo['explanation'],
                    label=name,
                )
            )

    demo_block = "\n".join(shots)
    return f"""You are an intelligent assistant for intent classification in banking queries.
{demo_block}
Instructions:
1. Extract the semantic elements from the user query as structured roles:
   - action (what is being done)
   - object (what it’s done to)
   - target (who or what is affected)
   - temporal (time-related references, if any)
   - intent_clue (phrases showing intent)
2. Use the extracted roles to reason about how the query aligns with each of the options below.
3. For each option, explain why it does or does not match the extracted semantics.
4. Finally, choose the most appropriate intent label from the given options.
5. Return final decision in the original snake_case label.
OPTIONS:
{option_lines}
---
QUERY: {user_input}
Respond in the following format:
```json
{{
  "roles": {{
    "action": "...",
    "object": "...",
    "target": "...",
    "temporal": "...",
    "intent_clue": "..."
  }},
  "reasoning": "...",
  "label": "..."
}}
```"""

# Query GPT model
def query_model(prompt: str) -> str:
    """Send ``prompt`` to ``gpt-3.5-turbo`` and return the reply text.

    The request uses a fixed system message, ``temperature=0`` and at most
    600 completion tokens.

    Returns:
        The content of the first choice. The API may report the content as
        ``None``; that is normalised to ``""`` so the declared ``str``
        return type always holds for the parsing step.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant specializing in intent classification."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=600
    )
    return response.choices[0].message.content or ""

# Parse structured output
def parse_response(response: str) -> Dict[str, str]:
    """Extract roles, reasoning and label from the model's JSON reply.

    The first decodable JSON object in ``response`` is used, so Markdown
    fences and any text before or after the object are ignored (the previous
    greedy regex failed whenever a ``}`` appeared after the JSON).

    Args:
        response: Raw model output.

    Returns:
        A dict with ``entities`` (the roles re-serialised as a JSON string),
        ``explanation`` and ``predicted_label``. ``predicted_label`` is always
        a ``str``; it is ``"Unknown"`` when the label is missing or null.
        When no JSON object can be decoded, the fallback
        ``{"entities": "N/A", "explanation": "N/A", "predicted_label": "Unknown"}``
        is returned and the problem is printed.
    """
    fallback = {"entities": "N/A", "explanation": "N/A", "predicted_label": "Unknown"}
    if not isinstance(response, str):
        print("Parsing error:", f"expected a str response, got {type(response).__name__}")
        return fallback

    decoder = json.JSONDecoder()
    parsed_json = None
    start = response.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the object, tolerating trailing text.
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            parsed_json = candidate
            break
        start = response.find("{", start + 1)

    if parsed_json is None:
        print("Parsing error:", "no JSON object found in response")
        return fallback

    roles = parsed_json.get("roles", {})
    explanation = parsed_json.get("reasoning", "N/A")
    label = parsed_json.get("label")
    # Keep the label a string so downstream .strip() comparisons cannot crash.
    predicted_label = "Unknown" if label is None else str(label)
    return {
        "entities": json.dumps(roles, ensure_ascii=False),
        "explanation": explanation,
        "predicted_label": predicted_label
    }

# Main evaluation pipeline
def run_ner_cot(input_csv: str, output_json: str):
    """Classify every row of ``input_csv`` and dump the predictions as JSON.

    Each row must provide ``id``, ``text`` and ``label`` columns. Every query
    is prompted with the full ``clinc150_labels`` option list.

    Returns:
        The list of per-row result dicts that was written to ``output_json``.
    """
    def _predict(sample):
        # One prompt -> model call -> parse round trip for a single CSV row.
        query = sample["text"]
        gold = sample["label"]
        parsed = parse_response(query_model(build_prompt(query, clinc150_labels)))
        return {
            "id": int(sample["id"]),
            "text": query,
            "true_label": gold,
            "top_k_predictions": clinc150_labels,
            "entities": parsed["entities"],
            "explanation": parsed["explanation"],
            "predicted_label": parsed["predicted_label"],
        }

    frame = pd.read_csv(input_csv)
    results = [_predict(sample) for _, sample in tqdm(frame.iterrows(), total=len(frame))]

    with open(output_json, "w", encoding="utf-8") as handle:
        json.dump(results, handle, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to: {output_json}")
    return results

# Accuracy check
def evaluate_accuracy(results: List[Dict], mistake_file: str = "fulloption_mistake_clinc_gpt.json"):
    """Report exact-match accuracy and save the misclassified samples.

    Labels are compared after stripping surrounding whitespace. Wrong samples,
    if any, are written as JSON to ``mistake_file``.

    Returns:
        The fraction of correct samples (``0`` when ``results`` is empty).
    """
    hits = 0
    wrong = []
    for record in results:
        if record["true_label"].strip() == record["predicted_label"].strip():
            hits += 1
        else:
            wrong.append(record)

    n_samples = len(results)
    accuracy = 0 if not n_samples else hits / n_samples

    print(f"\n🔍 Accuracy: {accuracy*100:.2f}%")
    print(f"Mistakes: {len(wrong)} / {n_samples}")

    if not wrong:
        print("🎉 No mistakes! Perfect predictions.")
        return accuracy

    with open(mistake_file, "w", encoding="utf-8") as handle:
        json.dump(wrong, handle, indent=2, ensure_ascii=False)
    print(f"Mistakes saved to: {mistake_file}")
    return accuracy

# Entry point
# Runs the pipeline only when executed directly (script or notebook cell), not on import.
if __name__ == "__main__":
    input_csv = "/home/xrspace/Llama/Thesis/E2i/ablition/maccot_formatted_clinc_no_topk.csv"
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fulloption_predict_clinc_gpt.json"
    mistake_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fulloption_mistake_clinc_gpt.json"
    # Predictions are written to output_json; misclassified samples go to mistake_json.
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results, mistake_file=mistake_json)

100%|██████████| 225/225 [06:42<00:00,  1.79s/it]


✅ Predictions saved to: /home/xrspace/Llama/Thesis/E2i/ablition/fulloption_predict_clinc_gpt.json

🔍 Accuracy: 53.78%
Mistakes: 104 / 225
Mistakes saved to: /home/xrspace/Llama/Thesis/E2i/ablition/fulloption_mistake_clinc_gpt.json





clincfsfull option

In [14]:
import pandas as pd
import openai
import json
import re
from tqdm import tqdm
from typing import List, Dict
from collections import defaultdict

# Set OpenAI API key
# NOTE(review): "key" looks like a redacted placeholder; avoid hard-coding a real secret here.
openai.api_key = "key"

# Load few-shot examples
# The loaded JSON is iterated below as a sequence of entries that each carry a "true_label" key.
with open("/home/xrspace/Llama/Thesis/reduce/clinc/fs_clinc_samples_gpt4.json", "r", encoding="utf-8") as f:
    FEWSHOT_DB = json.load(f)

# label -> list of entries; build_prompt takes the first entry of a label as its demonstration.
label_to_example = defaultdict(list)
for entry in FEWSHOT_DB:
    label_to_example[entry["true_label"]].append(entry)

# Full CLINC150 label set
clinc150_labels = [
"restaurant_reviews", "nutrition_info", "account_blocked", "oil_change_how", "time",
    "weather", "redeem_rewards", "interest_rate", "gas_type", "accept_reservations",
    "smart_home", "user_name", "report_lost_card", "repeat", "give_weather",
    "credit_limit_change", "change_language", "update_playlist", "sync_device", "schedule_maintenance",
    "what_are_your_hobbies", "book_hotel", "cancel_reservation", "change_accent", "min_payment",
    "pay_bill", "international_visa", "calendar", "exchange_rate", "flip_coin",
    "do_you_have_pets", "balance", "tell_joke", "last_maintenance", "income",
    "vaccines", "reminder_update", "order", "jump_start", "recipe",
    "meal_suggestion", "restaurant_reservation", "reset_password", "change_password", "spending_history",
    "cancel", "user_rating", "where_are_you_from", "are_you_a_bot", "make_reservation",
    "expiration_date", "routing", "insurance_change", "what_is_your_name", "thank_you",
    "shopping_list", "user_feedback", "change_speed", "plug_type", "travel_alert",
    "traffic", "travel_notification", "order_checks", "bill_balance", "improve_credit_score",
    "report_fraud", "spending_limit", "directions", "credit_score", "bill_due",
    "who_made_you", "application_status", "apr", "how_old_are_you", "are_you_real",
    "direct_deposit", "transactions", "transfer", "card_declined", "interest_rate_change",
    "report_lost_phone", "change_user_name", "current_location", "pto_request", "next_song",
    "change_volume", "travel_suggestion", "no", "maybe", "yes",
    "order_status", "confirm_reservation", "cook_time", "calendar_update", "nutrition_plan",
    "pto_request_status", "how_busy", "cancel_subscription", "exchange_currency", "payday",
    "schedule_meeting", "calories", "report_charge", "car_rental", "gas",
    "how_much", "play_music", "book_flight", "weather_umbrella", "book_holiday",
    "uber", "change_organization", "meeting_schedule", "whisper_mode", "what_song",
    "meaning_of_life", "todo_list", "card_arrival", "next_holiday", "change_name",
    "international_fees", "calendar_remove", "taxes", "vaccination_requirements", "passport_renewal",
    "timezone", "reminder", "how_many", "application_deadline", "freeze_account",
    "what_can_i_ask_you", "give_recipe", "start_plan", "goodbye", "what_is_your_gender",
    "how_do_i_make_a_payment", "book_appointment", "damaged_card", "reset_settings", "update_settings",
    "pto_balance", "cancel_appointment", "order_cancel", "cut_off", "report_outage",
    "reward_balance", "update_profile", "add_remove_device", "fun_fact", "oil_change_when",
    "travel_documents", "replacement_card_duration", "new_card", "roll_dice", "who_do_you_work_for",
    "shopping_list_update", "next_event", "change_profile_picture", "redeem_gift_card", "current_weather",
    "flight_status", "new_card_limit", "good_morning", "new_card_expires", "how_long",
    "where_is_my_refund", "definition", "bill_due_date", "who_are_you", "tell_me_about_yourself"
]

# 1. Build the few-shot prompt over the full label list
def build_prompt(user_input: str) -> str:
    """Build a two-demonstration classification prompt for ``user_input``.

    The demonstrations are the first stored example of each of the first two
    labels in ``clinc150_labels`` that have entries in ``label_to_example``.
    Every label in ``clinc150_labels`` is offered as an option.
    """
    used = set()
    demos = []
    for name in clinc150_labels:
        if len(demos) == 2:
            break
        if name in used or name not in label_to_example:
            continue
        first = label_to_example[name][0]
        demos.append(f"SENTENCE: {first['text']}\nLABEL: {name}")
        used.add(name)

    pieces = [
        "Below is a text classification problem.",
        "Note that you can only select the label in",
        "{" + ", ".join(clinc150_labels) + "}",
        "",
        "\n\n".join(demos),
        "",
        f"SENTENCE: {user_input}",
        "LABEL:",
    ]
    return "\n".join(pieces)

# 2. Query GPT model
def query_model(prompt: str) -> str:
    """Send ``prompt`` to ``gpt-3.5-turbo`` and return the reply text.

    The request uses a fixed system message, ``temperature=0`` and at most
    150 completion tokens.

    Returns:
        The content of the first choice. The API may report the content as
        ``None``; that is normalised to ``""`` so callers that immediately
        call ``str`` methods on the result do not crash.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant for intent classification."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=150
    )
    return response.choices[0].message.content or ""

# 3. Parse response
def parse_response(response: str) -> Dict[str, str]:
    """Take the first line of the stripped reply as the predicted label.

    ``entities`` and ``explanation`` are fixed to ``"N/A"`` because this
    prompt asks for a bare label only.
    """
    first_line, _, _ = response.strip().partition("\n")
    return dict(entities="N/A", explanation="N/A", predicted_label=first_line)

# 4. Run classification on every row of the input CSV
def run_ner_cot(input_csv: str, output_json: str):
    """Classify every row of ``input_csv`` and dump the predictions as JSON.

    Each row must provide ``id``, ``text`` and ``label`` columns.

    Returns:
        The list of per-row result dicts that was written to ``output_json``.
    """
    def _predict(sample):
        # One prompt -> model call -> parse round trip for a single CSV row.
        query = sample['text']
        gold = sample['label']
        parsed = parse_response(query_model(build_prompt(query)))
        return {
            "id": int(sample['id']),
            "text": query,
            "true_label": gold,
            "top_k_predictions": clinc150_labels,
            "entities": parsed["entities"],
            "explanation": parsed["explanation"],
            "predicted_label": parsed["predicted_label"],
        }

    frame = pd.read_csv(input_csv)
    results = [_predict(sample) for _, sample in tqdm(frame.iterrows(), total=len(frame))]

    with open(output_json, "w", encoding="utf-8") as handle:
        json.dump(results, handle, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to {output_json}")
    return results

# 5. Evaluate Accuracy
def evaluate_accuracy(
    results: List[Dict],
    mistake_file: str = "/home/xrspace/Llama/Thesis/E2i/ablition/fs_fulloption_mistake_clinc_gpt.json",
):
    """Print an accuracy report and save the misclassified samples.

    Args:
        results: Result dicts with string ``true_label`` and
            ``predicted_label`` entries; labels are compared after stripping
            surrounding whitespace.
        mistake_file: Destination for the JSON list of wrong samples. It is
            only written when there is at least one mistake.

    Returns:
        The fraction of correct samples (``0`` when ``results`` is empty).
    """
    total = len(results)
    mistakes = [r for r in results if r['true_label'].strip() != r['predicted_label'].strip()]
    correct = total - len(mistakes)
    accuracy = correct / total if total else 0
    print("\n📊 Accuracy Report")
    print(f"Total Samples: {total}")
    print(f"Correct Predictions: {correct}")
    print(f"Accuracy: {accuracy*100:.2f}%")
    if mistakes:
        with open(mistake_file, "w", encoding="utf-8") as f:
            json.dump(mistakes, f, indent=2, ensure_ascii=False)
        # Report the real destination; the old message named a file that was never written.
        print(f"\n🔍 {len(mistakes)} mistakes saved to {mistake_file}")
    return accuracy

# 6. Entry Point
# Runs the pipeline only when executed directly (script or notebook cell), not on import.
if __name__ == "__main__":
    input_csv = "/home/xrspace/Llama/Thesis/E2i/ablition/maccot_formatted_clinc_no_topk.csv"
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fs_fulloption_predict_clinc_gpt.json"
    # run_ner_cot writes the predictions to output_json and returns them for scoring.
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results)

100%|██████████| 225/225 [02:01<00:00,  1.85it/s]


✅ Predictions saved to /home/xrspace/Llama/Thesis/E2i/ablition/fs_fulloption_predict_clinc_gpt.json

📊 Accuracy Report
Total Samples: 225
Correct Predictions: 86
Accuracy: 38.22%

🔍 139 mistakes saved to fs_alloption_mistake_gpt.json.json





zs clinc

In [15]:
import pandas as pd
import openai
import json
import re
from tqdm import tqdm
from typing import List, Dict

# Set OpenAI API key
# NOTE(review): "key" looks like a redacted placeholder; avoid hard-coding a real secret here.
openai.api_key = "key"

# Full CLINC150 label set
clinc150_labels = [
    "restaurant_reviews", "nutrition_info", "account_blocked", "oil_change_how", "time",
    "weather", "redeem_rewards", "interest_rate", "gas_type", "accept_reservations",
    "smart_home", "user_name", "report_lost_card", "repeat", "give_weather",
    "credit_limit_change", "change_language", "update_playlist", "sync_device", "schedule_maintenance",
    "what_are_your_hobbies", "book_hotel", "cancel_reservation", "change_accent", "min_payment",
    "pay_bill", "international_visa", "calendar", "exchange_rate", "flip_coin",
    "do_you_have_pets", "balance", "tell_joke", "last_maintenance", "income",
    "vaccines", "reminder_update", "order", "jump_start", "recipe",
    "meal_suggestion", "restaurant_reservation", "reset_password", "change_password", "spending_history",
    "cancel", "user_rating", "where_are_you_from", "are_you_a_bot", "make_reservation",
    "expiration_date", "routing", "insurance_change", "what_is_your_name", "thank_you",
    "shopping_list", "user_feedback", "change_speed", "plug_type", "travel_alert",
    "traffic", "travel_notification", "order_checks", "bill_balance", "improve_credit_score",
    "report_fraud", "spending_limit", "directions", "credit_score", "bill_due",
    "who_made_you", "application_status", "apr", "how_old_are_you", "are_you_real",
    "direct_deposit", "transactions", "transfer", "card_declined", "interest_rate_change",
    "report_lost_phone", "change_user_name", "current_location", "pto_request", "next_song",
    "change_volume", "travel_suggestion", "no", "maybe", "yes",
    "order_status", "confirm_reservation", "cook_time", "calendar_update", "nutrition_plan",
    "pto_request_status", "how_busy", "cancel_subscription", "exchange_currency", "payday",
    "schedule_meeting", "calories", "report_charge", "car_rental", "gas",
    "how_much", "play_music", "book_flight", "weather_umbrella", "book_holiday",
    "uber", "change_organization", "meeting_schedule", "whisper_mode", "what_song",
    "meaning_of_life", "todo_list", "card_arrival", "next_holiday", "change_name",
    "international_fees", "calendar_remove", "taxes", "vaccination_requirements", "passport_renewal",
    "timezone", "reminder", "how_many", "application_deadline", "freeze_account",
    "what_can_i_ask_you", "give_recipe", "start_plan", "goodbye", "what_is_your_gender",
    "how_do_i_make_a_payment", "book_appointment", "damaged_card", "reset_settings", "update_settings",
    "pto_balance", "cancel_appointment", "order_cancel", "cut_off", "report_outage",
    "reward_balance", "update_profile", "add_remove_device", "fun_fact", "oil_change_when",
    "travel_documents", "replacement_card_duration", "new_card", "roll_dice", "who_do_you_work_for",
    "shopping_list_update", "next_event", "change_profile_picture", "redeem_gift_card", "current_weather",
    "flight_status", "new_card_limit", "good_morning", "new_card_expires", "how_long",
    "where_is_my_refund", "definition", "bill_due_date", "who_are_you", "tell_me_about_yourself"
]

# 1. Build Zero-Shot prompt
def build_prompt(user_input: str) -> str:
    """Build the zero-shot prompt that offers every label in ``clinc150_labels``."""
    all_options = ", ".join(clinc150_labels)
    pieces = (
        f'Given the sentence: "{user_input}"',
        "Please select the most possible topic from",
        f"the following OPTIONS: {all_options}",
        "CHOICE:",
    )
    return "\n".join(pieces)

# 2. Query GPT model
def query_model(prompt: str) -> str:
    """Send ``prompt`` to ``gpt-3.5-turbo`` and return the reply text.

    The request uses a fixed system message, ``temperature=0`` and at most
    100 completion tokens.

    Returns:
        The content of the first choice. The API may report the content as
        ``None``; that is normalised to ``""`` so callers that immediately
        call ``str`` methods on the result do not crash.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant for intent classification."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=100
    )
    return response.choices[0].message.content or ""

# 3. Parse response
def parse_response(response: str) -> Dict[str, str]:
    """Take the first line of the stripped reply as the predicted label.

    ``entities`` and ``explanation`` are fixed to ``"N/A"`` because the
    zero-shot prompt asks for a bare choice only.
    """
    first_line, _, _ = response.strip().partition("\n")
    return dict(entities="N/A", explanation="N/A", predicted_label=first_line)

# 4. Run classification
def run_ner_cot(input_csv: str, output_json: str):
    """Classify every row of ``input_csv`` and dump the predictions as JSON.

    Each row must provide ``id``, ``text`` and ``label`` columns. All rows are
    processed.

    Returns:
        The list of per-row result dicts that was written to ``output_json``.
    """
    def _predict(sample):
        # One prompt -> model call -> parse round trip for a single CSV row.
        query = sample['text']
        gold = sample['label']
        parsed = parse_response(query_model(build_prompt(query)))
        return {
            "id": int(sample['id']),
            "text": query,
            "true_label": gold,
            "top_k_predictions": clinc150_labels,
            "entities": parsed["entities"],
            "explanation": parsed["explanation"],
            "predicted_label": parsed["predicted_label"],
        }

    frame = pd.read_csv(input_csv)
    results = [_predict(sample) for _, sample in tqdm(frame.iterrows(), total=len(frame))]

    with open(output_json, "w", encoding="utf-8") as handle:
        json.dump(results, handle, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to {output_json}")
    return results

# 5. Evaluate Accuracy
def evaluate_accuracy(
    results: List[Dict],
    mistake_file: str = "/home/xrspace/Llama/Thesis/E2i/ablition/zs_fulloption_mistake_clinc_gpt.json",
):
    """Print an accuracy report and save the misclassified samples.

    Args:
        results: Result dicts with string ``true_label`` and
            ``predicted_label`` entries; labels are compared after stripping
            surrounding whitespace.
        mistake_file: Destination for the JSON list of wrong samples. It is
            only written when there is at least one mistake. The default is
            specific to this CLINC zero-shot run; the previous hard-coded
            ``zs_alloption_mistake_gpt.json`` is also the Banking77 zero-shot
            mistakes file, so this run overwrote it.

    Returns:
        The fraction of correct samples (``0`` when ``results`` is empty).
    """
    total = len(results)
    mistakes = [r for r in results if r['true_label'].strip() != r['predicted_label'].strip()]
    correct = total - len(mistakes)
    accuracy = correct / total if total else 0
    print("\n📊 Accuracy Report")
    print(f"Total Samples: {total}")
    print(f"Correct Predictions: {correct}")
    print(f"Accuracy: {accuracy*100:.2f}%")
    if mistakes:
        with open(mistake_file, "w", encoding="utf-8") as f:
            json.dump(mistakes, f, indent=2, ensure_ascii=False)
        print(f"\n🔍 {len(mistakes)} mistakes saved to {mistake_file}")
    return accuracy

# 6. Entry Point
# Runs the pipeline only when executed directly (script or notebook cell), not on import.
if __name__ == "__main__":
    input_csv = "/home/xrspace/Llama/Thesis/E2i/ablition/maccot_formatted_clinc_no_topk.csv"
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/zs_fulloption_predict_clinc_gpt.json"
    # run_ner_cot writes the predictions to output_json and returns them for scoring.
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results)

100%|██████████| 225/225 [02:47<00:00,  1.34it/s]


✅ Predictions saved to /home/xrspace/Llama/Thesis/E2i/ablition/zs_fulloption_predict_clinc_gpt.json

📊 Accuracy Report
Total Samples: 225
Correct Predictions: 54
Accuracy: 24.00%

🔍 171 mistakes saved to zs_alloption_mistake_gpt.json





liu54

In [18]:
import pandas as pd
import openai
import json
import re
import random
from tqdm import tqdm
from typing import List, Dict
from collections import defaultdict

# Load few-shot support file
# The loaded JSON is iterated below as a sequence of entries that each carry a "true_label" key.
with open("/home/xrspace/Llama/Thesis/reduce/liu54/fs_liu_samples_gpt4.json", "r", encoding="utf-8") as f:
    FEWSHOT_DB = json.load(f)

# Organize few-shot examples by label
# label -> list of entries; build_prompt samples its few-shot examples from these lists.
label_to_example = defaultdict(list)
for entry in FEWSHOT_DB:
    label_to_example[entry["true_label"]].append(entry)

# Set OpenAI API key
# NOTE(review): "key" looks like a redacted placeholder; avoid hard-coding a real secret here.
openai.api_key = "key"

# Define the Liu54 label set (NOTE(review): the list below has 52 entries; confirm it matches the labels in the input CSV)
liu_54_labels = [
    "alarm_set", "alarm_query", "alarm_remove", "reminder_set", "reminder_query", "reminder_remove",
    "timer_set", "timer_query", "timer_remove",
    "calendar_set", "calendar_query", "calendar_remove",
    "weather_query",
    "music_play", "music_query", "music_pause", "music_resume", "music_next", "music_previous", "music_repeat", "music_shuffle", "music_volume_up", "music_volume_down",
    "audiobook_play", "audiobook_pause", "audiobook_resume", "audiobook_next", "audiobook_previous",
    "podcast_play", "podcast_pause", "podcast_resume", "podcast_next", "podcast_previous",
    "news_query",
    "traffic_query", "navigation_set", "navigation_cancel",
    "local_event_query", "local_place_query", "local_time_query",
    "date_time_query",
    "general_greeting", "general_ask", "general_thanks", "general_goodbye",
    "iot_cleaning", "iot_cooking", "iot_coffee", "iot_temperature_set", "iot_temperature_query", "iot_light_set", "iot_light_query"
]

# Build prompt using dynamic few-shot samples (capped at 40 total examples)
def build_prompt(user_input: str, topn_labels: List[str]) -> str:
    """Assemble the role-extraction / reasoning prompt for a single query.

    Args:
        user_input: The raw user query to classify.
        topn_labels: Candidate labels. They are listed as options (with
            underscores shown as spaces) and used, in order, to look up
            few-shot examples in ``label_to_example``.

    Returns:
        The complete prompt text, ending with the JSON answer template.
    """
    shot_cap = 40   # ceiling on the total number of few-shot examples
    per_label = 3   # at most this many randomly sampled examples per label
    shot_template = 'Example:\nQuery: "{text}"\nRoles: {roles}\nReasoning: {why}\nLabel: {label}\n---'

    option_lines = "\n".join("- " + name.replace("_", " ") for name in topn_labels)

    shots = []
    for name in topn_labels:
        if len(shots) >= shot_cap:
            break
        if name not in label_to_example:
            continue
        pool = label_to_example[name]
        room = shot_cap - len(shots)
        # Sample first, then truncate, so the cap is never exceeded.
        for demo in random.sample(pool, min(per_label, len(pool)))[:room]:
            shots.append(
                shot_template.format(
                    text=demo['text'],
                    roles=demo['entities'],
                    why=demo['explanation'],
                    label=name,
                )
            )

    demo_block = "\n".join(shots)
    return f"""You are an intelligent assistant for intent classification in banking queries.
{demo_block}
Instructions:
1. Extract the semantic elements from the user query as structured roles:
   - action (what is being done)
   - object (what it’s done to)
   - target (who or what is affected)
   - temporal (time-related references, if any)
   - intent_clue (phrases showing intent)
2. Use the extracted roles to reason about how the query aligns with each of the options below.
3. For each option, explain why it does or does not match the extracted semantics.
4. Finally, choose the most appropriate intent label from the given options.
5. Return final decision in the original snake_case label.
OPTIONS:
{option_lines}
---
QUERY: {user_input}
Respond in the following format:
```json
{{
  "roles": {{
    "action": "...",
    "object": "...",
    "target": "...",
    "temporal": "...",
    "intent_clue": "..."
  }},
  "reasoning": "...",
  "label": "..."
}}
```"""

# Query GPT model
def query_model(prompt: str) -> str:
    """Send ``prompt`` to ``gpt-3.5-turbo`` and return the reply text.

    The request uses a fixed system message, ``temperature=0`` and at most
    600 completion tokens.

    Returns:
        The content of the first choice. The API may report the content as
        ``None``; that is normalised to ``""`` so the declared ``str``
        return type always holds for the parsing step.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant specializing in intent classification."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=600
    )
    return response.choices[0].message.content or ""

# Parse structured output
def parse_response(response: str) -> Dict[str, str]:
    """Extract roles, reasoning and label from the model's JSON reply.

    The first decodable JSON object in ``response`` is used, so Markdown
    fences and any text before or after the object are ignored (the previous
    greedy regex failed whenever a ``}`` appeared after the JSON).

    Args:
        response: Raw model output.

    Returns:
        A dict with ``entities`` (the roles re-serialised as a JSON string),
        ``explanation`` and ``predicted_label``. ``predicted_label`` is always
        a ``str``; it is ``"Unknown"`` when the label is missing or null.
        When no JSON object can be decoded, the fallback
        ``{"entities": "N/A", "explanation": "N/A", "predicted_label": "Unknown"}``
        is returned and the problem is printed.
    """
    fallback = {"entities": "N/A", "explanation": "N/A", "predicted_label": "Unknown"}
    if not isinstance(response, str):
        print("Parsing error:", f"expected a str response, got {type(response).__name__}")
        return fallback

    decoder = json.JSONDecoder()
    parsed_json = None
    start = response.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the object, tolerating trailing text.
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            parsed_json = candidate
            break
        start = response.find("{", start + 1)

    if parsed_json is None:
        print("Parsing error:", "no JSON object found in response")
        return fallback

    roles = parsed_json.get("roles", {})
    explanation = parsed_json.get("reasoning", "N/A")
    label = parsed_json.get("label")
    # Keep the label a string so downstream .strip() comparisons cannot crash.
    predicted_label = "Unknown" if label is None else str(label)
    return {
        "entities": json.dumps(roles, ensure_ascii=False),
        "explanation": explanation,
        "predicted_label": predicted_label
    }

# Main evaluation pipeline
def run_ner_cot(input_csv: str, output_json: str):
    """Classify the first 60 rows of ``input_csv`` and dump the predictions as JSON.

    Each row must provide ``id``, ``text`` and ``label`` columns. Every query
    is prompted with the full ``liu_54_labels`` option list.

    Returns:
        The list of per-row result dicts that was written to ``output_json``.
    """
    def _predict(sample):
        # One prompt -> model call -> parse round trip for a single CSV row.
        query = sample["text"]
        gold = sample["label"]
        parsed = parse_response(query_model(build_prompt(query, liu_54_labels)))
        return {
            "id": int(sample["id"]),
            "text": query,
            "true_label": gold,
            "top_k_predictions": liu_54_labels,
            "entities": parsed["entities"],
            "explanation": parsed["explanation"],
            "predicted_label": parsed["predicted_label"],
        }

    frame = pd.read_csv(input_csv).iloc[:60]
    results = [_predict(sample) for _, sample in tqdm(frame.iterrows(), total=len(frame))]

    with open(output_json, "w", encoding="utf-8") as handle:
        json.dump(results, handle, indent=2, ensure_ascii=False)
    print(f"\n✅ Predictions saved to: {output_json}")
    return results

# Accuracy check
def evaluate_accuracy(results: List[Dict], mistake_file: str = "fulloption_mistake_liu_gpt.json"):
    """Report exact-match accuracy and save the misclassified samples.

    Labels are compared after stripping surrounding whitespace. Wrong samples,
    if any, are written as JSON to ``mistake_file``.

    Returns:
        The fraction of correct samples (``0`` when ``results`` is empty).
    """
    hits = 0
    wrong = []
    for record in results:
        if record["true_label"].strip() == record["predicted_label"].strip():
            hits += 1
        else:
            wrong.append(record)

    n_samples = len(results)
    accuracy = 0 if not n_samples else hits / n_samples

    print(f"\n🔍 Accuracy: {accuracy*100:.2f}%")
    print(f"Mistakes: {len(wrong)} / {n_samples}")

    if not wrong:
        print("🎉 No mistakes! Perfect predictions.")
        return accuracy

    with open(mistake_file, "w", encoding="utf-8") as handle:
        json.dump(wrong, handle, indent=2, ensure_ascii=False)
    print(f"Mistakes saved to: {mistake_file}")
    return accuracy

# Entry point
# Runs the pipeline only when executed directly (script or notebook cell), not on import.
if __name__ == "__main__":
    input_csv = "/home/xrspace/Llama/Thesis/E2i/ablition/maccot_formatted_liu_no_topk.csv"
    output_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fulloption_predict_liu_gpt.json"
    mistake_json = "/home/xrspace/Llama/Thesis/E2i/ablition/fulloption_mistake_liu_gpt.json"
    # Predictions are written to output_json; misclassified samples go to mistake_json.
    results = run_ner_cot(input_csv, output_json)
    evaluate_accuracy(results, mistake_file=mistake_json)

100%|██████████| 60/60 [01:31<00:00,  1.53s/it]


✅ Predictions saved to: /home/xrspace/Llama/Thesis/E2i/ablition/fulloption_predict_liu_gpt.json

🔍 Accuracy: 0.00%
Mistakes: 60 / 60
Mistakes saved to: /home/xrspace/Llama/Thesis/E2i/ablition/fulloption_mistake_liu_gpt.json



