# LLM-as-a-Judge (Qwen2.5-14B-Instruct)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

import torch
import requests  # Assuming API access via HTTP
import os
from openai import OpenAI

### Load data

In [10]:
# Read the llama_N32 result file into the DataFrame used by the cells below.
result_df = pd.read_json(path_or_buf='result/llama_N32.json')

In [None]:
def judge_response(text, actual_type, reasoning, model="qwen2.5-14b-instruct"):
    """Ask the judge model to grade one MBTI prediction response.

    The prompt requests three 1-5 scores (trait consistency, evidence
    quality, logical flow), a 0/1 type-accuracy flag and free-text feedback
    in a fixed ``SCORES:`` / ``FEEDBACK:`` layout. The reply is returned
    as-is; no parsing is done here.

    Args:
        text: The source text the MBTI prediction was made from.
        actual_type: The ground-truth MBTI type, interpolated into the prompt.
        reasoning: The prediction response/reasoning to be judged.
        model: Model identifier sent to the chat-completions endpoint.
            Model list:
            https://www.alibabacloud.com/help/en/model-studio/getting-started/models

    Returns:
        The message content of the first completion choice.

    Raises:
        RuntimeError: If the ``DASHSCOPE_API_KEY`` environment variable is
            unset or empty.
    """
    # Credentials come from the environment instead of being hard-coded in
    # the notebook; fail early with an actionable message when missing.
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "DASHSCOPE_API_KEY is not set; export your API key in the "
            "environment before calling judge_response."
        )

    # NOTE: the prompt is sent exactly as written, including the leading
    # indentation of each line.
    prompt = f"""
    Please evaluate this MBTI prediction response by providing:

    1. Scores (1-5 scale) for:
        - Trait Consistency: /5 (alignment with MBTI theory)
        - Evidence Quality: /5 (psychological validity of examples)
        - Logical Flow: /5 (coherence of reasoning)
   
    2. Type Accuracy: 1 if correct, 0 if incorrect

    3. Detailed feedback explaining your assessment

    === PLEASE USE THIS EXACT FORMAT ===
    SCORES:
    Trait Consistency: X/5
    Evidence Quality: X/5
    Logical Flow: X/5
    Type Accuracy: X

    FEEDBACK:
    [Your detailed feedback here]

    Text: {text}
    Actual type: {actual_type}
    Reasoning: {reasoning}

    Response: 
    """

    # OpenAI-compatible client pointed at the DashScope (international) endpoint.
    client = OpenAI(
        api_key=api_key,
        base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
    )

    # Single user turn; no system message is sent.
    completion = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )

    return completion.choices[0].message.content


In [5]:
def judge_function(df_name, label="Deepseek N0",
                   output_path="judge_Deepseek_N0.json"):
    """Judge every row of a results DataFrame and save the verdicts as JSON.

    Each row is passed to ``judge_response``; the inputs and the judge's raw
    reply are collected and written to ``output_path`` as a JSON list.

    Args:
        df_name: pandas DataFrame (despite the parameter name) providing the
            'actual_type' and 'response' columns plus the source text in a
            'text' column (or the legacy 'text:' column).
        label: Human-readable run name used in the start-up message.
        output_path: Destination JSON file for the judgements.

    Returns:
        None. Results are written to ``output_path``.
    """
    print(f'Judging {label}')

    # Prefer the 'text' column used elsewhere in this notebook; fall back to
    # the 'text:' spelling this function originally read so older result
    # files keep working.
    text_column = 'text' if 'text' in df_name.columns else 'text:'

    results = []
    total = len(df_name)
    # Count progress with enumerate rather than `idx + 1`, which only works
    # when the index labels are the integers 0..n-1.
    for position, (_, row) in enumerate(df_name.iterrows(), start=1):
        text = row[text_column]
        actual_type = row['actual_type']
        response_msg = row['response']

        response = judge_response(text, actual_type, response_msg)

        results.append({
            "text": text,
            "mbti_actual": actual_type,
            "reasoning": response_msg,
            "judgment": response
        })

        print(f"Processed row {position}/{total}")

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=4)

    print(f"Judgement result is saved to '{output_path}'.")

In [None]:
# Judge every row of result_df with judge_response and save the verdicts
# to 'judge_llama_N32.json'.
print('Judging llama_N32')

results = []
total_rows = len(result_df)
# Progress is counted with enumerate instead of `idx + 1`, so it stays
# correct even if the DataFrame index is not the integers 0..n-1.
for position, (_, row) in enumerate(result_df.iterrows(), start=1):
    text = row['text']
    actual_type = row['actual_type']
    response_msg = row['response']

    # Raw judge reply; it is stored unparsed.
    response = judge_response(text, actual_type, response_msg)

    results.append({
        "text": text,
        "mbti_actual": actual_type,
        "reasoning": response_msg,
        "judgment": response
    })

    print(f"Processed row {position}/{total_rows}")


with open('judge_llama_N32.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Judgement result is saved to 'judge_llama_N32.json'.")