In [1]:
import os
import json
import re
from collections import Counter
import pandas as pd

def extract_score(text):
    """Return the last "overall quality" rating found in a judge response.

    Scans ``text`` for every occurrence of ``"overall quality":`` followed by
    optional whitespace and a single digit in the range 1-5.

    Args:
        text: The raw judge response string to search.

    Returns:
        The digit of the final occurrence as an ``int``, or ``None`` when the
        pattern does not appear in ``text``.
    """
    last_digit = None
    # Walk through all matches and keep overwriting, so the final one wins.
    for hit in re.finditer(r'\"overall quality\":\s*([1-5])', text):
        last_digit = hit.group(1)
    return None if last_digit is None else int(last_digit)

def majority_vote(scores):
    """Return the most frequent value in ``scores``.

    Args:
        scores: A collection of hashable votes (here, integer ratings).

    Returns:
        The value with the highest count as reported by
        ``Counter.most_common(1)``, or ``None`` when ``scores`` is empty.
    """
    if scores:
        tally = Counter(scores)
        # most_common(1) yields a single (value, count) pair.
        top_value, _top_count = tally.most_common(1)[0]
        return top_value
    return None

def process_file(dimension, data_dir=".", verbose=False):
    """Compute one majority-vote score per item of a judge-response file.

    Loads the JSON file for ``dimension`` from ``data_dir``, iterates over its
    top-level items, runs ``extract_score`` on every entry of each item's
    ``"judge_responses"`` list, and reduces the parsed scores with
    ``majority_vote``.

    Args:
        dimension: Name inserted into the file-name template (the notebook
            calls this with "cosmos", "drop", "esnli" and "gsm8k").
        data_dir: Directory containing the file. Defaults to the current
            directory, which is where the path was previously hard-coded.
        verbose: When True, print every raw response and the score extracted
            from it. Off by default because these dumps flood the notebook
            output with full judge responses.

    Returns:
        A list with one entry per item: the majority score, or ``None`` when
        the item has no ``"judge_responses"`` key or no response yielded a
        parsable score.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    file_name = f"Reasoning_tmp1_{dimension}_overall_gpt-4o-mini-2024-07-18_0105.jsonl_DeepSeek-R1-Distill-Qwen-32B_oversampling.json"
    file_path = os.path.join(data_dir, file_name)

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = []
    for item in data:
        votes = []
        for resp in item.get("judge_responses", []):
            if verbose:
                print(f"Processing response: {resp}")
            score = extract_score(resp)
            # Responses without a parsable score do not get a vote.
            if score is not None:
                votes.append(score)
            if verbose:
                print(f"Extracted score: {score}")
        results.append(majority_vote(votes))

    return results


In [None]:
# Per-item majority-vote scores parsed from the "cosmos" judge-response file.
cosmos_results = process_file("cosmos")

In [None]:
# Per-item majority-vote scores parsed from the "drop" judge-response file.
drop_results   = process_file("drop")

In [None]:
# Per-item majority-vote scores parsed from the "esnli" judge-response file.
esnli_results  = process_file("esnli")

In [None]:
# Per-item majority-vote scores parsed from the "gsm8k" judge-response file.
gsm8k_results  = process_file("gsm8k")

In [6]:
# Display the full list of majority-vote scores computed for gsm8k.
gsm8k_results



[2,
 5,
 5,
 1,
 2,
 3,
 5,
 5,
 2,
 5,
 5,
 2,
 5,
 5,
 2,
 3,
 2,
 2,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 3,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 5,
 5,
 2,
 5,
 3,
 5,
 5,
 2,
 3,
 5,
 2,
 5,
 2,
 2,
 5,
 3,
 5,
 5,
 5,
 5,
 1,
 2,
 2,
 5,
 3,
 2,
 5,
 2,
 2,
 5,
 5,
 2,
 2,
 5,
 5,
 5,
 5,
 2,
 2,
 5,
 2,
 5,
 5,
 2,
 5,
 2,
 5,
 2,
 5,
 5,
 5,
 2]

In [7]:
import numpy as np
import scipy.stats as stats

def calculate_performance(y_pred, y_test):
    """Compute error and correlation metrics between predictions and labels.

    Both inputs are first converted to float NumPy arrays, so plain Python
    lists are accepted and ``None`` entries (e.g. items for which no score
    could be parsed) become NaN. Any position where either input is NaN is
    dropped from both before the metrics are computed.

    Args:
        y_pred: Array-like of predicted scores; may contain ``None``/NaN.
        y_test: Array-like of reference scores; may contain ``None``/NaN.

    Returns:
        A tuple ``(mse, rmse, mae, rho, tau, pcc)``: mean squared error, its
        square root, mean absolute error, and the Spearman, Kendall and
        Pearson correlation coefficients.
    """
    # Coerce up front: np.isnan fails on object arrays holding None, and a
    # Python list cannot be indexed with a boolean mask.
    y_pred = np.asarray(y_pred, dtype=float)
    y_test = np.asarray(y_test, dtype=float)

    valid = ~(np.isnan(y_pred) | np.isnan(y_test))
    if not valid.all():
        y_pred = y_pred[valid]
        y_test = y_test[valid]

    errors = y_pred - y_test
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(errors))

    # Each scipy call returns (statistic, p-value); only the statistic is kept.
    rho = stats.spearmanr(y_pred, y_test)[0]
    tau = stats.kendalltau(y_pred, y_test)[0]
    pcc = stats.pearsonr(y_pred, y_test)[0]
    return mse, rmse, mae, rho, tau, pcc



In [8]:
# Score the gsm8k majority votes against the "y_test" column of the label CSV.
label = pd.read_csv("R2CCP_SocREval_gsm8k_30.csv")
y_test = label.loc[:, "y_test"].to_numpy()
calculate_performance(gsm8k_results, y_test)

(0.91,
 0.9539392014169457,
 0.45,
 0.8175391526792112,
 0.7485471436762252,
 0.8375390263960066)

In [9]:
# Score the cosmos majority votes against the "y_test" column of the label CSV.
label = pd.read_csv("R2CCP_SocREval_cosmos_1.csv")
y_test = label.loc[:, "y_test"].to_numpy()
calculate_performance(cosmos_results, y_test)

(2.061224489795918,
 1.4356965173029843,
 1.1428571428571428,
 0.45382868713319746,
 0.3897353856568439,
 0.4553244579385748)

In [10]:
# Score the drop majority votes against the "y_test" column of the label CSV.
label = pd.read_csv("R2CCP_SocREval_drop_18.csv")
y_test = label.loc[:, "y_test"].to_numpy()
calculate_performance(drop_results, y_test)

(1.161904761904762,
 1.0779168622415933,
 0.7238095238095238,
 0.6365563560776474,
 0.5833159927658976,
 0.7153296923113484)

In [11]:
# Score the esnli majority votes against the "y_test" column of the label CSV.
label = pd.read_csv("R2CCP_SocREval_esnli_9.csv")
y_test = label.loc[:, "y_test"].to_numpy()
calculate_performance(esnli_results, y_test)

(0.7631578947368421,
 0.8735890880367281,
 0.631578947368421,
 0.5484920190595154,
 0.5033632903358916,
 0.5589556903475104)