In [74]:
# Summarize the scores per model and display them in descending order
import json
import os

import pandas as pd


def load_jsonl_files_from_directory(directory_path):
    """Load every ``*.jsonl`` file in a directory into one DataFrame.

    Each non-blank line of each file is parsed as one JSON document and
    becomes one row. Files are read in sorted file-name order so the row
    order of the result is reproducible across runs and platforms.

    Args:
        directory_path: Path of the directory to scan (not recursive).

    Returns:
        A ``pandas.DataFrame`` holding the rows of all files, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.jsonl`` file.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    jsonl_files = sorted(
        name for name in os.listdir(directory_path) if name.endswith(".jsonl")
    )
    if not jsonl_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No .jsonl files found in directory: {directory_path!r}")

    dataframes = []
    for jsonl_file in jsonl_files:
        file_path = os.path.join(directory_path, jsonl_file)
        with open(file_path, "r", encoding="utf-8") as file:
            # Skip blank lines (e.g. a trailing empty line), which are not
            # valid JSON documents and would make json.loads raise.
            records = [json.loads(line) for line in file if line.strip()]
        dataframes.append(pd.DataFrame(records))

    # Combine all per-file DataFrames into a single DataFrame
    return pd.concat(dataframes, ignore_index=True)


def calculate_and_display_scores(combined_df):
    """Build a per-model table of mean scores, best model first.

    Rows are grouped by ``target_model_name``; for every score category the
    mean is taken, and an "Overall Average" (the mean of the category means)
    is added as the first column. All values are rounded to 3 decimals.

    Args:
        combined_df: DataFrame with a ``target_model_name`` column and one
            column per score category listed below.

    Returns:
        A DataFrame indexed by ``target_model_name`` with the columns
        "Overall Average" followed by the score categories, sorted by
        "Overall Average" in descending order.
    """
    # Relevant columns for calculating scores
    score_columns = [
        "Roleplay Adherence",
        "Consistency",
        "Contextual Understanding",
        "Expressiveness",
        "Creativity",
        "Naturalness of Japanese",
        "Enjoyment of the Dialogue",
        "Appropriateness of Turn-Taking",
    ]

    # Unrounded per-category means for each model
    category_means = combined_df.groupby("target_model_name")[score_columns].mean()

    # Derive the overall average from the *unrounded* means and round only
    # once; averaging already-rounded values can shift the result by 0.001.
    overall_average = category_means.mean(axis=1).round(3)

    avg_scores_by_model = category_means.round(3)
    avg_scores_by_model["Overall Average"] = overall_average

    # Put the overall average first, followed by the individual categories
    avg_scores_by_model = avg_scores_by_model[["Overall Average"] + score_columns]

    # Sort the table by 'Overall Average' in descending order
    return avg_scores_by_model.sort_values(by="Overall Average", ascending=False)


# Directory containing the evaluation result files (*.jsonl)
directory_path = "./evaluations"

# Load every JSONL file found in that directory into a single DataFrame
combined_df = load_jsonl_files_from_directory(directory_path)

# Build the per-model score table, sorted by overall average (highest first)
scores = calculate_and_display_scores(combined_df)
# Bare expression as the last statement of the cell so the notebook renders the table
scores


Unnamed: 0_level_0,Overall Average,Roleplay Adherence,Consistency,Contextual Understanding,Expressiveness,Creativity,Naturalness of Japanese,Enjoyment of the Dialogue,Appropriateness of Turn-Taking
target_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
claude-3-opus-20240229,4.403,4.6,4.792,4.625,4.092,3.833,4.8,4.083,4.4
claude-3-5-sonnet-20240620,4.397,4.592,4.708,4.617,4.025,3.967,4.742,4.117,4.408
gpt-4o-mini-2024-07-18,4.324,4.692,4.708,4.575,3.883,3.642,4.717,3.85,4.525
gemini-1.5-pro-002,4.268,4.633,4.683,4.467,3.858,3.658,4.658,3.817,4.367
cyberagent/Mistral-Nemo-Japanese-Instruct-2408,4.266,4.508,4.642,4.533,3.85,3.658,4.675,3.892,4.367
gpt-4o-2024-08-06,4.242,4.617,4.642,4.5,3.75,3.542,4.708,3.75,4.425
command-r-plus-08-2024,4.216,4.617,4.633,4.425,3.708,3.55,4.65,3.733,4.408
Qwen/Qwen2.5-72B-Instruct,4.206,4.658,4.65,4.458,3.725,3.533,4.608,3.692,4.325
gemini-1.5-pro,4.203,4.475,4.6,4.425,3.775,3.558,4.65,3.725,4.417
o1-preview-2024-09-12,4.179,4.625,4.65,4.383,3.642,3.417,4.6,3.617,4.5


In [None]:
!pip install scipy

In [80]:
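# Compute and display Spearman's rank correlation coefficients against the human evaluations
# For the scores of the four judge models, show all of: the correlation for each single model's scores, for each 2-model average, for each 3-model average, and for the average of all four models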
import json
from itertools import combinations

import pandas as pd
from scipy.stats import spearmanr


def process_file(file_path):
    # Load the JSONL file
    data = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            data.append(json.loads(line))

    # Convert the loaded data into a DataFrame
    df = pd.json_normalize(data)

    # Extract relevant columns for human and LLM evaluations
    human_scores_cols = [col for col in df.columns if col.startswith("human_scores.")]
    llm_scores_cols = [
        "Roleplay Adherence",
        "Consistency",
        "Contextual Understanding",
        "Expressiveness",
        "Creativity",
        "Naturalness of Japanese",
        "Enjoyment of the Dialogue",
        "Appropriateness of Turn-Taking",
    ]

    # Calculate overall Spearman correlation for each metric
    overall_correlations = {}
    for llm_col, human_col in zip(llm_scores_cols, human_scores_cols):
        correlation, _ = spearmanr(df[llm_col], df[human_col])
        overall_correlations[llm_col] = round(correlation, 3)

    # Extract individual evaluations and calculate correlations for each judge model
    individual_evaluations = df["individual_evaluations"].explode().dropna()
    individual_evaluations_df = pd.json_normalize(individual_evaluations)

    judge_correlations = {}
    judge_model_names = individual_evaluations_df["judge_model_name"].unique()

    for judge_model_name in judge_model_names:
        judge_df = individual_evaluations_df[
            individual_evaluations_df["judge_model_name"] == judge_model_name
        ]
        correlations = {}
        for llm_col, human_col in zip(llm_scores_cols, human_scores_cols):
            if llm_col in judge_df.columns and human_col in df.columns:
                correlation, _ = spearmanr(judge_df[llm_col], df[human_col])
                correlations[llm_col] = round(correlation, 3)
        judge_correlations[judge_model_name] = correlations

    # Combine overall correlations and individual judge correlations into a single DataFrame
    correlation_results = pd.DataFrame({"Overall": overall_correlations})
    for judge_model_name, correlations in judge_correlations.items():
        correlation_results[judge_model_name] = pd.Series(correlations)

    # Calculate the average score correlations
    df["human_average_score"] = df[human_scores_cols].mean(axis=1)
    df["llm_average_score"] = df[llm_scores_cols].mean(axis=1)

    # Calculate the overall average correlation
    overall_average_correlation, _ = spearmanr(
        df["llm_average_score"], df["human_average_score"]
    )
    overall_average_correlation = round(overall_average_correlation, 3)

    # Average score correlations for individual judge models
    individual_evaluations_df["llm_average_score"] = individual_evaluations_df[
        llm_scores_cols
    ].mean(axis=1)
    average_judge_correlations = {}
    for judge_model_name in judge_model_names:
        judge_df = individual_evaluations_df[
            individual_evaluations_df["judge_model_name"] == judge_model_name
        ]
        correlation, _ = spearmanr(
            judge_df["llm_average_score"], df["human_average_score"]
        )
        average_judge_correlations[judge_model_name] = round(correlation, 3)

    # Add the average correlations to the combined table
    correlation_results.loc["Average Score"] = pd.Series(average_judge_correlations)
    correlation_results["Overall"] = correlation_results["Overall"].fillna(
        overall_average_correlation
    )

    # Define unique judge models
    unique_judge_models = [
        "gpt-4o-2024-08-06",
        "o1-mini-2024-09-12",
        "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "gemini-1.5-pro-002",
    ]

    # Calculate Spearman correlations for all 2-model and 3-model combinations
    combination_correlations = {}
    for k in [2, 3]:
        for comb in combinations(unique_judge_models, k):
            comb_name = "_".join(comb)
            correlations = {}
            for score_name in llm_scores_cols:
                avg_col_name = f"averages_{k}_models.avg_{'_'.join(comb)}.{score_name}"
                if avg_col_name in df.columns:
                    correlation, _ = spearmanr(
                        df[avg_col_name], df[f"human_scores.{score_name}"]
                    )
                    correlations[score_name] = round(correlation, 3)

            # Calculate the overall average score correlation for the combination
            df[f"{comb_name}_average_score"] = df[
                [
                    f"averages_{k}_models.avg_{'_'.join(comb)}.{metric}"
                    for metric in llm_scores_cols
                ]
            ].mean(axis=1)
            correlation, _ = spearmanr(
                df[f"{comb_name}_average_score"], df["human_average_score"]
            )
            correlations["Average Score"] = round(correlation, 3)

            combination_correlations[comb_name] = correlations

    # Convert combination correlations to a DataFrame
    combination_correlation_results = pd.DataFrame(combination_correlations)

    # Combine the original and combination results into a single comprehensive table
    final_combined_results = pd.concat(
        [correlation_results, combination_correlation_results], axis=1
    )

    # Display the final comprehensive correlation results
    return final_combined_results


# Run the correlation analysis on the annotated sample file
final_combined_results = process_file("./annotated_sample/annotated_sample.jsonl")
# Bare expression as the last statement of the cell so the notebook renders the table
final_combined_results


Unnamed: 0,Overall,gpt-4o-2024-08-06,o1-mini-2024-09-12,anthropic.claude-3-5-sonnet-20240620-v1:0,gemini-1.5-pro-002,gpt-4o-2024-08-06_o1-mini-2024-09-12,gpt-4o-2024-08-06_anthropic.claude-3-5-sonnet-20240620-v1:0,gpt-4o-2024-08-06_gemini-1.5-pro-002,o1-mini-2024-09-12_anthropic.claude-3-5-sonnet-20240620-v1:0,o1-mini-2024-09-12_gemini-1.5-pro-002,anthropic.claude-3-5-sonnet-20240620-v1:0_gemini-1.5-pro-002,gpt-4o-2024-08-06_o1-mini-2024-09-12_anthropic.claude-3-5-sonnet-20240620-v1:0,gpt-4o-2024-08-06_o1-mini-2024-09-12_gemini-1.5-pro-002,gpt-4o-2024-08-06_anthropic.claude-3-5-sonnet-20240620-v1:0_gemini-1.5-pro-002,o1-mini-2024-09-12_anthropic.claude-3-5-sonnet-20240620-v1:0_gemini-1.5-pro-002
Roleplay Adherence,0.632,0.473,0.46,0.29,0.54,0.604,0.392,0.619,0.479,0.63,0.464,0.579,0.684,0.522,0.578
Consistency,0.52,0.576,0.501,0.195,0.446,0.641,0.412,0.566,0.406,0.491,0.287,0.554,0.613,0.435,0.39
Contextual Understanding,0.526,0.416,0.525,0.309,0.484,0.556,0.393,0.498,0.459,0.563,0.403,0.507,0.586,0.45,0.496
Expressiveness,0.56,0.391,0.477,0.42,0.47,0.519,0.494,0.503,0.5,0.517,0.474,0.55,0.555,0.521,0.521
Creativity,0.43,0.347,0.294,0.396,0.462,0.374,0.42,0.427,0.386,0.384,0.422,0.406,0.408,0.442,0.401
Naturalness of Japanese,0.555,0.484,0.566,0.386,0.548,0.56,0.494,0.545,0.51,0.564,0.515,0.541,0.566,0.545,0.544
Enjoyment of the Dialogue,0.504,0.2,0.438,0.481,0.443,0.399,0.437,0.376,0.525,0.475,0.498,0.507,0.442,0.463,0.524
Appropriateness of Turn-Taking,0.617,0.531,0.288,0.488,0.361,0.485,0.604,0.573,0.45,0.384,0.491,0.577,0.564,0.635,0.482
Average Score,0.601,0.426,0.463,0.427,0.554,0.547,0.507,0.564,0.503,0.56,0.517,0.567,0.599,0.554,0.549
