In [33]:
# スコアを降順でまとめて表示
import os
import pandas as pd
import json

def load_jsonl_files_from_directory(directory_path):
    """Load every ``.jsonl`` file in a directory into one combined DataFrame.

    Each non-blank line of each file is parsed as one JSON document and
    becomes one row. Files are read in sorted filename order so the row order
    of the result is reproducible across runs and platforms.

    Args:
        directory_path: Path of the directory to scan (not recursive).

    Returns:
        A single ``pandas.DataFrame`` with the rows of all files concatenated
        and a fresh 0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.jsonl`` files.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    jsonl_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.jsonl'))
    if not jsonl_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No .jsonl files found in directory: {directory_path!r}")

    dataframes = []
    for jsonl_file in jsonl_files:
        file_path = os.path.join(directory_path, jsonl_file)
        with open(file_path, 'r', encoding='utf-8') as file:
            # Stream the file and skip blank lines (e.g. a trailing newline
            # at the end of the file), which are not valid JSON documents.
            records = [json.loads(line) for line in file if line.strip()]
        dataframes.append(pd.DataFrame(records))

    # Combine all per-file DataFrames into a single DataFrame
    return pd.concat(dataframes, ignore_index=True)

def calculate_and_display_scores(combined_df):
    """Build a per-model table of mean scores, best model first.

    Rows of ``combined_df`` are grouped by ``'target_model_name'`` and each
    score category is averaged within the group. An ``'Overall Average'``
    column holds the mean of those per-category averages, and the table is
    ordered by that column from highest to lowest.

    Args:
        combined_df: DataFrame containing ``'target_model_name'`` and the
            eight score-category columns listed below.

    Returns:
        DataFrame indexed by model name with one column per score category
        plus ``'Overall Average'``.
    """
    categories = [
        'Roleplay Adherence', 'Consistency', 'Contextual Understanding',
        'Expressiveness', 'Creativity', 'Naturalness of Japanese',
        'Enjoyment of the Dialogue', 'Appropriateness of Turn-Taking'
    ]

    per_model_means = combined_df.groupby('target_model_name')[categories].mean()

    # The overall figure is the row-wise mean of the per-category means.
    with_overall = per_model_means.assign(
        **{'Overall Average': per_model_means.mean(axis=1)}
    )

    return with_overall.sort_values('Overall Average', ascending=False)

# Directory (relative to the notebook's working directory) that holds the
# evaluation result files; every *.jsonl file inside it is loaded.
directory_path = "./evaluations"

# Read all JSONL files from the directory into one combined DataFrame
combined_df = load_jsonl_files_from_directory(directory_path)

# Compute per-model mean scores, sorted by overall average (descending).
# The function only returns the table; the bare `scores` expression on the
# last line is what makes the notebook render it as the cell output.
scores = calculate_and_display_scores(combined_df)
scores


Unnamed: 0_level_0,Roleplay Adherence,Consistency,Contextual Understanding,Expressiveness,Creativity,Naturalness of Japanese,Enjoyment of the Dialogue,Appropriateness of Turn-Taking,Overall Average
target_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
claude-3-opus-20240229,4.6,4.791667,4.625,4.091667,3.833333,4.8,4.083333,4.4,4.403125
claude-3-5-sonnet-20240620,4.591667,4.708333,4.616667,4.025,3.966667,4.741667,4.116667,4.408333,4.396875
gpt-4o-mini-2024-07-18,4.691667,4.708333,4.575,3.883333,3.641667,4.716667,3.85,4.525,4.323958
gemini-1.5-pro-002,4.633333,4.683333,4.466667,3.858333,3.658333,4.658333,3.816667,4.366667,4.267708
cyberagent/Mistral-Nemo-Japanese-Instruct-2408,4.508333,4.641667,4.533333,3.85,3.658333,4.675,3.891667,4.366667,4.265625
gpt-4o-2024-08-06,4.616667,4.641667,4.5,3.75,3.541667,4.708333,3.75,4.425,4.241667
command-r-plus-08-2024,4.616667,4.633333,4.425,3.708333,3.55,4.65,3.733333,4.408333,4.215625
Qwen/Qwen2.5-72B-Instruct,4.658333,4.65,4.458333,3.725,3.533333,4.608333,3.691667,4.325,4.20625
gemini-1.5-pro,4.475,4.6,4.425,3.775,3.558333,4.65,3.725,4.416667,4.203125
o1-preview-2024-09-12,4.625,4.65,4.383333,3.641667,3.416667,4.6,3.616667,4.5,4.179167


In [None]:
!pip install scipy

In [31]:
# 人手評価とのスピアマン順位相関係数を計算して表示
# 4つのJudge Modelのスコアについて、単体のスコアとの相関・2モデル平均との相関・3モデル平均との相関・4モデル全体平均との相関を全て表示
import pandas as pd
import json
from scipy.stats import spearmanr
from itertools import combinations

def process_file(file_path):
    # Load the JSONL file
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))

    # Convert the loaded data into a DataFrame
    df = pd.json_normalize(data)

    # Extract relevant columns for human and LLM evaluations
    human_scores_cols = [col for col in df.columns if col.startswith('human_scores.')]
    llm_scores_cols = [
        "Roleplay Adherence", "Consistency", "Contextual Understanding",
        "Expressiveness", "Creativity", "Naturalness of Japanese",
        "Enjoyment of the Dialogue", "Appropriateness of Turn-Taking"
    ]

    # Calculate overall Spearman correlation for each metric
    overall_correlations = {}
    for llm_col, human_col in zip(llm_scores_cols, human_scores_cols):
        correlation, _ = spearmanr(df[llm_col], df[human_col])
        overall_correlations[llm_col] = correlation

    # Extract individual evaluations and calculate correlations for each judge model
    individual_evaluations = df['individual_evaluations'].explode().dropna()
    individual_evaluations_df = pd.json_normalize(individual_evaluations)

    judge_correlations = {}
    judge_model_names = individual_evaluations_df['judge_model_name'].unique()

    for judge_model_name in judge_model_names:
        judge_df = individual_evaluations_df[individual_evaluations_df['judge_model_name'] == judge_model_name]
        correlations = {}
        for llm_col, human_col in zip(llm_scores_cols, human_scores_cols):
            if llm_col in judge_df.columns and human_col in df.columns:
                correlation, _ = spearmanr(judge_df[llm_col], df[human_col])
                correlations[llm_col] = correlation
        judge_correlations[judge_model_name] = correlations

    # Combine overall correlations and individual judge correlations into a single DataFrame
    correlation_results = pd.DataFrame({"Overall": overall_correlations})
    for judge_model_name, correlations in judge_correlations.items():
        correlation_results[judge_model_name] = pd.Series(correlations)

    # Calculate the average score correlations
    df['human_average_score'] = df[human_scores_cols].mean(axis=1)
    df['llm_average_score'] = df[llm_scores_cols].mean(axis=1)

    # Calculate the overall average correlation
    overall_average_correlation, _ = spearmanr(df['llm_average_score'], df['human_average_score'])

    # Average score correlations for individual judge models
    individual_evaluations_df['llm_average_score'] = individual_evaluations_df[llm_scores_cols].mean(axis=1)
    average_judge_correlations = {}
    for judge_model_name in judge_model_names:
        judge_df = individual_evaluations_df[individual_evaluations_df['judge_model_name'] == judge_model_name]
        correlation, _ = spearmanr(judge_df['llm_average_score'], df['human_average_score'])
        average_judge_correlations[judge_model_name] = correlation

    # Add the average correlations to the combined table
    correlation_results.loc['Average Score'] = pd.Series(average_judge_correlations)
    correlation_results['Overall'] = correlation_results['Overall'].fillna(overall_average_correlation)

    # Define unique judge models
    unique_judge_models = [
        "gpt-4o-2024-08-06", "o1-mini-2024-09-12",
        "anthropic.claude-3-5-sonnet-20240620-v1:0", "gemini-1.5-pro-002"
    ]

    # Calculate Spearman correlations for all 2-model and 3-model combinations
    combination_correlations = {}
    for k in [2, 3]:
        for comb in combinations(unique_judge_models, k):
            comb_name = "_".join(comb)
            correlations = {}
            for score_name in llm_scores_cols:
                avg_col_name = f"averages_{k}_models.avg_{'_'.join(comb)}.{score_name}"
                if avg_col_name in df.columns:
                    correlation, _ = spearmanr(df[avg_col_name], df[f"human_scores.{score_name}"])
                    correlations[score_name] = correlation

            # Calculate the overall average score correlation for the combination
            df[f"{comb_name}_average_score"] = df[[f"averages_{k}_models.avg_{'_'.join(comb)}.{metric}"
                                                   for metric in llm_scores_cols]].mean(axis=1)
            correlation, _ = spearmanr(df[f"{comb_name}_average_score"], df['human_average_score'])
            correlations['Average Score'] = correlation

            combination_correlations[comb_name] = correlations

    # Convert combination correlations to a DataFrame
    combination_correlation_results = pd.DataFrame(combination_correlations)

    # Combine the original and combination results into a single comprehensive table
    final_combined_results = pd.concat([correlation_results, combination_correlation_results], axis=1)

    # Display the final comprehensive correlation results
    return final_combined_results

# Example usage: build the correlation table for the annotated sample file.
# The call is the last expression of the cell, so the returned DataFrame is
# rendered as the cell output.
process_file('./test/modified_annotated_sample.jsonl')


Unnamed: 0,Overall,gpt-4o-2024-08-06,o1-mini-2024-09-12,anthropic.claude-3-5-sonnet-20240620-v1:0,gemini-1.5-pro-002,gpt-4o-2024-08-06_o1-mini-2024-09-12,gpt-4o-2024-08-06_anthropic.claude-3-5-sonnet-20240620-v1:0,gpt-4o-2024-08-06_gemini-1.5-pro-002,o1-mini-2024-09-12_anthropic.claude-3-5-sonnet-20240620-v1:0,o1-mini-2024-09-12_gemini-1.5-pro-002,anthropic.claude-3-5-sonnet-20240620-v1:0_gemini-1.5-pro-002,gpt-4o-2024-08-06_o1-mini-2024-09-12_anthropic.claude-3-5-sonnet-20240620-v1:0,gpt-4o-2024-08-06_o1-mini-2024-09-12_gemini-1.5-pro-002,gpt-4o-2024-08-06_anthropic.claude-3-5-sonnet-20240620-v1:0_gemini-1.5-pro-002,o1-mini-2024-09-12_anthropic.claude-3-5-sonnet-20240620-v1:0_gemini-1.5-pro-002
Roleplay Adherence,0.632473,0.473109,0.460139,0.2904,0.539547,0.604012,0.391803,0.619375,0.479238,0.629825,0.464058,0.579314,0.684168,0.522206,0.578193
Consistency,0.52039,0.57613,0.500888,0.194717,0.446015,0.640674,0.411669,0.566035,0.406102,0.491425,0.286646,0.553909,0.613499,0.434632,0.389918
Contextual Understanding,0.526451,0.416179,0.524996,0.309361,0.483984,0.555817,0.392728,0.497801,0.458816,0.56317,0.403141,0.50652,0.585895,0.450092,0.495829
Expressiveness,0.560001,0.391092,0.477326,0.420209,0.470158,0.519313,0.494232,0.502739,0.499657,0.516655,0.474328,0.549841,0.555042,0.520735,0.521352
Creativity,0.429796,0.347042,0.294125,0.396245,0.462311,0.374478,0.420484,0.427139,0.38644,0.383614,0.422334,0.406254,0.408472,0.441738,0.400932
Naturalness of Japanese,0.554593,0.48394,0.565689,0.386479,0.548344,0.559631,0.494055,0.544961,0.510059,0.563773,0.515218,0.540617,0.566311,0.544976,0.543812
Enjoyment of the Dialogue,0.504074,0.199664,0.437892,0.48092,0.442609,0.398852,0.436683,0.375542,0.525387,0.474757,0.497515,0.506834,0.442266,0.462983,0.52382
Appropriateness of Turn-Taking,0.617291,0.530897,0.288324,0.487605,0.360864,0.484505,0.603742,0.5729,0.450075,0.384404,0.491124,0.576536,0.564496,0.635467,0.482371
Average Score,0.601041,0.426126,0.462953,0.427334,0.55354,0.547215,0.507034,0.563607,0.502963,0.560185,0.517369,0.566939,0.598594,0.553735,0.549003
