In [24]:
import pandas as pd

def evaluate_human_ranking(df: pd.DataFrame):
    expected_columns = ["dialogue", "validation", "faithfulness", "relevance", "icf_alignment"]
    for i, name in enumerate(expected_columns):
        if i < len(df.columns):
            df.columns.values[i] = name

    model_rows = df[df["dialogue"].str.contains(r"(?i)^model\d+", na=False)].copy()
    model_rows["model"] = model_rows["dialogue"].str.extract(r"(?i)(model\d+)")[0].str.lower()

    if "validation" in model_rows.columns:
        model_rows["validation"] = model_rows["validation"].astype(str).str.lower()

    # 转为数值
    for col in ["relevance", "icf_alignment"]:
        if col in model_rows.columns:
            model_rows[col] = pd.to_numeric(model_rows[col], errors="coerce")

    # 更宽松的 invalid 统计：只要包含 "invalid" 即计入
    model_rows["relevance_invalid"] = model_rows["validation"].str.contains("invalid", na=False)
    model_rows["icf_invalid"] = model_rows["validation"].str.contains("invalid", na=False)

    def rank_distribution(series):
        return (
            series.value_counts(dropna=True)
                  .reindex([1, 2, 3, 4, 5], fill_value=0)
        )

    # 排名分布
    relevance_rank_dist = (
        model_rows.groupby("model")["relevance"]
        .apply(rank_distribution)
        .unstack()
        .add_prefix("rel_rank_")
    )
    icf_rank_dist = (
        model_rows.groupby("model")["icf_alignment"]
        .apply(rank_distribution)
        .unstack()
        .add_prefix("icf_rank_")
    )

    # 汇总统计（含 <2.5 的数量）
    summary = (
        model_rows.groupby("model").agg(
            valid_relevance_count=("relevance", lambda x: x.notna().sum()),
            invalid_relevance_count=("relevance_invalid", "sum"),
            avg_relevance_rank=("relevance", "mean"),
            rel_below_2_5_count=("relevance", lambda x: (x < 2.5).sum()),

            valid_icf_count=("icf_alignment", lambda x: x.notna().sum()),
            invalid_icf_count=("icf_invalid", "sum"),
            avg_icf_rank=("icf_alignment", "mean"),
            icf_below_2_5_count=("icf_alignment", lambda x: (x < 2.5).sum()),
        )
        .round(2)
    )

    combined = pd.concat([summary, relevance_rank_dist, icf_rank_dist], axis=1)
    combined["top1_rel_count"] = combined["rel_rank_1"]
    combined["top1_icf_count"] = combined["icf_rank_1"]

    # ✅ 计算 valid ratio（百分比格式，保留两位小数）
    combined["rel_valid_ratio"] = (
        100 * combined["valid_relevance_count"] /
        (combined["valid_relevance_count"] + combined["invalid_relevance_count"])
    ).round(2).astype(str) + "%"

    combined["icf_valid_ratio"] = (
        100 * combined["valid_icf_count"] /
        (combined["valid_icf_count"] + combined["invalid_icf_count"])
    ).round(2).astype(str) + "%"

    # ✅ 计算 <2.5 的比例（百分比格式，保留两位小数；分母为有效打分数量）
    rel_den = combined["valid_relevance_count"].replace(0, pd.NA)
    icf_den = combined["valid_icf_count"].replace(0, pd.NA)

    combined["rel_below_2_5_ratio"] = (
        (100 * combined["rel_below_2_5_count"] / rel_den).round(2).astype(str) + "%"
    )
    combined["icf_below_2_5_ratio"] = (
        (100 * combined["icf_below_2_5_count"] / icf_den).round(2).astype(str) + "%"
    )

    return combined.reset_index()

# ===== 主程序 =====
input_tsv = "Edwin_combined.tsv"  # TODO: 换成你的文件名

df = pd.read_csv(input_tsv, sep="\t", dtype=str)
result_df = evaluate_human_ranking(df)

print(result_df)
result_df.to_csv("Edwin_combined_eval_summary_single_file.tsv", sep="\t", index=False)


    model  valid_relevance_count  invalid_relevance_count  avg_relevance_rank  \
0  model1                    122                       17                1.57   
1  model2                    125                       14                1.52   
2  model3                    118                       13                1.80   
3  model4                    118                       11                1.91   
4  model5                    122                        7                1.68   

   rel_below_2_5_count  valid_icf_count  invalid_icf_count  avg_icf_rank  \
0                  101              124                 17          1.79   
1                  106              125                 14          1.71   
2                   89              118                 13          2.06   
3                   83              118                 11          2.09   
4                   98              122                  7          1.91   

   icf_below_2_5_count  rel_rank_1  ...  icf_rank_2  icf

In [30]:
import pandas as pd

def evaluate_human_ranking(df: pd.DataFrame):
    expected_columns = ["dialogue", "validation", "faithfulness", "relevance", "icf_alignment"]
    for i, name in enumerate(expected_columns):
        if i < len(df.columns):
            df.columns.values[i] = name

    model_rows = df[df["dialogue"].str.contains(r"(?i)^model\d+", na=False)].copy()
    model_rows["model"] = model_rows["dialogue"].str.extract(r"(?i)(model\d+)")[0].str.lower()

    if "validation" in model_rows.columns:
        model_rows["validation"] = model_rows["validation"].astype(str).str.lower()

    # 转为数值
    for col in ["relevance", "icf_alignment"]:
        if col in model_rows.columns:
            model_rows[col] = pd.to_numeric(model_rows[col], errors="coerce")

    # 更宽松的 invalid 统计：只要包含 "invalid" 即计入
    model_rows["relevance_invalid"] = model_rows["validation"].str.contains("invalid", na=False)
    model_rows["icf_invalid"] = model_rows["validation"].str.contains("invalid", na=False)

    def rank_distribution(series):
        return (
            series.value_counts(dropna=True)
                  .reindex([1, 2, 3, 4, 5], fill_value=0)
        )

    # 排名分布
    relevance_rank_dist = (
        model_rows.groupby("model")["relevance"]
        .apply(rank_distribution)
        .unstack()
        .add_prefix("rel_rank_")
    )
    icf_rank_dist = (
        model_rows.groupby("model")["icf_alignment"]
        .apply(rank_distribution)
        .unstack()
        .add_prefix("icf_rank_")
    )

    # 汇总统计（含 <2.5 的数量）
    summary = (
        model_rows.groupby("model").agg(
            valid_relevance_count=("relevance", lambda x: x.notna().sum()),
            invalid_relevance_count=("relevance_invalid", "sum"),
            avg_relevance_rank=("relevance", "mean"),
            rel_below_2_5_count=("relevance", lambda x: (x < 2.5).sum()),

            valid_icf_count=("icf_alignment", lambda x: x.notna().sum()),
            invalid_icf_count=("icf_invalid", "sum"),
            avg_icf_rank=("icf_alignment", "mean"),
            icf_below_2_5_count=("icf_alignment", lambda x: (x < 2.5).sum()),
        )
        .round(2)
    )

    combined = pd.concat([summary, relevance_rank_dist, icf_rank_dist], axis=1)
    combined["top1_rel_count"] = combined["rel_rank_1"]
    combined["top1_icf_count"] = combined["icf_rank_1"]

    # ✅ 计算 valid ratio（百分比格式，保留两位小数）
    combined["rel_valid_ratio"] = (
        100 * combined["valid_relevance_count"] /
        (combined["valid_relevance_count"] + combined["invalid_relevance_count"])
    ).round(2).astype(str) + "%"

    combined["icf_valid_ratio"] = (
        100 * combined["valid_icf_count"] /
        (combined["valid_icf_count"] + combined["invalid_icf_count"])
    ).round(2).astype(str) + "%"

    # ✅ 计算 <2.5 的比例（百分比格式，保留两位小数；分母为有效打分数量）
    rel_den = combined["valid_relevance_count"].replace(0, pd.NA)
    icf_den = combined["valid_icf_count"].replace(0, pd.NA)

    combined["rel_below_2_5_ratio"] = (
        (100 * combined["rel_below_2_5_count"] / rel_den).round(2).astype(str) + "%"
    )
    combined["icf_below_2_5_ratio"] = (
        (100 * combined["icf_below_2_5_count"] / icf_den).round(2).astype(str) + "%"
    )

    return combined.reset_index()

# ===== 主程序 =====
input_tsv = "Sabina_combined.tsv"  # TODO: 换成你的文件名

df = pd.read_csv(input_tsv, sep="\t", dtype=str)
result_df = evaluate_human_ranking(df)

print(result_df)
result_df.to_csv("Sabina_eval_summary_single_file.tsv", sep="\t", index=False)


    model  valid_relevance_count  invalid_relevance_count  avg_relevance_rank  \
0  model1                     87                        3                1.39   
1  model2                     83                        8                1.41   
2  model3                     77                        6                1.43   
3  model4                     79                        5                1.37   
4  model5                     81                       12                1.54   

   rel_below_2_5_count  valid_icf_count  invalid_icf_count  avg_icf_rank  \
0                   77               88                  3          2.27   
1                   72               83                  8          2.49   
2                   68               77                  6          3.31   
3                   69               78                  5          3.06   
4                   67               81                 12          3.33   

   icf_below_2_5_count  rel_rank_1  ...  icf_rank_2  icf

In [26]:
import pandas as pd

def evaluate_human_ranking(df: pd.DataFrame):
    expected_columns = ["dialogue", "validation", "faithfulness", "relevance", "icf_alignment"]
    for i, name in enumerate(expected_columns):
        if i < len(df.columns):
            df.columns.values[i] = name

    model_rows = df[df["dialogue"].str.contains(r"(?i)^model\d+", na=False)].copy()
    model_rows["model"] = model_rows["dialogue"].str.extract(r"(?i)(model\d+)")[0].str.lower()

    if "validation" in model_rows.columns:
        model_rows["validation"] = model_rows["validation"].astype(str).str.lower()

    # 转为数值
    for col in ["relevance", "icf_alignment"]:
        if col in model_rows.columns:
            model_rows[col] = pd.to_numeric(model_rows[col], errors="coerce")

    # 更宽松的 invalid 统计：只要包含 "invalid" 即计入
    model_rows["relevance_invalid"] = model_rows["validation"].str.contains("invalid", na=False)
    model_rows["icf_invalid"] = model_rows["validation"].str.contains("invalid", na=False)

    def rank_distribution(series):
        return (
            series.value_counts(dropna=True)
                  .reindex([1, 2, 3, 4, 5], fill_value=0)
        )

    # 排名分布
    relevance_rank_dist = (
        model_rows.groupby("model")["relevance"]
        .apply(rank_distribution)
        .unstack()
        .add_prefix("rel_rank_")
    )
    icf_rank_dist = (
        model_rows.groupby("model")["icf_alignment"]
        .apply(rank_distribution)
        .unstack()
        .add_prefix("icf_rank_")
    )

    # 汇总统计（含 <2.5 的数量）
    summary = (
        model_rows.groupby("model").agg(
            valid_relevance_count=("relevance", lambda x: x.notna().sum()),
            invalid_relevance_count=("relevance_invalid", "sum"),
            avg_relevance_rank=("relevance", "mean"),
            rel_below_2_5_count=("relevance", lambda x: (x < 2.5).sum()),

            valid_icf_count=("icf_alignment", lambda x: x.notna().sum()),
            invalid_icf_count=("icf_invalid", "sum"),
            avg_icf_rank=("icf_alignment", "mean"),
            icf_below_2_5_count=("icf_alignment", lambda x: (x < 2.5).sum()),
        )
        .round(2)
    )

    combined = pd.concat([summary, relevance_rank_dist, icf_rank_dist], axis=1)
    combined["top1_rel_count"] = combined["rel_rank_1"]
    combined["top1_icf_count"] = combined["icf_rank_1"]

    # ✅ 计算 valid ratio（百分比格式，保留两位小数）
    combined["rel_valid_ratio"] = (
        100 * combined["valid_relevance_count"] /
        (combined["valid_relevance_count"] + combined["invalid_relevance_count"])
    ).round(2).astype(str) + "%"

    combined["icf_valid_ratio"] = (
        100 * combined["valid_icf_count"] /
        (combined["valid_icf_count"] + combined["invalid_icf_count"])
    ).round(2).astype(str) + "%"

    # ✅ 计算 <2.5 的比例（百分比格式，保留两位小数；分母为有效打分数量）
    rel_den = combined["valid_relevance_count"].replace(0, pd.NA)
    icf_den = combined["valid_icf_count"].replace(0, pd.NA)

    combined["rel_below_2_5_ratio"] = (
        (100 * combined["rel_below_2_5_count"] / rel_den).round(2).astype(str) + "%"
    )
    combined["icf_below_2_5_ratio"] = (
        (100 * combined["icf_below_2_5_count"] / icf_den).round(2).astype(str) + "%"
    )

    return combined.reset_index()

# ===== 主程序 =====
input_tsv = "Marike_combined.tsv"  # TODO: 换成你的文件名

df = pd.read_csv(input_tsv, sep="\t", dtype=str)
result_df = evaluate_human_ranking(df)

print(result_df)
result_df.to_csv("Marike_combined_eval_summary_single_file.tsv", sep="\t", index=False)


    model  valid_relevance_count  invalid_relevance_count  avg_relevance_rank  \
0  model1                     66                        0                1.36   
1  model2                     65                        1                1.42   
2  model3                     64                        4                2.28   
3  model4                     66                        1                2.64   
4  model5                     65                        4                2.98   

   rel_below_2_5_count  valid_icf_count  invalid_icf_count  avg_icf_rank  \
0                   59               66                  0          1.70   
1                   60               65                  1          1.68   
2                   39               64                  4          2.64   
3                   28               66                  1          2.91   
4                   24               65                  4          3.37   

   icf_below_2_5_count  rel_rank_1  ...  icf_rank_2  icf

In [28]:
import pandas as pd

def evaluate_human_ranking(df: pd.DataFrame):
    expected_columns = ["dialogue", "validation", "faithfulness", "relevance", "icf_alignment"]
    for i, name in enumerate(expected_columns):
        if i < len(df.columns):
            df.columns.values[i] = name

    model_rows = df[df["dialogue"].str.contains(r"(?i)^model\d+", na=False)].copy()
    model_rows["model"] = model_rows["dialogue"].str.extract(r"(?i)(model\d+)")[0].str.lower()

    if "validation" in model_rows.columns:
        model_rows["validation"] = model_rows["validation"].astype(str).str.lower()

    # 转为数值
    for col in ["relevance", "icf_alignment"]:
        if col in model_rows.columns:
            model_rows[col] = pd.to_numeric(model_rows[col], errors="coerce")

    # 更宽松的 invalid 统计：只要包含 "invalid" 即计入
    model_rows["relevance_invalid"] = model_rows["validation"].str.contains("invalid", na=False)
    model_rows["icf_invalid"] = model_rows["validation"].str.contains("invalid", na=False)

    def rank_distribution(series):
        return (
            series.value_counts(dropna=True)
                  .reindex([1, 2, 3, 4, 5], fill_value=0)
        )

    # 排名分布
    relevance_rank_dist = (
        model_rows.groupby("model")["relevance"]
        .apply(rank_distribution)
        .unstack()
        .add_prefix("rel_rank_")
    )
    icf_rank_dist = (
        model_rows.groupby("model")["icf_alignment"]
        .apply(rank_distribution)
        .unstack()
        .add_prefix("icf_rank_")
    )

    # 汇总统计（含 <2.5 的数量）
    summary = (
        model_rows.groupby("model").agg(
            valid_relevance_count=("relevance", lambda x: x.notna().sum()),
            invalid_relevance_count=("relevance_invalid", "sum"),
            avg_relevance_rank=("relevance", "mean"),
            rel_below_2_5_count=("relevance", lambda x: (x < 2.5).sum()),

            valid_icf_count=("icf_alignment", lambda x: x.notna().sum()),
            invalid_icf_count=("icf_invalid", "sum"),
            avg_icf_rank=("icf_alignment", "mean"),
            icf_below_2_5_count=("icf_alignment", lambda x: (x < 2.5).sum()),
        )
        .round(2)
    )

    combined = pd.concat([summary, relevance_rank_dist, icf_rank_dist], axis=1)
    combined["top1_rel_count"] = combined["rel_rank_1"]
    combined["top1_icf_count"] = combined["icf_rank_1"]

    # ✅ 计算 valid ratio（百分比格式，保留两位小数）
    combined["rel_valid_ratio"] = (
        100 * combined["valid_relevance_count"] /
        (combined["valid_relevance_count"] + combined["invalid_relevance_count"])
    ).round(2).astype(str) + "%"

    combined["icf_valid_ratio"] = (
        100 * combined["valid_icf_count"] /
        (combined["valid_icf_count"] + combined["invalid_icf_count"])
    ).round(2).astype(str) + "%"

    # ✅ 计算 <2.5 的比例（百分比格式，保留两位小数；分母为有效打分数量）
    rel_den = combined["valid_relevance_count"].replace(0, pd.NA)
    icf_den = combined["valid_icf_count"].replace(0, pd.NA)

    combined["rel_below_2_5_ratio"] = (
        (100 * combined["rel_below_2_5_count"] / rel_den).round(2).astype(str) + "%"
    )
    combined["icf_below_2_5_ratio"] = (
        (100 * combined["icf_below_2_5_count"] / icf_den).round(2).astype(str) + "%"
    )

    return combined.reset_index()

# ===== 主程序 =====
input_tsv = "D470_part1_Jesse.tsv"  # TODO: 换成你的文件名

df = pd.read_csv(input_tsv, sep="\t", dtype=str)
result_df = evaluate_human_ranking(df)

print(result_df)
result_df.to_csv("Jesse_combined_eval_summary_single_file.tsv", sep="\t", index=False)


    model  valid_relevance_count  invalid_relevance_count  avg_relevance_rank  \
0  model1                     40                        1                1.32   
1  model2                     41                        0                1.34   
2  model3                     34                        5                1.50   
3  model4                     33                        6                1.64   
4  model5                     34                        8                1.82   

   rel_below_2_5_count  valid_icf_count  invalid_icf_count  avg_icf_rank  \
0                   37               40                  1          2.30   
1                   37               41                  0          2.66   
2                   28               34                  5          3.65   
3                   26               33                  6          3.94   
4                   23               34                  8          3.71   

   icf_below_2_5_count  rel_rank_1  ...  icf_rank_2  icf