In [5]:
# Read the TSV, locate each target column by case-insensitive keyword matching
# on its header, then compute the total row count plus the empty / non-empty
# counts and percentages for each target column.

import pandas as pd
import numpy as np
import re
from pathlib import Path

# ===== 1) File path =====
# If this script sits in the same directory as the data file, the relative
# path below works as-is. When running in the ChatGPT sandbox, change it to:
# Path("/mnt/data/generated_sample - Test (LMC).tsv")
path = Path("generated_sample - Test (LMC).tsv")

# ===== 2) Read TSV =====
# Read every cell as a plain string. keep_default_na=False stops pandas from
# converting blank or "NA"-like cells into NaN, and no na_values list is passed
# (it would re-enable that conversion for the listed tokens). Emptiness is
# decided later by the script itself, after stripping whitespace.
df = pd.read_csv(
    path,
    sep="\t",
    dtype=str,
    keep_default_na=False,
)

# Number of data rows (header excluded); used as the denominator for all
# percentages below.
total_rows = len(df)

# ===== 3) Helper: fuzzy match column by keywords =====
def find_col(df_cols, keywords):
    """Return the first column whose name contains every keyword.

    Column names and keywords are compared after the same normalization:
    runs of whitespace collapsed to a single space, surrounding whitespace
    removed, and lower-cased. Matching is plain substring containment.

    Args:
        df_cols: Iterable of column labels (non-string labels are converted
            with ``str()`` for matching only).
        keywords: Iterable of strings that must all occur in the column name.

    Returns:
        The original (un-normalized) label of the first matching column, in
        the iteration order of ``df_cols``, or ``None`` when nothing matches.
    """
    def _normalize(text):
        # str() guards against non-string labels such as integer headers.
        return re.sub(r"\s+", " ", str(text)).strip().lower()

    # Normalize the keywords once, the same way as the column names, so that
    # stray or doubled whitespace in a keyword cannot prevent a match.
    wanted = [_normalize(k) for k in keywords]

    for col in df_cols:
        name = _normalize(col)
        if all(k in name for k in wanted):
            return col
    return None

# ===== 4) Target columns identification =====
# Report label -> keywords that must all appear in the column header.
targets = {
    "Clinical Appropriateness": ["dialogue quality evaluation", "clinical appropriateness"],
    "ICF Consistency": ["dialogue quality evaluation", "icf consistency"],
    "Conversational Realism": ["dialogue quality evaluation", "conversational realism"],
}

# Report label -> matching column name in df (None when no header matches).
found_cols = {
    dimension: find_col(df.columns, needles)
    for dimension, needles in targets.items()
}

# ===== 5) Compute stats =====
# Placeholder strings that are treated as empty in addition to blank cells.
# Cells are compared against this set after strip() and lower().
EMPTY_TOKENS = {"na", "n/a", "none", "null", "nan"}


def is_empty_val(x: str) -> bool:
    """Return True when a cell is blank or holds an "empty" placeholder.

    Args:
        x: Cell content as a string.

    Returns:
        True if ``x`` is empty after stripping whitespace, or if its stripped,
        lower-cased form is one of ``EMPTY_TOKENS``; False otherwise.
    """
    s = x.strip()
    return s == "" or s.lower() in EMPTY_TOKENS


rows = []
for label, col in found_cols.items():
    if col is None:
        # No matching column: still emit a row for the dimension so the
        # summary shows it, but leave every statistic unset.
        rows.append({
            "Dimension": label,
            "Column Found": False,
            "Column Name": None,
            "Total Rows": total_rows,
            "Empty (Pass) Count": None,
            "Empty (Pass) %": None,
            "Non-Empty (Fail/Flagged) Count": None,
            "Non-Empty (Fail/Flagged) %": None,
        })
        continue

    # Turn missing cells into "" explicitly so they count as empty on their
    # own merit, not because NaN happens to stringify to a listed token.
    series = df[col].fillna("").astype(str)

    # Force a boolean mask so that ~ below is always a logical NOT, even when
    # the column has no rows.
    is_empty = series.map(is_empty_val).astype(bool)
    empty_count = int(is_empty.sum())
    non_empty_count = int((~is_empty).sum())
    # Percentages are relative to the total row count; NaN when there are no rows.
    empty_pct = (empty_count / total_rows * 100.0) if total_rows > 0 else np.nan
    non_empty_pct = (non_empty_count / total_rows * 100.0) if total_rows > 0 else np.nan

    rows.append({
        "Dimension": label,
        "Column Found": True,
        "Column Name": col,
        "Total Rows": total_rows,
        "Empty (Pass) Count": empty_count,               # number of passing (empty) rows
        "Empty (Pass) %": round(empty_pct, 2),           # share of passing (empty) rows
        "Non-Empty (Fail/Flagged) Count": non_empty_count,
        "Non-Empty (Fail/Flagged) %": round(non_empty_pct, 2),
    })

# One summary row per target dimension, in the order of found_cols.
summary_df = pd.DataFrame(rows)

# ===== 6) Save & print =====
out_path = Path("validation_pass_rates.csv")  # change to wherever the summary should be saved
summary_df.to_csv(out_path, index=False)

# Console report: row count, the summary table, then the absolute output path.
report_lines = (
    f"Total rows: {total_rows}",
    summary_df.to_string(index=False),
    f"\nSaved summary to: {out_path.resolve()}",
)
for line in report_lines:
    print(line)


Total rows: 256
               Dimension  Column Found                                                                                                                      Column Name  Total Rows  Empty (Pass) Count  Empty (Pass) %  Non-Empty (Fail/Flagged) Count  Non-Empty (Fail/Flagged) %
Clinical Appropriateness          True Dialogue Quality Evaluation:Clinical Appropriateness  ✅ Validate all turns ✅ Mark problematic turn(s) that violate this criteria         256                 256          100.00                               0                        0.00
         ICF Consistency          True         Dialogue Quality Evaluation: ICF Consistency  ✅ Validate all turns ✅ Mark problematic turn(s) that violate this criteria         256                 194           75.78                              62                       24.22
  Conversational Realism          True  Dialogue Quality Evaluation: Conversational Realism  ✅ Validate all turns ✅ Mark problematic turn(s) that violate th