In [None]:
# convert_excel2json.ipynb
import pandas as pd
import json
from collections import defaultdict

# Load the annotation spreadsheet.
df = pd.read_excel('sad.xlsx')
print("Excel 列名：", list(df.columns))

# Map the spreadsheet's headers onto the column names used below. Edit the keys
# if the sheet uses different headers; as written the mapping changes nothing.
df = df.rename(columns={
    'filename': 'filename',
    'actor_ID': 'actor_ID',
    'start': 'start',
    'end': 'end',
})

# Validate before the output file is opened: opening it in 'w' mode truncates
# any existing annotation file, so a malformed sheet must not get that far.
required_columns = ['filename', 'actor_ID', 'start', 'end']
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f"Excel 缺少必要的列: {missing_columns}")

# Values written when the start / end cell is blank.
# NOTE: the BVH splitting cell compares `end` against 5000 to decide whether to
# cut through the last frame, so a genuine end frame of 5000 cannot be told
# apart from a blank cell.
DEFAULT_START = 0
DEFAULT_END = 5000

# Number the rows that share a filename 1, 2, 3, ... in order of appearance.
episode_counter = defaultdict(int)

# Build every record in memory first so that a bad cell (e.g. non-numeric
# start/end) raises before anything is written, instead of leaving a
# half-written JSONL file behind.
records = []
for _, row in df.iterrows():
    base_fname = str(row["filename"])
    episode_counter[base_fname] += 1
    episode_id = episode_counter[base_fname]

    raw_start = row["start"]
    raw_end = row["end"]

    records.append({
        "filename": base_fname,                        # original name
        "newname": f"{base_fname}E{episode_id}",       # e.g. "kkk" -> "kkkE1", "kkkE2", ...
        "actor_ID": str(row["actor_ID"]),
        "start": DEFAULT_START if pd.isna(raw_start) else int(raw_start),
        "end": DEFAULT_END if pd.isna(raw_end) else int(raw_end),
        "episode_id": episode_id,                      # numeric form of the suffix
    })

output_path = 'f:/fsy/project/kae_process/kae_text/sad_annotations.jsonl'
with open(output_path, 'w', encoding='utf-8') as f:
    for record in records:
        # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("已生成标注文件：", output_path)


Excel 列名： ['filename', 'actor_ID', 'emotion', 'description', 'start', 'end']
已生成标注文件： f:/fsy/project/kae_process/kae_text/sad_annotations.jsonl


In [None]:
# filepath: f:\fsy\project\kae_process\process_bvh_by_excel.ipynb
# split bvh（修正版）
import os
import json
import re
from pathlib import Path


def load_annotations(annotation_file):
    """Read a JSONL annotation file and return its records as a list of dicts.

    Each non-blank line is parsed as one JSON value. Lines that fail to parse,
    or that parse to something other than a JSON object, are reported on stdout
    and skipped; they do not abort the read.

    Args:
        annotation_file: Path of the JSONL file to read.

    Returns:
        list[dict]: The parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    annotations = []
    # 'utf-8-sig' reads plain UTF-8 unchanged and additionally drops a leading
    # byte-order mark. With plain 'utf-8' the BOM stays glued to the first line,
    # json.loads rejects it, and the first annotation is lost.
    with open(annotation_file, 'r', encoding='utf-8-sig') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                anno = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"第 {line_num} 行 JSON 解析失败: {e}")
                continue
            # Callers index records by string key, so anything other than a
            # JSON object would only fail later with a less helpful error.
            if not isinstance(anno, dict):
                print(f"第 {line_num} 行不是 JSON 对象，已跳过")
                continue
            annotations.append(anno)
    print(f"共读取到 {len(annotations)} 条标注")
    return annotations


# `end` value meaning "cut through the last frame of the source file". The
# Excel-to-JSONL conversion cell writes it when the `end` cell is blank.
# NOTE: a genuine end frame of 5000 is indistinguishable from this sentinel.
_END_OF_CLIP_SENTINEL = 5000


def _cut_bvh_segment(content, start, end, source):
    """Return BVH text that keeps only frames ``start``..``end`` of ``content``.

    Args:
        content: Full text of the source BVH file.
        start: Index of the first frame to keep (0-based).
        end: Index of the last frame to keep (inclusive). Values past the last
            frame are clamped; ``_END_OF_CLIP_SENTINEL`` selects the last frame.
        source: Label of the source file, used only in error messages.

    Returns:
        str: Original header, then a MOTION block whose ``Frames:`` line is
        rewritten and whose ``Frame Time:`` line is copied from the source.

    Raises:
        ValueError: If the MOTION / Frames / Frame Time lines cannot be found or
            parsed, or if the requested frame range is empty.
    """
    lines = content.split("\n")

    # Match the MOTION keyword as a whole line. A substring search would also
    # hit names inside the hierarchy (e.g. a joint called "MOTION_ROOT") and
    # cut the header off at that point.
    motion_idx = next(
        (idx for idx, line in enumerate(lines) if line.strip() == "MOTION"), -1
    )
    if motion_idx == -1:
        raise ValueError(f"文件中没有 MOTION 段: {source}")

    # Find the "Frames:" and "Frame Time:" lines that follow MOTION.
    frames_line_idx = -1
    frame_time_idx = -1
    for idx in range(motion_idx + 1, len(lines)):
        lowered = lines[idx].strip().lower()
        if frames_line_idx == -1 and lowered.startswith("frames:"):
            frames_line_idx = idx
        elif frame_time_idx == -1 and lowered.startswith("frame time:"):
            frame_time_idx = idx
        if frames_line_idx != -1 and frame_time_idx != -1:
            break

    if frames_line_idx == -1 or frame_time_idx == -1:
        raise ValueError(f"未找到 Frames 或 Frame Time 行: {source}")

    frames_match = re.search(r"frames:\s*(\d+)", lines[frames_line_idx], re.IGNORECASE)
    if not frames_match:
        raise ValueError(f"无法解析帧数: {lines[frames_line_idx]}")
    declared_frames = int(frames_match.group(1))

    # Drop blank lines (typically the empty string after the final newline) so
    # they are never counted, or written out, as frames.
    data_lines = [line for line in lines[frame_time_idx + 1:] if line.strip()]
    motion_data = data_lines[:declared_frames]

    total_frames = declared_frames
    if len(motion_data) < declared_frames:
        print(f"  警告: 实际运动数据行数({len(motion_data)}) < Frames({declared_frames})，按实际行数为准")
        total_frames = len(motion_data)

    if end == _END_OF_CLIP_SENTINEL:
        end_idx = total_frames - 1
    else:
        end_idx = min(end, total_frames - 1)

    # The kept range is [start, end_idx], both ends inclusive.
    if start < 0 or start > end_idx:
        raise ValueError(f"无效帧区间 start={start}, end={end_idx}, total_frames={total_frames}")

    segment_data = motion_data[start: end_idx + 1]

    header_text = "\n".join(lines[:motion_idx]).rstrip("\n")
    motion_block = [
        "MOTION",
        f"Frames: {len(segment_data)}",
        lines[frame_time_idx].strip(),   # frame time is copied unchanged
    ] + segment_data
    return header_text + "\n" + "\n".join(motion_block) + "\n"


def split_bvh_by_annotation(annotation_file, bvh_base_dir, output_base_dir):
    """Cut BVH files into per-annotation segments.

    For every record returned by ``load_annotations(annotation_file)`` the file
    ``bvh_base_dir/<filename>.bvh`` is read, frames ``start``..``end``
    (inclusive) are extracted, and the result is written to
    ``output_base_dir/<newname>.bvh``. A record that cannot be processed is
    reported on stdout and skipped; the remaining records are still processed.

    Args:
        annotation_file: Path of the JSONL annotation file.
        bvh_base_dir: Directory that contains the source ``.bvh`` files.
        output_base_dir: Directory for the segment files; created if missing.

    Returns:
        None. Progress and a final summary are printed.
    """
    annotations = load_annotations(annotation_file)
    if not annotations:
        print("没有有效标注，退出")
        return

    bvh_base_dir = Path(bvh_base_dir)
    output_base_dir = Path(output_base_dir)
    output_base_dir.mkdir(parents=True, exist_ok=True)

    processed = 0

    for i, anno in enumerate(annotations, 1):
        # A record with a missing or non-numeric field is skipped like any
        # other per-item failure instead of aborting the whole batch.
        try:
            src_name = anno["filename"]      # source BVH name without extension
            new_name = anno["newname"]       # segment BVH name without extension
            start = int(anno["start"])
            end = int(anno["end"])
        except (KeyError, TypeError, ValueError) as e:
            print(f"\n[{i}/{len(annotations)}] 标注字段缺失或无效，已跳过: {e!r}")
            continue

        print(f"\n[{i}/{len(annotations)}] 处理 {src_name}.bvh -> {new_name}.bvh  [{start}, {end}]")

        bvh_path = bvh_base_dir / f"{src_name}.bvh"
        if not bvh_path.exists():
            print(f"  找不到 BVH 文件: {bvh_path}")
            continue

        try:
            # errors="ignore" drops undecodable bytes rather than failing the file.
            with open(bvh_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()
        except Exception as e:
            print(f"  读取 BVH 失败: {bvh_path}  错误: {e}")
            continue

        try:
            new_bvh_text = _cut_bvh_segment(content, start, end, bvh_path)
        except ValueError as e:
            print(f"  {e}")
            continue

        out_path = output_base_dir / f"{new_name}.bvh"

        try:
            with open(out_path, "w", encoding="utf-8") as f_out:
                f_out.write(new_bvh_text)
            processed += 1
            print(f"  已保存: {out_path}")
        except Exception as e:
            print(f"  写出 BVH 失败: {out_path}  错误: {e}")

    print("\n" + "=" * 50)
    print(f"处理完成，成功保存 {processed}/{len(annotations)} 个片段")
    print(f"输出目录: {output_base_dir}")
    print("=" * 50)


def main(annotation_file=None, bvh_base_dir=None, output_dir=None):
    """Split the annotated BVH files and print the paths being used.

    Every argument is optional; an omitted argument falls back to the path this
    notebook was originally written for, so calling ``main()`` with no
    arguments behaves as before.

    Args:
        annotation_file: JSONL annotation file to read.
        bvh_base_dir: Directory holding the source ``.bvh`` files directly
            (e.g. ``F01SA0V2.bvh``).
        output_dir: Directory that receives the segment files; created if missing.
    """
    if annotation_file is None:
        annotation_file = r"F:\fsy\project\kae_process\kae_text\sad_annotations.jsonl"
    if bvh_base_dir is None:
        bvh_base_dir = r"F:\fsy\project\sad"
    if output_dir is None:
        output_dir = r"F:\fsy\project\kae_process\split_bvh\sad"

    os.makedirs(output_dir, exist_ok=True)

    print("开始分割 BVH ...")
    print(f"标注文件: {annotation_file}")
    print(f"BVH 根目录: {bvh_base_dir}")
    print(f"输出目录: {output_dir}")

    split_bvh_by_annotation(annotation_file, bvh_base_dir, output_dir)


# In a notebook kernel __name__ is "__main__", so running this cell starts the split.
if __name__ == "__main__":
    main()

开始分割 BVH ...
标注文件: F:\fsy\project\kae_process\kae_text\sad_annotations.jsonl
BVH 根目录: F:\fsy\project\sad
输出目录: F:\fsy\project\kae_process\split_bvh\sad
共读取到 115 条标注

[1/115] 处理 F01/F01SA0V2.bvh -> F01SA0V2E1.bvh  [130, 407]
  已保存: F:\fsy\project\kae_process\split_bvh\sad\F01SA0V2E1.bvh

[2/115] 处理 F01/F01SA0V2.bvh -> F01SA0V2E2.bvh  [407, 820]
  已保存: F:\fsy\project\kae_process\split_bvh\sad\F01SA0V2E2.bvh

[3/115] 处理 F01/F01SA1V1.bvh -> F01SA1V1E1.bvh  [100, 557]
  已保存: F:\fsy\project\kae_process\split_bvh\sad\F01SA1V1E1.bvh

[4/115] 处理 F01/F01SA1V1.bvh -> F01SA1V1E2.bvh  [557, 1030]
  已保存: F:\fsy\project\kae_process\split_bvh\sad\F01SA1V1E2.bvh

[5/115] 处理 F01/F01SA1V3.bvh -> F01SA1V3E1.bvh  [0, 816]
  已保存: F:\fsy\project\kae_process\split_bvh\sad\F01SA1V3E1.bvh

[6/115] 处理 F01/F01SA3V1.bvh -> F01SA3V1E1.bvh  [0, 470]
  已保存: F:\fsy\project\kae_process\split_bvh\sad\F01SA3V1E1.bvh

[7/115] 处理 F01/F01SA3V1.bvh -> F01SA3V1E2.bvh  [470, 596]
  已保存: F:\fsy\project\kae_process\split_bvh\sad