In [1]:
# load the full and mini annotation JSON files
import numpy as np
import json

# Locations of the two annotation exports read by this cell.
root = "/workspace/data/annotation"
full_path = root + "/full.json"
mini_path = root + "/mini.json"


def _load_json(path):
    """Parse one JSON file and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed even
    when parsing fails, and it is decoded as UTF-8 like the other cells of
    this notebook do.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


full_data = _load_json(full_path)
mini_data = _load_json(mini_path)

FileNotFoundError: [Errno 2] No such file or directory: '/workspace/data/annotation/full.json'

In [None]:
import json
from collections import defaultdict

def organize_labels(file_path):
    """Flatten the timeline labels of every entry and build per-label totals.

    The JSON file at ``file_path`` must contain a list of entries. For each
    entry the keys ``video``, ``annotation_id`` and ``videoLabels`` are read;
    each item of ``videoLabels`` contributes its ``timelinelabels`` (label
    names) and ``ranges`` (dicts with ``start`` and ``end``).

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        A tuple ``(organized_results, summary)``:

        * ``organized_results`` - one dict per entry with ``video_path``,
          ``annotation_id`` and ``labels``; ``labels`` is a list of
          ``{"label", "start", "end", "duration"}`` dicts sorted by ``start``.
        * ``summary`` - a ``defaultdict`` mapping label name to
          ``{"count", "total_duration"}`` accumulated over all entries.
          ``duration`` is ``end - start`` in whatever unit the file uses.

    Notes:
        ``videoLabels``, ``timelinelabels`` and ``ranges`` that are missing or
        ``null`` are treated as empty, and a range without both ``start`` and
        ``end`` is skipped instead of raising ``TypeError``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    summary = defaultdict(lambda: {"count": 0, "total_duration": 0})
    organized_results = []

    for entry in data:
        video_info = {
            "video_path": entry.get("video"),
            "annotation_id": entry.get("annotation_id"),
            "labels": []
        }

        # `or []` also covers an explicit JSON null, which .get(key, [])
        # would hand back as None and break the loop.
        for label_item in entry.get("videoLabels") or []:
            # Label names of this item (the original author notes the list
            # usually holds a single element).
            label_names = label_item.get("timelinelabels") or []

            for r in label_item.get("ranges") or []:
                start = r.get("start")
                end = r.get("end")
                if start is None or end is None:
                    # An incomplete range has no duration and cannot be sorted.
                    continue
                duration = end - start

                for name in label_names:
                    # Record the segment on this video's own list.
                    video_info["labels"].append({
                        "label": name,
                        "start": start,
                        "end": end,
                        "duration": duration
                    })

                    # Update the global per-label statistics.
                    summary[name]["count"] += 1
                    summary[name]["total_duration"] += duration

        # Present each video's segments in chronological order.
        video_info["labels"].sort(key=lambda x: x['start'])
        organized_results.append(video_info)

    return organized_results, summary

# --- Run the organizer ---
# Make sure this path points at the annotation export you want to inspect.
file_name = '/workspace/data/annotation/mini.json'
results, stats = organize_labels(file_name)

# --- Build the whole report first, then emit it with a single print ---
report = [
    "=== 标签统计摘要 ===",
    f"{'标签':<15} | {'出现次数':<10} | {'总持续时间'}",
    "-" * 45,
]
for label, info in stats.items():
    report.append(f"{label:<15} | {info['count']:<10} | {info['total_duration']}")

# NOTE(review): the heading below says "first 5" while the slice only shows
# the first video; the mismatch is kept as-is here.
report.append("\n=== 视频详细标注明细 (前5条) ===")
for vid in results[:1]:  # sample: first video only
    report.append(f"\n视频文件: {vid['video_path']}")
    for l in vid['labels'][:10]:  # sample: first 10 labelled segments
        report.append(f"  [{l['start']:>5} - {l['end']:>5}] 动作: {l['label']:<10} (时长: {l['duration']})")
    report.append("  ...")

# Joining with newlines reproduces one-print-per-line output exactly.
print("\n".join(report))

=== 标签统计摘要 ===
标签              | 出现次数       | 总持续时间
---------------------------------------------
left            | 3219       | 151533
right           | 2439       | 168421
down            | 4454       | 150484
up              | 737        | 31386
right_up        | 64         | 3365
right_down      | 90         | 3528
left_down       | 423        | 15604
left_up         | 226        | 13561

=== 视频详细标注明细 (前5条) ===

视频文件: /data/local-files/?d=mydata/drive_data/person_01_day_high_h265.mp4
  [  362 -   384] 动作: left       (时长: 22)
  [  384 -   456] 动作: right      (时长: 72)
  [  488 -   520] 动作: down       (时长: 32)
  [  579 -   622] 动作: down       (时长: 43)
  [  639 -   663] 动作: right      (时长: 24)
  [  665 -   694] 动作: down       (时长: 29)
  [  746 -   782] 动作: down       (时长: 36)
  [  795 -   846] 动作: down       (时长: 51)
  [  870 -   919] 动作: down       (时长: 49)
  [  953 -   993] 动作: down       (时长: 40)
  ...


In [None]:
# --- 按视频合并不同标注者的结果 ---
import json
import os

def merge_labels_by_video(input_json, output_dir="merged_videos"):
    """Group annotation entries by video file and write one JSON per video.

    The input file must contain a list of entries. Entries are grouped by the
    last ``/``-separated component of their ``video`` value. For each group a
    file ``<video name without extension>.json`` is written into
    ``output_dir`` with the keys ``video_file``, ``total_annotators`` (the
    number of entries in the group) and ``annotations`` (the entries,
    unmodified). Two progress lines are printed per written file.

    Args:
        input_json: Path of the JSON file to read (decoded as UTF-8).
        output_dir: Directory that receives the merged files; created if it
            does not exist.

    Returns:
        None.

    Notes:
        Entries whose ``video`` value is missing, ``null`` or empty are grouped
        under the name ``"unknown"`` instead of raising ``TypeError``.
    """
    # 1. Create the output directory; exist_ok makes re-runs safe and avoids
    #    a separate existence check.
    os.makedirs(output_dir, exist_ok=True)

    # 2. Read the raw entries.
    with open(input_json, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 3. Group entries by video name: {video name: [entry, entry, ...]}.
    #    A plain dict keeps first-seen order and needs no import from
    #    another notebook cell.
    video_groups = {}

    for entry in data:
        # `or` also covers an explicit JSON null / empty string.
        raw_path = entry.get("video") or "unknown"
        if '/' in raw_path:
            video_filename = raw_path.rsplit('/', 1)[-1]
        else:
            video_filename = os.path.basename(raw_path)
        video_groups.setdefault(video_filename, []).append(entry)

    # 4. Write one merged file per video.
    for video_name, annotations in video_groups.items():
        # Output name, e.g. person_01_day_high_h265.json
        base_name = os.path.splitext(video_name)[0]
        output_path = os.path.join(output_dir, f"{base_name}.json")
        # No-op when the parent directory already exists.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Final structure: the video name plus every entry recorded for it
        # (entries keep their own annotation_id and full content).
        final_data = {
            "video_file": video_name,
            "total_annotators": len(annotations),
            "annotations": annotations
        }

        with open(output_path, 'w', encoding='utf-8') as out_f:
            json.dump(final_data, out_f, indent=4, ensure_ascii=False)

        print(f"成功合并: {video_name} (共有 {len(annotations)} 个人的标注结果)")
        print(f"保存路径: {output_path}")

# Run the merge: read the mini annotation export and write one merged JSON
# per video into the label directory.
merge_labels_by_video(
    '/workspace/data/annotation/mini.json',
    output_dir="/workspace/data/label",
)

成功合并: person_01_day_high_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_01_day_high_h265.json
成功合并: person_01_day_low_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_01_day_low_h265.json
成功合并: person_01_night_high_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_01_night_high_h265.json
成功合并: person_01_night_low_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_01_night_low_h265.json
成功合并: person_02_day_high_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_02_day_high_h265.json
成功合并: person_02_day_low_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_02_day_low_h265.json
成功合并: person_02_night_high_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_02_night_high_h265.json
成功合并: person_02_night_low_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_02_night_low_h265.json
成功合并: person_03_day_high_h265.mp4 (共有 3 个人的标注结果)
保存路径: /workspace/data/label/person_03_day_high_h265.json
成功合并: person_03_day_low_h265.mp4 (共有 3

In [3]:
# 有些annotation只打了4个标签，有些打了8个标签。需要统计一下

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
from pathlib import Path
from collections import Counter, defaultdict

def iter_labels_in_annotation(ann: dict):
    """Generate the stripped timeline label names found in one annotation.

    Walks ``ann["videoLabels"]`` and, for each item, its ``timelinelabels``.
    Missing or ``None`` lists count as empty; values that are not strings or
    that are blank after stripping are ignored.
    """
    video_labels = ann.get("videoLabels") or []
    for item in video_labels:
        candidates = item.get("timelinelabels") or []
        yield from (
            candidate.strip()
            for candidate in candidates
            if isinstance(candidate, str) and candidate.strip()
        )

def scan(label_dir: Path):
    """Summarize, per JSON file under ``label_dir``, which labels each annotator used.

    Every ``*.json`` file found recursively is read, and for each item of its
    top-level ``annotations`` list the labels yielded by
    ``iter_labels_in_annotation`` are counted under that item's ``annotator``
    value (``"UNKNOWN"`` when the key is absent).

    Args:
        label_dir: Directory searched recursively for ``*.json`` files.

    Returns:
        A list with one dict per successfully read file, in sorted path
        order, holding:

        * ``file`` - path relative to ``label_dir.parent``;
        * ``annotators`` - annotator ids, sorted, with ``"UNKNOWN"`` last;
        * ``per_annotator`` - id -> ``{"n_classes", "classes", "counts",
          "n_total"}``;
        * ``union_classes`` / ``n_union_classes`` - labels used by any
          annotator in the file, and how many there are.

    Notes:
        Files that cannot be read or parsed, or whose top-level value is not a
        JSON object, are reported with a ``[WARN]`` line and skipped.
    """
    results = []
    for json_path in sorted(label_dir.rglob("*.json")):
        try:
            # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM.
            # Decoding a BOM file as "utf-8" does not raise
            # UnicodeDecodeError; json.loads rejects the BOM afterwards, so a
            # decode-error fallback would never be reached.
            data = json.loads(json_path.read_text(encoding="utf-8-sig"))
        except Exception as e:
            print(f"[WARN] failed to read {json_path}: {e}")
            continue

        if not isinstance(data, dict):
            # e.g. a list at top level: there is no "annotations" key to read.
            print(f"[WARN] unexpected top-level JSON type in {json_path}: {type(data).__name__}")
            continue

        # File-level tally: annotator id -> Counter(label -> occurrences).
        per_annotator_counts = defaultdict(Counter)

        for ann in data.get("annotations", []) or []:
            aid = str(ann.get("annotator", "UNKNOWN"))
            for lb in iter_labels_in_annotation(ann):
                per_annotator_counts[aid][lb] += 1

        # Convert the Counters into plain, JSON-serializable structures.
        per_annotator_summary = {}
        for aid, c in per_annotator_counts.items():
            per_annotator_summary[aid] = {
                "n_classes": len(c),
                "classes": sorted(c.keys()),
                "counts": dict(c.most_common()),   # label -> count, most frequent first
                "n_total": int(sum(c.values())),
            }

        # Every label used by at least one annotator in this file.
        union_classes = sorted({lb for c in per_annotator_counts.values() for lb in c.keys()})

        results.append({
            "file": str(json_path.relative_to(label_dir.parent)),
            # The (x == "UNKNOWN", x) key sorts ids normally but places "UNKNOWN" last.
            "annotators": sorted(per_annotator_summary.keys(), key=lambda x: (x=="UNKNOWN", x)),
            "per_annotator": per_annotator_summary,
            "union_classes": union_classes,
            "n_union_classes": len(union_classes),
        })

    return results


# Input directory holding the per-video label files, and the summary file
# that this cell writes (relative to the current working directory).
label_dir = Path("/workspace/data/multi_view_driver_action/label")
out_path = "per_file_annotator_label_stats.json"

if not label_dir.exists():
    raise SystemExit(f"label folder not found: {label_dir}")

results = scan(label_dir)

# Console report: one section per file, emitted with a single print each.
for r in results:
    lines = [f"\n=== {r['file']} ==="]
    for aid in r["annotators"]:
        s = r["per_annotator"][aid]
        lines.extend([
            f"- annotator {aid}: classes={s['n_classes']} total={s['n_total']}",
            f"  classes: {s['classes']}",
            f"  counts:  {s['counts']}",
        ])
    lines.append(f"  union({r['n_union_classes']}): {r['union_classes']}")
    print("\n".join(lines))

# Persist the same summary as indented, non-ASCII-escaped JSON.
with open(out_path, 'w', encoding='utf-8') as out_f:
    out_f.write(json.dumps(results, indent=4, ensure_ascii=False))
print(f"\n[INFO] saved summary to {out_path}")



=== label/person_01_day_high_h265.json ===
- annotator 3: classes=6 total=111
  classes: ['down', 'left', 'left_down', 'right', 'right_down', 'up']
  counts:  {'down': 81, 'right': 11, 'left': 11, 'up': 6, 'left_down': 1, 'right_down': 1}
- annotator 4: classes=6 total=85
  classes: ['down', 'left', 'left_down', 'left_up', 'right', 'right_down']
  counts:  {'down': 53, 'left_down': 13, 'right': 9, 'left': 8, 'left_up': 1, 'right_down': 1}
- annotator 5: classes=7 total=125
  classes: ['down', 'left', 'left_down', 'right', 'right_down', 'right_up', 'up']
  counts:  {'down': 82, 'right': 17, 'left': 15, 'up': 8, 'right_up': 1, 'right_down': 1, 'left_down': 1}
  union(8): ['down', 'left', 'left_down', 'left_up', 'right', 'right_down', 'right_up', 'up']

=== label/person_01_day_low_h265.json ===
- annotator 3: classes=4 total=113
  classes: ['down', 'left', 'left_down', 'right']
  counts:  {'down': 83, 'right': 18, 'left': 11, 'left_down': 1}
- annotator 4: classes=5 total=96
  classes: [