## Install package

In [None]:
# Install the Hugging Face `datasets` library that the dataset-building cell imports.
# NOTE(review): the version is not pinned here; the captured output shows 3.5.0 was resolved.
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

## VCSum Dataset

In [None]:
# prompt: use git to get https://github.com/hahahawu/VCSum.git
# Clone the VCSum repository into ./VCSum (relative to the current working directory).

!git clone https://github.com/hahahawu/VCSum.git

Cloning into 'VCSum'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 43 (delta 13), reused 33 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (43/43), 12.88 MiB | 16.34 MiB/s, done.
Resolving deltas: 100% (13/13), done.


In [None]:
# Move into the cloned repository so the relative "vcsum_data" path used by the
# build cell resolves. NOTE(review): /content is an absolute path — presumably
# the Colab working directory; adjust when running elsewhere.
%cd /content/VCSum

/content/VCSum


In [None]:
#!/usr/bin/env python3
"""
build_hf_dataset.py
把 vcsum_data/ 轉成 HuggingFace DatasetDict（train / dev / test）。
"""

import json
import pathlib
from typing import Any, Dict, List, Tuple

import datasets         # pip install datasets

# ────────────────────── configuration
# Directory holding the VCSum data files, relative to the working directory.
ROOT = pathlib.Path("vcsum_data")

# Split names; each one maps to a long_<split>.txt / short_<split>.txt file pair.
SPLITS = ["train", "dev", "test"]

# Column schema that every split is cast to after it is assembled.
FEATURES = datasets.Features(
    {
        "id": datasets.Value("string"),
        "av_num": datasets.Value("int32"),
        "context": datasets.Value("string"),
        "summary": datasets.Value("string"),
        "agenda": datasets.Sequence(datasets.Value("string")),
        "discussion": datasets.Sequence(datasets.Value("string")),
        "eos_index": datasets.Sequence(datasets.Value("int32")),
        "highlights": datasets.Sequence(datasets.Value("int8")),
        "split": datasets.Value("string"),
    }
)

# ────────────────────── 讀檔工具
def load_jsonl(path: pathlib.Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file and return its decoded records in file order.

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped. They used to be returned as empty dicts, which
    made callers that index ``record["id"]`` fail with ``KeyError``.

    Args:
        path: Location of the UTF-8 encoded JSON Lines file.

    Returns:
        One decoded JSON value per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records: List[Dict[str, Any]] = []
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            # Skip padding lines instead of emitting placeholder records.
            if line.strip():
                records.append(json.loads(line))
    return records

# ────────────────────── 型別修正工具
def ensure_str_list(obj: Any) -> List[str]:
    """Flatten ``obj`` into a flat list of strings.

    ``None`` contributes nothing, a list is flattened recursively (nested
    lists at any depth), and any other value becomes a single ``str(value)``
    entry.

    Args:
        obj: ``None``, a scalar, or an arbitrarily nested list.

    Returns:
        The stringified leaves in their original left-to-right order.
    """
    if isinstance(obj, list):
        collected: List[str] = []
        for element in obj:
            # Each element may itself be a list, a scalar, or None.
            collected += ensure_str_list(element)
        return collected
    return [] if obj is None else [str(obj)]

def ensure_int_list(obj: Any) -> List[int]:
    """Flatten ``obj`` into a flat list of ints, skipping unconvertible values.

    ``None`` entries are ignored, lists are walked recursively, and every
    other value is passed through ``int()``. Values that ``int()`` rejects are
    dropped on a best-effort basis. This now includes infinities, which raise
    ``OverflowError`` rather than ``ValueError`` and previously escaped the
    handler even though NaN was skipped.

    Args:
        obj: ``None``, a scalar, or an arbitrarily nested list.

    Returns:
        The successfully converted integers in left-to-right order.
    """
    flat: List[int] = []

    def _walk(o: Any) -> None:
        if o is None:
            return
        if isinstance(o, list):
            for item in o:
                _walk(item)
            return
        try:
            flat.append(int(o))
        except (TypeError, ValueError, OverflowError):
            # Best effort: anything that is not int-convertible is skipped.
            pass

    _walk(obj)
    return flat

def context_to_string(ctx: Any) -> str:
    """Return ``ctx`` as a single string.

    A string is returned unchanged. Anything else is flattened to its string
    leaves with ``ensure_str_list`` and joined with newline characters.

    Args:
        ctx: A string, ``None``, a scalar, or an arbitrarily nested list.

    Returns:
        The original string, or the flattened leaves joined by ``"\\n"``.
    """
    if not isinstance(ctx, str):
        return "\n".join(ensure_str_list(ctx))
    return ctx

# ────────────────────── 建立全文 / highlight 對照表
def build_lookup_maps() -> Tuple[
    Dict[Tuple[str, int], str],
    Dict[Tuple[str, int], List[int]]
]:
    """Build the full-context and highlight lookup tables.

    Reads ``overall_context.txt`` and ``overall_highlights.txt`` under
    ``ROOT``. Both tables are keyed by the ``(id, av_num)`` pair of each
    record; when a key occurs more than once, the last record wins.

    Returns:
        A ``(contexts, highlights)`` pair: contexts maps a key to the context
        rendered as one string, highlights maps a key to a flat list of ints.

    Raises:
        KeyError: If a record lacks ``id``, ``av_num``, or its payload field.
    """
    contexts: Dict[Tuple[str, int], str] = {}
    for record in load_jsonl(ROOT / "overall_context.txt"):
        key = (record["id"], record["av_num"])
        contexts[key] = context_to_string(record["context"])

    highlights: Dict[Tuple[str, int], List[int]] = {}
    for record in load_jsonl(ROOT / "overall_highlights.txt"):
        key = (record["id"], record["av_num"])
        highlights[key] = ensure_int_list(record["highlights"])

    return contexts, highlights

# ────────────────────── 主流程
def main() -> None:
    """Assemble every split into a ``DatasetDict`` and print a preview.

    For each name in ``SPLITS`` the ``long_<split>.txt`` and
    ``short_<split>.txt`` files are indexed by ``(id, av_num)``. One row is
    produced per short record. The summary comes from the matching long record
    (empty string when there is none). The context comes from the overall
    lookup table, falling back to the short record's context and then the long
    record's. Each split is cast to ``FEATURES``.
    """
    full_contexts, highlight_lookup = build_lookup_maps()

    def _index_by_key(path: pathlib.Path) -> Dict[Tuple[str, int], Dict[str, Any]]:
        # Index a JSON Lines file by (id, av_num); later duplicates overwrite earlier ones.
        return {(rec["id"], rec["av_num"]): rec for rec in load_jsonl(path)}

    splits: Dict[str, datasets.Dataset] = {}
    for split_name in SPLITS:
        long_by_key = _index_by_key(ROOT / f"long_{split_name}.txt")
        short_by_key = _index_by_key(ROOT / f"short_{split_name}.txt")

        records = []
        for key, short_rec in short_by_key.items():
            meeting_id, av_num = key
            long_rec = long_by_key.get(key, {})
            # Used only when the overall context table has no entry for this key.
            fallback_context = context_to_string(
                short_rec.get("context", long_rec.get("context", ""))
            )
            records.append(
                {
                    "id": meeting_id,
                    "av_num": int(av_num),
                    "context": full_contexts.get(key, fallback_context),
                    "summary": long_rec.get("summary", ""),
                    "agenda": ensure_str_list(short_rec.get("agenda")),
                    "discussion": ensure_str_list(short_rec.get("discussion")),
                    "eos_index": ensure_int_list(short_rec.get("eos_index")),
                    "highlights": highlight_lookup.get(key, []),
                    "split": split_name,
                }
            )

        built = datasets.Dataset.from_list(records)
        splits[split_name] = built.cast(FEATURES)

    dataset = datasets.DatasetDict(splits)
    print(dataset)              # overall structure
    print(dataset["train"][0])  # sample row as a sanity check

    # Optional: upload to the Hugging Face Hub.
    # dataset.push_to_hub("your_username/vcsum-meeting-summary",
    #                     max_shard_size="500MB")

# Run the build when this code is executed directly (script or notebook cell),
# but not when it is imported as a module.
if __name__ == "__main__":
    main()


Casting the dataset:   0%|          | 0/1088 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/135 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/136 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'av_num', 'context', 'summary', 'agenda', 'discussion', 'eos_index', 'highlights', 'split'],
        num_rows: 1088
    })
    dev: Dataset({
        features: ['id', 'av_num', 'context', 'summary', 'agenda', 'discussion', 'eos_index', 'highlights', 'split'],
        num_rows: 135
    })
    test: Dataset({
        features: ['id', 'av_num', 'context', 'summary', 'agenda', 'discussion', 'eos_index', 'highlights', 'split'],
        num_rows: 136
    })
})
{'id': '1_0', 'av_num': 249032688, 'context': '下一位要向大家重磅介绍的是我们云南凤云商贸有限公司董事长丁凤云。\n丁总欢迎您。\n其实三位老师我不知道大家有没有人认识，应该是认识的会比较偏多一些，因为三位老师是我们珠宝界定海神针一般存在的人物。\n今天到此就是为了向大家宣传和推广咱们这个石林海域，也是我们一种近年新发现的新的意识，正是我们珠宝人这一种不断探索不止创新的这种精神，让我们无此又一次的发掘新品。\n我在这里也代表广大的玉石爱好者，感谢三位老师的匠心精神，也感谢三位老师对推广石英彩玉文化而做出的贡献。\n谢谢。\n您好的，那今天我们的论坛在这里的正式开始了，今天在我们的现场是有几个小问题要问到我们的老师，首先其实我很好奇，就是我们云南是一个预持旅游的发展大势。\n那么立足于这样的一个形式，挖掘石林彩域的文化价值及推广石林彩域，您认为是应该如何来进行呢？\n老师可以发表一下自己的观点。\n石英彩玉在玉石这个大家当中是一个只有十几年年龄的新的一种。\n我旁边这位是我们中国的翡翠话题，在我进入这个行