## install package

In [1]:
!pip install -U datasets

Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Using cached datasets-3.6.0-py3-none-any.whl (491 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.2.0
    Uninstalling datasets-3.2.0:
      Successfully uninstalled datasets-3.2.0
Successfully installed datasets-3.6.0


In [2]:
!pip install fsspec==2023.9.2

Collecting fsspec==2023.9.2
  Using cached fsspec-2023.9.2-py3-none-any.whl.metadata (6.7 kB)
Using cached fsspec-2023.9.2-py3-none-any.whl (173 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.9.0
    Uninstalling fsspec-2024.9.0:
      Successfully uninstalled fsspec-2024.9.0
Successfully installed fsspec-2023.9.2


In [3]:
!pip install -U transformers

Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.31.4-py3-none-any.whl.metadata (13 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Using cached huggingface_hub-0.31.4-py3-none-any.whl (489 kB)
Installing collected packages: huggingface-hub, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface_hub 0.27.1
    Uninstalling huggingface_hub-0.27.1:
      Successfully uninstalled huggingface_hub-0.27.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.0
    Uninstalling transformers-4.48.0:
      Successfully uninstalled transformers-4.48.0
Successfully installed huggingface-hub-0.31.4 transformers-4.51.3


In [4]:
!pip install opencc

Collecting opencc
  Using cached OpenCC-1.1.9-cp311-cp311-win_amd64.whl.metadata (14 kB)
Using cached OpenCC-1.1.9-cp311-cp311-win_amd64.whl (1.8 MB)
Installing collected packages: opencc
Successfully installed opencc-1.1.9


In [12]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/11.1 MB 6.1 MB/s eta 0:00:02
   ------- -------------------------------- 2.1/11.1 MB 5.1 MB/s eta 0:00:02
   -------- ------------------------------- 2.4/11.1 MB 5.0 MB/s eta 0:00:02
   -------------- ------------------------- 3.9/11.1 MB 4.7 MB/s eta 0:00:02
   ------------------- -------------------- 5.5/11.1 MB 5.4 MB/s eta 0:00:02
   -------------------------- ------------- 7.3/11.1 MB 6.0 MB/s eta 0:00:01
   

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.6.3 requires patsy>=0.5.1, which is not installed.
category-encoders 2.6.3 requires statsmodels>=0.9.0, which is not installed.
pyod 1.1.3 requires numba>=0.51, which is not installed.
sktime 0.28.0 requires scikit-base<0.8.0, which is not installed.
tbats 1.1.3 requires pmdarima, which is not installed.
sktime 0.28.0 requires numpy<1.27,>=1.21, but you have numpy 2.2.2 which is incompatible.
sktime 0.28.0 requires scikit-learn<1.5.0,>=0.24, but you have scikit-learn 1.6.1 which is incompatible.


In [6]:
from huggingface_hub import login
login()  # 跳出提示，貼上你的 Token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## dataset prepare

In [7]:
from datasets import load_dataset, Dataset
import pandas as pd
import itertools
import opencc
import random

# 1. streaming 抽樣 function
def sample_streaming(dataset_name, split, num_samples):
    stream = load_dataset(dataset_name, split=split, streaming=True)
    sampled = list(itertools.islice(stream, num_samples))
    df = pd.DataFrame(sampled)
    col = "text" if "text" in df.columns else df.columns[0]
    df = df[[col]].rename(columns={col: "text"})
    return df

# 2. 分別抽樣
df_trad = sample_streaming("voidful/fineweb-zhtw", "train", 55000)
df_trad["label"] = 1

df_simp = sample_streaming("opencsg/chinese-fineweb-edu", "train", 85000)
df_simp["label"] = 0

# 3. 標記已有 text，建立集合
used_texts = set(df_trad["text"]).union(set(df_simp["text"]))

# 4. 批次 streaming，過濾全新文本，直到收集夠多新資料
def sample_unique_streaming(dataset_name, split, num_samples, exclude_texts):
    stream = load_dataset(dataset_name, split=split, streaming=True)
    result = []
    for sample in stream:
        t = sample["text"] if "text" in sample else list(sample.values())[0]
        if t not in exclude_texts:
            result.append(t)
            if len(result) >= num_samples:
                break
    return pd.DataFrame({"text": result})

# 5. 重新 streaming 不重複的混合資料
df_mixed_raw = sample_unique_streaming(
    "voidful/fineweb-zhtw", "train", 40000, used_texts
)
converter = opencc.OpenCC('t2s.json')

def random_hybrid_segments(trad_text, segment_num=2):
    trad_chars = list(trad_text)
    simp_chars = list(converter.convert(trad_text))
    seg_lens = [max(1, len(trad_chars) // (segment_num * 2))] * segment_num
    start_points = sorted(random.sample(range(len(trad_chars)), segment_num))
    hybrid = trad_chars.copy()
    for idx in start_points:
        for i in range(idx, min(idx + seg_lens[0], len(trad_chars))):
            hybrid[i] = simp_chars[i]
    return "".join(hybrid)

df_mixed_raw["text"] = df_mixed_raw["text"].apply(lambda x: random_hybrid_segments(x, segment_num=3))
df_mixed_raw["label"] = 2
df_mixed = df_mixed_raw

# 6. 合併&亂數排序
df_total = pd.concat([df_trad, df_simp, df_mixed], ignore_index=True) \
             .sample(frac=1.0, random_state=42) \
             .reset_index(drop=True)

# 7. 轉 Hugging Face Dataset
dataset = Dataset.from_pandas(df_total[["text", "label"]])


README.md:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Resolving data files:   0%|          | 0/322 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/322 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/19.6k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Resolving data files:   0%|          | 0/308 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/322 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/322 [00:00<?, ?it/s]

In [8]:
# prompt:  dataset.select label == 2 range 5

a5 = dataset.filter(lambda x: x['label'] == 2).select(range(5))
for i, ex in enumerate(a5):
    print(f"例 {i+1} (label={ex['label']}):")
    print(ex["text"][:100], "...")  # 只看前 100 字
    print("-"*40)

Filter:   0%|          | 0/180000 [00:00<?, ? examples/s]

例 1 (label=2):
又到了跟老婆約定每年出國渡假的時候了
可憐我的荷包又要大失血了~
為了排行程小弟可是耗費苦心啊
又要安排行程，還得預定住宿房間
不過今年就不用像以往那麼麻煩了
因為小弟找到了一家訂房網站，不但能訂房間 ...
----------------------------------------
例 2 (label=2):
臨危受命 「橋」楚出擊
2015-03-08 記者 陳亞柔 報導
此次乙未梅竹橋藝正式賽進入第二天，清大卻發生選手遲到而無法順利開始比賽的緊急狀況，臨時換上選手做為替補，此次臨陣上場的選手，正是目前就 ...
----------------------------------------
例 3 (label=2):
『掌中驚奇』─傳統布袋戲文化藝術體驗營」亦宛然7/10-7/11遊學台灣活動於5/16開始報名
16, May 2010 11:26
台湾传统布袋戏是极具活力与独特性的表演艺术，亦为常民生活文化中极为 ...
----------------------------------------
例 4 (label=2):
- 遠東銀行債務協商 和民間借貸利息行情必看的貸款知識
- 循環理財貸款 和民間小额借款找这家贷最划算
- 贷款买车 和首购贷款条件贷款重点相关资讯
- 债务协商会影响信用 和缺钱怎么办免费银行利率试 ...
----------------------------------------
例 5 (label=2):
國際級保育類動物西伯利亞小白鶴來台停留150天，日前因三立電視台劇組到小白鶴的停留處附近拍《聽見幸福》，發出救護車鳴笛和巨大槍響，小白鶴受到驚嚇飛走了，事後發現劇組沒有申請路權就封路拍攝，讓網友氣得直 ...
----------------------------------------


In [9]:
# 已經合併成 dataset 之後
first5 = dataset.select(range(5))
for i, ex in enumerate(first5):
    print(f"例 {i+1} (label={ex['label']}):")
    print(ex["text"][:100], "...")  # 只看前 100 字
    print("-"*40)


例 1 (label=0):
在动物学分类上,猪属于哺乳动物纲,偶蹄目,非反刍亚目,猪科,猪亚科,猪属,猪属中包括野猪和家猪.据"中国畜禽遗传资源志・猪志"记载,现代家猪的祖先并不是现代野猪,而是古代野猪.古代野猪的起源不是一个中 ...
----------------------------------------
例 2 (label=0):
春耕时节,大地回暖.连日来,富川瑶族自治县的田间地头,随处可见村民们忙着翻地,起垄,盖膜种植芋头的忙碌景象.
近年来,富川将香芋种植作为乡村振兴新兴主导产业来抓,及时出台补助鼓励政策,规定连片规模种植 ...
----------------------------------------
例 3 (label=1):
notifications發表彈幕時請顧及他人觀感，若被檢舉的次數過多，將會自動進入系統黑名單。
上架時間：2016-01-08 11:21:25
人類運用有著高度進化情報技術「瑪娜」的魔法之力克服了 ...
----------------------------------------
例 4 (label=1):
行政院今 (20) 日下午召開前瞻基礎建設記者會，但僅發布水環境建設與綠能建設等部分內容，並未公布投資細項，引起外界質疑相關內容「完全讓人聽不懂，讓人看不到想像願景」的聲浪，加上政院今天也無法說明前瞻 ...
----------------------------------------
例 5 (label=0):
源趋紧,环保逐渐成为"硬杠杠",畜牧业发展迅猛的同时,如何走绿色生态之路?9月20日,在第28届中原畜牧业交易博览会期间举行的生态畜牧业发展高峰论坛上,与会专家和政企界人士纷纷建言献策,探讨畜牧业发展 ...
----------------------------------------


## tokenizer

In [10]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese")


tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = dataset.to_pandas()
# 1. 讀入資料
df_train, df_val = train_test_split(df, test_size=0.2, stratify=df_total['label'], random_state=42)

# 2. 轉換為 HuggingFace DatasetDict
raw_datasets = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "validation": Dataset.from_pandas(df_val)
})

# 3. Tokenize（保留欄位）
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

encoded_dataset = raw_datasets.map(tokenize, batched=True)


Map:   0%|          | 0/144000 [00:00<?, ? examples/s]

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

## Finetune

In [14]:
labels = set(dataset["label"])
print("所有 label:", labels)
print("最大:", max(labels), "最小:", min(labels))

所有 label: {0, 1, 2}
最大: 2 最小: 0


In [15]:
# 若使用 pandas DataFrame
labels_set = set(encoded_dataset["train"]["label"])
print("所有 label:", labels_set)
assert labels_set.issubset({0, 1, 2}), "labels 有超出 0/1/2 的數值"

import numpy as np
print("有無 NaN:", np.isnan(encoded_dataset["train"]["label"]).any())
print("dtype:", type(encoded_dataset["train"]["label"][0]))
encoded_dataset = encoded_dataset.map(lambda x: {"label": int(x["label"])})

所有 label: {0, 1, 2}
有無 NaN: False
dtype: <class 'int'>


Map:   0%|          | 0/144000 [00:00<?, ? examples/s]

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("ckiplab/bert-base-chinese", num_labels=3, hidden_dropout_prob=0.3)

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./model_ckpt",
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    weight_decay=0.08,
    warmup_ratio=0.1,
    save_total_limit=2,
    fp16=True,
    seed=42,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()


pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.9171
1000,0.3886
1500,0.3022
2000,0.2895
2500,0.2957
3000,0.2829
3500,0.2808
4000,0.2804
4500,0.2767
5000,0.2923


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

TrainOutput(global_step=90000, training_loss=0.4242975630018446, metrics={'train_runtime': 8653.6415, 'train_samples_per_second': 83.202, 'train_steps_per_second': 10.4, 'total_flos': 4.736041519104e+16, 'train_loss': 0.4242975630018446, 'epoch': 5.0})

In [17]:
model.save_pretrained("./bert-zh-tw-classifier")
tokenizer.save_pretrained("./bert-zh-tw-classifier")

('./bert-zh-tw-classifier\\tokenizer_config.json',
 './bert-zh-tw-classifier\\special_tokens_map.json',
 './bert-zh-tw-classifier\\vocab.txt',
 './bert-zh-tw-classifier\\added_tokens.json',
 './bert-zh-tw-classifier\\tokenizer.json')

In [18]:
trainer.save_model("./model_ckpt")  # 這會同時儲存 model 和 config.json
tokenizer.save_pretrained("./model_ckpt")


('./model_ckpt\\tokenizer_config.json',
 './model_ckpt\\special_tokens_map.json',
 './model_ckpt\\vocab.txt',
 './model_ckpt\\added_tokens.json',
 './model_ckpt\\tokenizer.json')

In [31]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

classifier = pipeline("text-classification", model="./model_ckpt", tokenizer="./model_ckpt")
classifier("欢迎来到台灣")


Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.6093946099281311}]

In [20]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.375386118888855, 'eval_runtime': 104.6627, 'eval_samples_per_second': 343.962, 'eval_steps_per_second': 42.995, 'epoch': 5.0}


## PUSH

In [22]:
!pip install --upgrade huggingface_hub transformers
from huggingface_hub import notebook_login

notebook_login()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
from transformers import AutoTokenizer

# 儲存模型與 tokenizer
model.save_pretrained("./model_ckpt")
tokenizer.save_pretrained("./model_ckpt")

# 推送到 Hugging Face
from huggingface_hub import HfApi, create_repo, upload_folder

# repo_id 建議格式："username/model-name"
repo_id = "renhehuang/bert-base-chinese-traditional-classifier-v2-180k"
create_repo(repo_id, private=False)  # 若要私有改成 private=True

# 將整個資料夾上傳
upload_folder(
    folder_path="./model_ckpt",
    repo_id=repo_id,
    commit_message="Initial commit"
)


scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/818M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Upload 14 LFS files:   0%|          | 0/14 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/818M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/renhehuang/bert-base-chinese-traditional-classifier-v2-180k/commit/e58345b78b57336f8b57c2a3c977e1b031118d90', commit_message='Initial commit', commit_description='', oid='e58345b78b57336f8b57c2a3c977e1b031118d90', pr_url=None, repo_url=RepoUrl('https://huggingface.co/renhehuang/bert-base-chinese-traditional-classifier-v2-180k', endpoint='https://huggingface.co', repo_type='model', repo_id='renhehuang/bert-base-chinese-traditional-classifier-v2-180k'), pr_revision=None, pr_num=None)

## test

In [25]:
import random
import pandas as pd

trad_samples = [
    "這是一個繁體中文的測試句子。", "歡迎來到台灣！", "請問您需要什麼協助？",
    "我們今天學習了很多知識。", "天氣很好，適合出門散步。", "你喜歡吃什麼台灣小吃？",
    "這本書很有趣，推薦你看看。", "捷運系統非常方便。", "週末一起去看電影好嗎？", "我在圖書館讀書。"
]
simp_samples = [
    "这是一个简体中文的测试句子。", "欢迎来到中国！", "请问您需要什么帮助？",
    "我们今天学到了很多知识。", "天气很好，适合出去散步。", "你喜欢吃什么中国小吃？",
    "这本书很有趣，推荐你看看。", "地铁系统非常方便。", "周末一起去看电影好吗？", "我在图书馆学习。"
]

def random_samples(source, n):
    return [random.choice(source) for _ in range(n)]

def mixed_samples(trad_source, simp_source, n, trad_ratio):
    n_trad = int(n * trad_ratio)
    n_simp = n - n_trad
    return random_samples(trad_source, n_trad) + random_samples(simp_source, n_simp)

trad_data = random_samples(trad_samples, 50)
simp_data = random_samples(simp_samples, 50)
mix1_data = mixed_samples(trad_samples, simp_samples, 50, trad_ratio=0.7)
random.shuffle(mix1_data)
mix2_data = mixed_samples(trad_samples, simp_samples, 50, trad_ratio=0.3)
random.shuffle(mix2_data)

test_df = pd.DataFrame({
    "group": ["trad"]*50 + ["simp"]*50 + ["mix1"]*50 + ["mix2"]*50,
    "text": trad_data + simp_data + mix1_data + mix2_data
})


In [26]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./model_ckpt", tokenizer="./model_ckpt", device=0)  # CUDA
results = [classifier(t)[0] for t in test_df["text"]]
test_df["pred_label"] = [r["label"] for r in results]
test_df["score"] = [r["score"] for r in results]


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [29]:
results

[{'label': 'LABEL_1', 'score': 0.6093934774398804},
 {'label': 'LABEL_1', 'score': 0.6093936562538147},
 {'label': 'LABEL_1', 'score': 0.609393298625946},
 {'label': 'LABEL_1', 'score': 0.6093933582305908},
 {'label': 'LABEL_1', 'score': 0.6093933582305908},
 {'label': 'LABEL_1', 'score': 0.6093934774398804},
 {'label': 'LABEL_1', 'score': 0.6093934774398804},
 {'label': 'LABEL_1', 'score': 0.6093933582305908},
 {'label': 'LABEL_1', 'score': 0.6093934774398804},
 {'label': 'LABEL_1', 'score': 0.6093934774398804},
 {'label': 'LABEL_1', 'score': 0.6093933582305908},
 {'label': 'LABEL_1', 'score': 0.6093936562538147},
 {'label': 'LABEL_1', 'score': 0.6093936562538147},
 {'label': 'LABEL_1', 'score': 0.6093934774398804},
 {'label': 'LABEL_1', 'score': 0.6093933582305908},
 {'label': 'LABEL_1', 'score': 0.6093933582305908},
 {'label': 'LABEL_1', 'score': 0.6093933582305908},
 {'label': 'LABEL_1', 'score': 0.6093934774398804},
 {'label': 'LABEL_1', 'score': 0.6093936562538147},
 {'label': 'L

In [28]:
test_df

Unnamed: 0,group,text,pred_label,score
0,trad,歡迎來到台灣！,LABEL_1,0.609393
1,trad,這是一個繁體中文的測試句子。,LABEL_1,0.609394
2,trad,天氣很好，適合出門散步。,LABEL_1,0.609393
3,trad,我們今天學習了很多知識。,LABEL_1,0.609393
4,trad,我們今天學習了很多知識。,LABEL_1,0.609393
...,...,...,...,...
195,mix2,这本书很有趣，推荐你看看。,LABEL_1,0.609394
196,mix2,地铁系统非常方便。,LABEL_1,0.609446
197,mix2,歡迎來到台灣！,LABEL_1,0.609393
198,mix2,歡迎來到台灣！,LABEL_1,0.609393


In [27]:
summary = test_df.groupby("group")["pred_label"].value_counts().unstack(fill_value=0)
print("分類結果分布：")
print(summary)


分類結果分布：
pred_label  LABEL_0  LABEL_1
group                       
mix1              1       49
mix2              2       48
simp              3       47
trad              0       50


In [None]:
score_stats = test_df.groupby("group")["score"].agg(["mean", "std", "min", "max"])
print("\n信心分數統計：")
print(score_stats)
