## install package

In [1]:
!pip install -U datasets



In [2]:
!pip install fsspec==2023.9.2



In [3]:
!pip install -U transformers



In [4]:
!pip install opencc



## dataset prepare

In [5]:
from datasets import load_dataset, Dataset
import pandas as pd
import itertools
import opencc
import random

# 1. streaming 抽樣 function
def sample_streaming(dataset_name, split, num_samples):
    stream = load_dataset(dataset_name, split=split, streaming=True)
    sampled = list(itertools.islice(stream, num_samples))
    df = pd.DataFrame(sampled)
    col = "text" if "text" in df.columns else df.columns[0]
    df = df[[col]].rename(columns={col: "text"})
    return df

# 2. 分別抽樣
df_trad = sample_streaming("voidful/fineweb-zhtw", "train", 15000)
df_trad["label"] = 1

df_simp = sample_streaming("opencsg/chinese-fineweb-edu", "train", 25000)
df_simp["label"] = 0

# 3. 標記已有 text，建立集合
used_texts = set(df_trad["text"]).union(set(df_simp["text"]))

# 4. 批次 streaming，過濾全新文本，直到收集夠多新資料
def sample_unique_streaming(dataset_name, split, num_samples, exclude_texts):
    stream = load_dataset(dataset_name, split=split, streaming=True)
    result = []
    for sample in stream:
        t = sample["text"] if "text" in sample else list(sample.values())[0]
        if t not in exclude_texts:
            result.append(t)
            if len(result) >= num_samples:
                break
    return pd.DataFrame({"text": result})

# 5. 重新 streaming 不重複的混合資料
df_mixed_raw = sample_unique_streaming(
    "voidful/fineweb-zhtw", "train", 10000, used_texts
)
converter = opencc.OpenCC('t2s.json')

def random_hybrid_segments(trad_text, segment_num=2):
    trad_chars = list(trad_text)
    simp_chars = list(converter.convert(trad_text))
    seg_lens = [max(1, len(trad_chars) // (segment_num * 2))] * segment_num
    start_points = sorted(random.sample(range(len(trad_chars)), segment_num))
    hybrid = trad_chars.copy()
    for idx in start_points:
        for i in range(idx, min(idx + seg_lens[0], len(trad_chars))):
            hybrid[i] = simp_chars[i]
    return "".join(hybrid)

df_mixed_raw["text"] = df_mixed_raw["text"].apply(lambda x: random_hybrid_segments(x, segment_num=3))
df_mixed_raw["label"] = 2
df_mixed = df_mixed_raw

# 6. 合併&亂數排序
df_total = pd.concat([df_trad, df_simp, df_mixed], ignore_index=True) \
             .sample(frac=1.0, random_state=42) \
             .reset_index(drop=True)

# 7. 轉 Hugging Face Dataset
dataset = Dataset.from_pandas(df_total[["text", "label"]])


Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


Resolving data files:   0%|          | 0/322 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/322 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/308 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/322 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/322 [00:00<?, ?it/s]

In [6]:
# prompt:  dataset.select label == 2 range 5

a5 = dataset.filter(lambda x: x['label'] == 2).select(range(5))
for i, ex in enumerate(a5):
    print(f"例 {i+1} (label={ex['label']}):")
    print(ex["text"][:100], "...")  # 只看前 100 字
    print("-"*40)

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

例 1 (label=2):
17th CityU BandSoc呈獻，一年一度的期末演出請來日本的math-rock樂队rega及台湾的post-rock乐队8mm sky。想当年读书时期的AP来来去去都是本地姜，如今已可邀请外 ...
----------------------------------------
例 2 (label=2):
本報北京7月2日電（實習生劉言）今天，“圓中國足球夢——加多寶中國青少年足球運動員赴阿根廷培訓計劃”在京啟動，首批15名中國小球員將從北京出發，前往阿根廷博卡青年、聖洛倫佐兩傢俱樂部接受培訓。
據介紹 ...
----------------------------------------
例 3 (label=2):
導讀：很多人女性都有頭皮屑的困擾，醫生提醒，當出現頭皮屑不僅影響外觀，也暗示著頭皮新陳代謝出問題，頭屑過多影響毛髮生長，導致掉髮。 頭髮是女人的第二生命，一定要正視頭皮屑，由內而外，對症下藥，徹底讓頭 ...
----------------------------------------
例 4 (label=2):
本情趣用品網站依電腦網際網路分級辦法列為限制級，未滿18 歲謝 絕進入！
乾脆上網找情趣用品店來購買日本WINS＊SM-Castle系列《手腿連枷》桃 ，在比較了幾家情趣用品店之後
最後我決定在Sex ...
----------------------------------------
例 5 (label=2):
八月ATC交換主題：男人
看到老師們的作品真的很棒!!於是7月份報名參加了Sun Chun 的ATC活動.
這個月是第一次交作業,
當聽到這個主題時一直在想著該要如何表現?
於是心中出現的齒輪的樣子, ...
----------------------------------------


In [7]:
# 已經合併成 dataset 之後
first5 = dataset.select(range(5))
for i, ex in enumerate(first5):
    print(f"例 {i+1} (label={ex['label']}):")
    print(ex["text"][:100], "...")  # 只看前 100 字
    print("-"*40)


例 1 (label=0):
村级党组织怎么服务好搬入动迁小区的离村农民?这是越来越多"产业大村"面临的普遍问题,安亭镇林家村对此却颇有心得.
三年前,安亭镇在高压走廊,高铁,高速公路周边建设绿带生态屏障,林家村80多户住在高铁和 ...
----------------------------------------
例 2 (label=1):
我總是滿足於，這些別人眼裡微不足道，但卻在我眼裡是十分的美好的事。
今天下午有個小會議，同事在會議前出去家訪，就回來路上順手帶了小點心。
要騎車去家訪時，在辦公室附近的路邊停車格，看到紫花羊蹄甲的葉子 ...
----------------------------------------
例 3 (label=1):
無求無慾的淒慘故事
《天浴》，好一套揚威國際的台灣金馬獎「七冠佳作」，是「難得一見」，也是「難忍一見」。
純以技術層面分析，《天浴》無論在男女主角的選角、大草原上的高難度拍攝，甚至是劇本的起承轉合，皆 ...
----------------------------------------
例 4 (label=1):
看過不少國內外關於PS曲線的學習資料，總結下來之後找到了一種簡單又合理且可以用直覺和感性去理解曲線的模型，這個模型就是「三個燈泡」...
我想講個故事，好理解一點，能輕易百度到的我就不說了。
這裡只說 ...
----------------------------------------
例 5 (label=0):
[致敬劳动者]省五一劳动奖章获得者,山西农业大学生态环境产业技术研究院研究员徐明岗:以技术点土成金
2019年12月5日,联合国粮农组织在罗马总部召开一年一度的"世界土壤日"大会,将"格林卡世界土壤奖 ...
----------------------------------------


## tokenizer

In [8]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese")


In [9]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = dataset.to_pandas()
# 1. 讀入資料
df_train, df_val = train_test_split(df, test_size=0.2, stratify=df_total['label'], random_state=42)

# 2. 轉換為 HuggingFace DatasetDict
raw_datasets = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "validation": Dataset.from_pandas(df_val)
})

# 3. Tokenize（保留欄位）
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

encoded_dataset = raw_datasets.map(tokenize, batched=True)


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

## Finetune

In [10]:
labels = set(dataset["label"])
print("所有 label:", labels)
print("最大:", max(labels), "最小:", min(labels))

所有 label: {0, 1, 2}
最大: 2 最小: 0


In [11]:
# 若使用 pandas DataFrame
labels_set = set(encoded_dataset["train"]["label"])
print("所有 label:", labels_set)
assert labels_set.issubset({0, 1, 2}), "labels 有超出 0/1/2 的數值"

import numpy as np
print("有無 NaN:", np.isnan(encoded_dataset["train"]["label"]).any())
print("dtype:", type(encoded_dataset["train"]["label"][0]))
encoded_dataset = encoded_dataset.map(lambda x: {"label": int(x["label"])})

所有 label: {0, 1, 2}
有無 NaN: False
dtype: <class 'int'>


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [12]:
print("Train labels:", set(encoded_dataset["train"]["label"]))
print("Validation labels:", set(encoded_dataset["validation"]["label"]))
print("label dtype:", type(encoded_dataset["train"]["label"][0]))

Train labels: {0, 1, 2}
Validation labels: {0, 1, 2}
label dtype: <class 'int'>


In [13]:
print(any(pd.isnull(encoded_dataset["train"]["label"])))


False


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("ckiplab/bert-base-chinese", num_labels=3, hidden_dropout_prob=0.3)

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./model_ckpt",
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    learning_rate=3e-5,
    save_total_limit=1,
    weight_decay=0.1,
    warmup_ratio=0.1,
    fp16=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33medward7777777sas[0m ([33medward7777777sas-ntut-edu-tw[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.6793
1000,0.2871
1500,0.2862
2000,0.2849
2500,0.2677
3000,0.2595
3500,0.2596
4000,0.261
4500,0.242
5000,0.2656


In [9]:
model.save_pretrained("./bert-zh-tw-classifier")
tokenizer.save_pretrained("./bert-zh-tw-classifier")

('./bert-zh-tw-classifier/tokenizer_config.json',
 './bert-zh-tw-classifier/special_tokens_map.json',
 './bert-zh-tw-classifier/vocab.txt',
 './bert-zh-tw-classifier/added_tokens.json',
 './bert-zh-tw-classifier/tokenizer.json')

In [15]:
trainer.save_model("./model_ckpt")  # 這會同時儲存 model 和 config.json
tokenizer.save_pretrained("./model_ckpt")


('./model_ckpt/tokenizer_config.json',
 './model_ckpt/special_tokens_map.json',
 './model_ckpt/vocab.txt',
 './model_ckpt/added_tokens.json',
 './model_ckpt/tokenizer.json')

In [16]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

classifier = pipeline("text-classification", model="./model_ckpt", tokenizer="./model_ckpt")
classifier("這是一篇以繁體中文撰寫的內容")


Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9999579191207886}]

In [17]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.000665827770717442, 'eval_runtime': 21.4531, 'eval_samples_per_second': 466.134, 'eval_steps_per_second': 7.318, 'epoch': 5.0}


## PUSH

In [13]:
!pip install --upgrade huggingface_hub transformers
from huggingface_hub import notebook_login

notebook_login()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
from transformers import AutoTokenizer

# 儲存模型與 tokenizer
model.save_pretrained("./model_ckpt")
tokenizer.save_pretrained("./model_ckpt")

# 推送到 Hugging Face
from huggingface_hub import HfApi, create_repo, upload_folder

# repo_id 建議格式："username/model-name"
repo_id = "renhehuang/bert-base-chinese-traditional-classifier"
create_repo(repo_id, private=False)  # 若要私有改成 private=True

# 將整個資料夾上傳
upload_folder(
    folder_path="./model_ckpt",
    repo_id=repo_id,
    commit_message="Initial commit"
)


Upload 8 LFS files:   0%|          | 0/8 [00:00<?, ?it/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/818M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

events.out.tfevents.1747632980.251e4669932f.366.0:   0%|          | 0.00/6.17k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/renhehuang/bert-base-chinese-traditional-classifier/commit/ce19bad8757255e8d99df2a7d581ea1e8c999929', commit_message='Initial commit', commit_description='', oid='ce19bad8757255e8d99df2a7d581ea1e8c999929', pr_url=None, repo_url=RepoUrl('https://huggingface.co/renhehuang/bert-base-chinese-traditional-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='renhehuang/bert-base-chinese-traditional-classifier'), pr_revision=None, pr_num=None)

## test

In [19]:
import random
import pandas as pd

trad_samples = [
    "這是一個繁體中文的測試句子。", "歡迎來到台灣！", "請問您需要什麼協助？",
    "我們今天學習了很多知識。", "天氣很好，適合出門散步。", "你喜歡吃什麼台灣小吃？",
    "這本書很有趣，推薦你看看。", "捷運系統非常方便。", "週末一起去看電影好嗎？", "我在圖書館讀書。"
]
simp_samples = [
    "这是一个简体中文的测试句子。", "欢迎来到中国！", "请问您需要什么帮助？",
    "我们今天学到了很多知识。", "天气很好，适合出去散步。", "你喜欢吃什么中国小吃？",
    "这本书很有趣，推荐你看看。", "地铁系统非常方便。", "周末一起去看电影好吗？", "我在图书馆学习。"
]

def random_samples(source, n):
    return [random.choice(source) for _ in range(n)]

def mixed_samples(trad_source, simp_source, n, trad_ratio):
    n_trad = int(n * trad_ratio)
    n_simp = n - n_trad
    return random_samples(trad_source, n_trad) + random_samples(simp_source, n_simp)

trad_data = random_samples(trad_samples, 50)
simp_data = random_samples(simp_samples, 50)
mix1_data = mixed_samples(trad_samples, simp_samples, 50, trad_ratio=0.7)
random.shuffle(mix1_data)
mix2_data = mixed_samples(trad_samples, simp_samples, 50, trad_ratio=0.3)
random.shuffle(mix2_data)

test_df = pd.DataFrame({
    "group": ["trad"]*50 + ["simp"]*50 + ["mix1"]*50 + ["mix2"]*50,
    "text": trad_data + simp_data + mix1_data + mix2_data
})


In [20]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./model_ckpt", tokenizer="./model_ckpt", device=0)  # CUDA
results = [classifier(t)[0] for t in test_df["text"]]
test_df["pred_label"] = [r["label"] for r in results]
test_df["score"] = [r["score"] for r in results]


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [21]:
summary = test_df.groupby("group")["pred_label"].value_counts().unstack(fill_value=0)
print("分類結果分布：")
print(summary)


分類結果分布：
pred_label  LABEL_0  LABEL_1  LABEL_2
group                                
mix1             11       38        1
mix2             21       23        6
simp             40        6        4
trad              0       50        0


In [19]:
score_stats = test_df.groupby("group")["score"].agg(["mean", "std", "min", "max"])
print("\n信心分數統計：")
print(score_stats)



信心分數統計：
           mean       std       min       max
group                                        
mix1   0.998112  0.006658  0.967402  0.999987
mix2   0.998768  0.002777  0.989678  0.999984
simp   0.994952  0.009817  0.967402  0.999984
trad   0.999973  0.000008  0.999954  0.999987
