In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# 確認是否有可用的GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')


Using device: cuda


In [2]:
# 讀取CSV資料，假設包含 "text" 與 "label" 欄位
df = pd.read_csv("referance_newCat.csv")

# 假設9個分類標籤是以下（請根據實際情況調整）
categories = [
    "人身攻擊與侮辱性言論",
    "質疑身份與背景",
    "存在感與關注度評論",
    "政治立場批評",
    "政治立場表態",
    "政治陰謀論與指控",
    "政績與能力質疑",
    "造謠與誠信質疑",
    "反指標與諷刺預測",
    "罷免相關評論",
    "違建農舍議題",
    "疫情與疫苗相關評論",
    "疑似機器人或複製貼上留言",
    "幽默與嘲諷",
    "簡短情緒表達"
]

# 將文字標籤轉成數字ID
label2id = {cat: idx for idx, cat in enumerate(categories)}
id2label = {idx: cat for cat, idx in label2id.items()}

df['label_id'] = df['label'].apply(lambda x: label2id[x])  # 將文字標籤轉數字


In [35]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label_id'], random_state=42)

print("訓練筆數：", len(train_df))
print("測試筆數：", len(test_df))


訓練筆數： 352
測試筆數： 88


In [36]:
model_name = "hfl/chinese-roberta-wwm-ext-large"  # 使用更大的預訓練模型
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 將文字資料轉成 BERT 相容的輸入格式
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="longest",     # 也可以改成 longest，若想動態padding可使用 batched 方式
        truncation=True,
        max_length=512
        )


In [37]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_df[['text', 'label_id']])
test_dataset  = Dataset.from_pandas(test_df[['text', 'label_id']])

# 進行 Tokenize
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# 移除多餘欄位，並指定特徵名稱
train_dataset = train_dataset.remove_columns(["text"])
train_dataset = train_dataset.rename_column("label_id", "labels")
train_dataset.set_format("torch")

test_dataset = test_dataset.remove_columns(["text"])
test_dataset = test_dataset.rename_column("label_id", "labels")
test_dataset.set_format("torch")

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})


Map:   0%|          | 0/352 [00:00<?, ? examples/s]

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

In [8]:
num_labels = 15  # 分類數量
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# 將模型移動到GPU
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [39]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",       # 每個 epoch 結束後做評估
    save_strategy="epoch",            # 每個 epoch 儲存一次模型 (可根據需求調整)
    num_train_epochs=5,               # 以資料量少為前提，先試試 5 epoch
    learning_rate=2e-5,
    per_device_train_batch_size=3,    # 視硬體調整
    per_device_eval_batch_size=3,
    logging_dir="./logs",             # 訓練過程紀錄
    logging_steps=10,
    load_best_model_at_end=True,       # 在最後載入最好的模型權重
    report_to="none"                  # 禁用報告到任何平台
)





In [40]:
import evaluate

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    return {"accuracy": acc["accuracy"]}


In [41]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [42]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

# 取得最佳模型存放的 checkpoint 路徑
best_checkpoint_dir = trainer.state.best_model_checkpoint
print("Best model checkpoint folder:", best_checkpoint_dir)

# 可以把最佳模型另存到某個資料夾，例如 "./best_model"
best_model_dir = "./best_model"
trainer.save_model(best_model_dir)
print(f"Best model is now saved to: {best_model_dir}")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6215,0.61752,0.795455
2,0.1093,0.305618,0.875
3,0.1027,0.200676,0.909091
4,0.5059,0.239079,0.897727


SafetensorError: Error while serializing: IoError(Os { code: 112, kind: StorageFull, message: "磁碟的空間不足。" })

In [3]:
best_model_dir = "./best_model"
model_name = "hfl/chinese-roberta-wwm-ext-large"
model = AutoModelForSequenceClassification.from_pretrained(best_model_dir)
tokenizer = AutoTokenizer.from_pretrained(best_model_dir)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [4]:
import pandas as pd

all_list = []

year = '2022'
#input_json_path = f"C://Users//andyw//Desktop//桑鑼的分類//fb_comments//comments//result_{i}.json"
input_csv_path = f"C://Users//andyw//Desktop//AIComments//AIClassify//comments_{year}.csv"

df = pd.read_csv(input_csv_path, header=None, names=["貼文時間","內容", "時間"])

#all_list

In [6]:

import pandas as pd
import torch
from tqdm import tqdm
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

all_list = []
years = ['2020', '2019']

for year in years:
    #input_json_path = f"C://Users//andyw//Desktop//桑鑼的分類//fb_comments//comments//result_{i}.json"
    input_csv_path = f"C://Users//andyw//Desktop//AIComments//AIClassify//comments_{year}.csv"

    df = pd.read_csv(input_csv_path, header=None, names=["貼文時間","內容", "時間"])

    #all_list

    

    # (假設你已經先行載入完模型和 tokenizer)
    # model_name = "你的模型路徑或名稱"
    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    # model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # 假設 test_texts 是一個包含「內容」欄位的 DataFrame
    # e.g. test_texts = pd.read_excel("some_excel_file.xlsx")
    test_texts = df
    test_texts['內容'] = test_texts['內容'].fillna("").astype(str)
    input_sentences = test_texts['內容'].tolist()


    # 設定批次大小
    batch_size = 16

    all_predictions = []

    # 分批進行預測，並在迴圈外包上 tqdm 以顯示進度
    for i in tqdm(range(0, len(input_sentences), batch_size), desc="Predicting"):
        batch_texts = input_sentences[i : i + batch_size]

        # Tokenizer
        encoding = tokenizer(
            batch_texts,
            padding="longest",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy().tolist()

        # 收集每一批的預測結果
        all_predictions.extend(batch_preds)

    # 將數字預測結果轉成文字標籤
    predicted_labels = [id2label[p] for p in all_predictions]

    # 將結果寫回原本 DataFrame
    test_texts['分類'] = predicted_labels

    # (可選) 檢查預測結果
    for text, label in zip(test_texts['內容'][:5], test_texts['分類'][:5]):  # 範例只印前 5 筆
        print(f"文本: {text}\n -> 預測分類: {label}\n")

    # 儲存結果到 Excel 檔
    output_path = f"C://Users//andyw//Desktop//AIComments//AIClassify//resultComment//result{year}.xlsx"
    test_texts.to_excel(output_path, index=False)
    print(f"已將結果儲存至：{output_path}")


Predicting: 100%|██████████| 4190/4190 [19:03<00:00,  3.66it/s]


文本: 中壢人素質要先栽培，不然罷免王後，再選出來的還是一樣
 -> 預測分類: 反指標與諷刺預測

文本: 先追究豪華違建失職單位，甭讓牠太好過！
 -> 預測分類: 罷免相關評論

文本: 我是桃園的，雖沒有投票權，但是精神上支持中壢人順利罷免掉浪費納稅人付薪水的議員
 -> 預測分類: 反指標與諷刺預測

文本: 0116中壢人站出來，請投下贊成票，給浩宇一個重生的機會，中壢人寫歷史，請各位幫忙浩宇拉票，非王不投!!!
 -> 預測分類: 疑似機器人或複製貼上留言

文本: 有議員身分在，桃園市府拆除大隊不敢拆啦！罷王救農地吧！
 -> 預測分類: 罷免相關評論

已將結果儲存至：C://Users//andyw//Desktop//AIComments//AIClassify//resultComment//result2020.xlsx


Predicting: 100%|██████████| 704/704 [03:12<00:00,  3.65it/s]


文本: 到底罷免連署開始了沒？這人越看越討厭⋯⋯
 -> 預測分類: 反指標與諷刺預測

文本: 大家好，我是綠黨秘書長張竹芩，浩宇已經退黨一陣子了，綠黨努力在環保和社會正義議題上耕耘，歡迎大家來關注！#綠黨秘書長路過
 -> 預測分類: 疑似機器人或複製貼上留言

文本: 就是阿！ 趴著皇太后的腿 不是太監哪是什麼？？自己的頭銜就已證明一切****桃園市民進黨競選總部副幹事*****
 -> 預測分類: 違建農舍議題

文本: 被說太監 感覺他好像引以為榮耶？？
 -> 預測分類: 質疑身份與背景

文本: 其實是奄人
 -> 預測分類: 人身攻擊與侮辱性言論

已將結果儲存至：C://Users//andyw//Desktop//AIComments//AIClassify//resultComment//result2019.xlsx


In [None]:
#testtest


# 將數字預測結果轉成文字標籤
predicted_labels = [id2label[p] for p in all_predictions]

# 將結果寫回原本 DataFrame
test_texts['分類'] = predicted_labels

# (可選) 檢查預測結果
for text, label in zip(test_texts['內容'][:5], test_texts['分類'][:5]):  # 範例只印前 5 筆
    print(f"文本: {text}\n -> 預測分類: {label}\n")

# 儲存結果到 Excel 檔
output_path = "C://Users//andyw//Desktop//AIComments//AIClassify//resultComment//result2023.xlsx"
test_texts.to_excel(output_path, index=False)
print(f"已將結果儲存至：{output_path}")

文本: 熟悉又刺耳的聲音
 -> 預測分類: 人身攻擊與侮辱性言論

文本: 壞人不會變好只會變醜
 -> 預測分類: 幽默與嘲諷

文本: 有句話說要知道狗是改不了吃某些食物的
 -> 預測分類: 質疑身份與背景

文本: 你要確定耶~XD
 -> 預測分類: 簡短情緒表達

文本: 冤家宜解不宜結
 -> 預測分類: 幽默與嘲諷

已將結果儲存至：C://Users//andyw//Desktop//AIComments//AIClassify//resultComment//result2023.xlsx
