In [1]:
# 載入模型
import os, json, torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from tqdm import tqdm
import numpy as np

# Category mapping: class index (as used by predict) -> category name.
# The position of each name in the list is its class index (0..6).
label_map = dict(enumerate([
    "賽事戰報",
    "球隊分析",
    "球員焦點",
    "交易與簽約",
    "教練與管理層",
    "選秀觀察",
    "歷史與專題",
]))

# 載入 tokenizer 與模型
!unzip -q /kaggle/input/sportsv-train-with-title/basketball-bert.zip
!ls ./basketball-bert
tokenizer = BertTokenizerFast.from_pretrained("./basketball-bert")
model = BertForSequenceClassification.from_pretrained("./basketball-bert")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Prediction function
def predict(text):
    """Classify one piece of text with the loaded BERT model.

    The text is tokenized (truncated to at most 512 tokens), moved to the
    module-level ``device`` and run through the module-level ``model`` with
    gradient tracking disabled.

    Args:
        text: The text to classify.

    Returns:
        A ``(predicted_tag, confidence)`` tuple: the ``label_map`` name of the
        class with the highest softmax probability, and that probability as a
        Python float.
    """
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
        # Only one text is passed in, so take the first (only) row of the batch.
        class_probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    best_idx = int(np.argmax(class_probs))
    return label_map[best_idx], float(class_probs[best_idx])

# Output folder for the annotated articles (created if it does not exist).
os.makedirs("./predicted_articles", exist_ok=True)

count = 0  # number of articles classified and written out
for fname in tqdm(os.listdir("/kaggle/input/unlabeled-articles/unlabeled_articles")):
    # Only .json files are treated as articles.
    if fname.endswith(".json"):
        try:
            with open(f"/kaggle/input/unlabeled-articles/unlabeled_articles/{fname}", "r", encoding="utf-8") as f:
                data = json.load(f)
            # "article-content" is joined with spaces into one string
            # (presumably a list of paragraph strings -- TODO confirm).
            content = " ".join(data.get("article-content", []))
            # Articles with no non-whitespace content are skipped and not counted.
            if content.strip():
                predicted_tag, confidence = predict(content)
                data.update(
                    predicted_category=predicted_tag,
                    confidence=round(confidence, 4),
                )
                with open(f"./predicted_articles/{fname}", "w", encoding="utf-8") as f:
                    json.dump(data, f, ensure_ascii=False, indent=2)
                count += 1
        except Exception as e:
            # Best effort: report the failing file and keep going with the rest.
            print(f"❌ {fname} 發生錯誤：{e}")
print(f"✅ 預測完成，共處理 {count} 篇")

# ✅ Zip the output folder so it can be downloaded
!zip -qr predicted_articles.zip predicted_articles

2025-06-03 14:00:35.268867: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748959235.466302      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748959235.523466      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


100%|██████████| 2481/2481 [01:10<00:00, 35.37it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ 預測完成，共處理 2481 篇
