<a href="https://colab.research.google.com/github/Fis428/PC03/blob/main/EX04_06_ET_Today_%E6%96%B0%E8%81%9E%E6%A8%99%E9%A1%8C%E5%88%86%E9%A1%9E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Load packages & read the data
import pandas as pd

# CSV of ETtoday news hosted on GitHub; the '類別' and '標題' columns are used below.
url = 'https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-ettoday_news.csv'
df  = pd.read_csv(url)
# Preview the first rows, followed by an extra newline.
print(df.head(), '\n')

# 2. Download the dictionary and stop-word list (the `!` shell escape works in a notebook)
!wget -O dict.txt      https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-dict.txt
!wget -O stopwords.txt https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-stopwords.txt

# 3. Chinese word segmentation + stop-word removal
import jieba
# Point jieba at the dictionary file downloaded above.
jieba.set_dictionary('dict.txt')

def load_stopwords(filepath):
    """Read a stop-word list from a text file, one word per line.

    Args:
        filepath: Path to a UTF-8 encoded text file (with or without a BOM).

    Returns:
        A set of the non-blank lines with surrounding whitespace removed.
    """
    # 'utf-8-sig' decodes plain UTF-8 identically but also drops a leading
    # byte-order mark, which would otherwise stick to the first stop word
    # and prevent it from ever matching a token.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        return {word for word in map(str.strip, f) if word}

# Stop-word set shared by the tokenisation steps below.
stopwords = load_stopwords('stopwords.txt')

def jieba_cut_with_stopwords(text, stopwords):
    """Segment *text* with jieba and drop whitespace-only and stop-word tokens.

    Args:
        text: The string to segment.
        stopwords: Container of tokens to exclude (tested with ``in``).

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept = []
    for token in jieba.cut(text):
        # Skip tokens that are empty once whitespace is stripped.
        if not token.strip():
            continue
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Replace each headline in the '標題' column with its filtered tokens
# joined by single spaces.
def _segment_title(title):
    """Return *title* as a space-separated string of its kept tokens."""
    tokens = jieba_cut_with_stopwords(title, stopwords)
    return ' '.join(tokens)

df['標題'] = df['標題'].apply(_segment_title)
print(df['標題'].head(), '\n')

# 4. Split into training and test sets
from sklearn.model_selection import train_test_split

# Features are the '標題' column; targets are the '類別' column.
X, y = df['標題'], df['類別']
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# 5. Encode the category labels as integers
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the complete label column rather
# than from the training split only: the split is random, so a rare category
# could land solely in the test set, and transform() raises ValueError on
# labels it did not see during fit.  When every category occurs in the
# training split the resulting codes are the same as before.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_enc = label_encoder.transform(y_train)
y_test_enc  = label_encoder.transform(y_test)

# 6. TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# The headlines are already segmented into space-separated tokens.  The
# vectorizer's default token_pattern only keeps tokens of two or more word
# characters, which would silently drop single-character Chinese words
# (e.g. 曝, 嗆, 挺), so accept tokens of length one as well.
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
# Vocabulary and IDF weights are learned from the training split only.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)

# 7. Train the model & evaluate it
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics       import accuracy_score, classification_report

# Multinomial naive Bayes; alpha is the additive smoothing parameter.
mnb = MultinomialNB(alpha=1.0)
mnb.fit(X_train_tfidf, y_train_enc)

train_score = mnb.score(X_train_tfidf, y_train_enc)
print(f"Training accuracy: {train_score:.4f}")

y_pred = mnb.predict(X_test_tfidf)
print("Test  accuracy:", accuracy_score(y_test_enc, y_pred))
# Label the report rows with the original category names instead of the
# encoder's integer codes.  Passing `labels` explicitly keeps each name
# aligned with its code even if some category is missing from the test split.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]
print("Classification report:\n",
      classification_report(y_test_enc, y_pred,
                            labels=class_ids,
                            target_names=class_names))

# 8. Example inference
text = '桌球／踢館？嗆誰管選舉一面之詞翻盤滾球  選訝委員：勿用政治手段處理'
# Preprocess the headline the same way as the training data:
# segment, drop stop words, then join the tokens with spaces.
cut  = jieba_cut_with_stopwords(text, stopwords)
s    = vectorizer.transform([' '.join(cut)])
# predict() returns one label per input row; there is exactly one row here.
pred = mnb.predict(s)[0]
print("Predicted label index:", pred)
# Map the integer code back to its category name.
print("Predicted category:", label_encoder.inverse_transform([pred])[0])


   類別                                   標題
0  政治  青年座談不被AI取代的關鍵力　葉丙成：只有這兩種人才能真正把握時間紅利
1  政治               蕭美琴直播初體驗曝「咪琴嚴選」　邀網友挺花蓮
2  政治                 疑似國軍「在美受訓」照曝光！　陸軍不評論
3  政治      「蔡英文、柯文哲有談NCC」　府：沒談人選、政黨比例更沒私下喬
4  政治      總統府否認蔡英文向柯喬NCC委員　黃國昌嗆：醜聞被揭露還敢說謊 

--2025-06-16 02:17:54--  https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-dict.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4154480 (4.0M) [application/octet-stream]
Saving to: ‘dict.txt’


2025-06-16 02:17:55 (39.6 MB/s) - ‘dict.txt’ saved [4154480/4154480]

--2025-06-16 02:17:55--  https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.

Building prefix dict from /content/dict.txt ...
DEBUG:jieba:Building prefix dict from /content/dict.txt ...
Dumping model to file cache /tmp/jieba.ueb620ec8402181953a0299d7957c0d6e.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.ueb620ec8402181953a0299d7957c0d6e.cache
Loading model cost 2.945 seconds.
DEBUG:jieba:Loading model cost 2.945 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


0    青年 座談 AI 取代 關鍵 力 葉丙成 這兩種 人才 真正 把握 時間 紅利
1                蕭美琴 直播 體驗 曝 咪琴 嚴選 邀 網友 挺 花蓮
2                        疑似 國軍 美 受訓 曝光 陸軍 評論
3          蔡英文 柯文哲 有談 NCC 府 沒談 人選 政黨 比例 私下 喬
4     總統府 否認 蔡英文 柯喬 NCC 委員 黃國昌 嗆 醜聞 揭露 還敢 說謊
Name: 標題, dtype: object 

