# 抓三個種類新聞

In [51]:
import requests
import re
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import datetime, timedelta
from fake_useragent import UserAgent
from urllib.parse import urljoin


In [52]:
news_links =['index.php?k=1', 'index.php?k=5', 'index.php?k=13','index.php?k=11']
news_categories=['PC','動漫畫','電競','活動展覽']
base_url = 'https://gnn.gamer.com.tw/'


In [53]:
user_agent = UserAgent()
user_agent.random

'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.1781.1915 Mobile Safari/537.36'

# 存放資料之變數

In [54]:
links = []
titles = []
dates = []
contents = []
categories = []
item_id = []
photo_links = []

In [55]:

# 設定一個變數來存儲上一條新聞的日期（初始化為今天的日期）
last_valid_date = datetime.now().strftime("%Y-%m-%d")

# 爬取每個類別的新聞
for i, url_short_name in enumerate(news_links):
    category = news_categories[i]
    category_url = urljoin(base_url, url_short_name)
    print("Getting categorical news:", category)
    print(category_url)

    # 發送請求
    req = requests.get(category_url, headers={"User-Agent": user_agent.random}, timeout=5)
    page = BeautifulSoup(req.text, 'html.parser')

    # 找到新聞區塊
    news_items = page.find_all('div', class_='GN-lbox2B')

    serial_no = 1  # 用來計數新聞數量
    for item in news_items:
        # 抓取標題
        title_tag = item.find('h1', class_='GN-lbox2D')
        title = title_tag.text.strip() if title_tag else "無標題"

        # 抓取新聞連結
        link_tag = item.find('a', href=True)
        link = urljoin(base_url, link_tag['href']) if link_tag else "無連結"

        print(serial_no, '--', title)
        print(link)
        
        # 抓取圖片網址
        image_tag = item.find('img', src=True)
        image_url = image_tag['src'] if image_tag else "無圖片"
        print(image_url)
        

        # 加入資料
        categories.append(category)
        titles.append(title)
        links.append(link)
        photo_links.append(image_url)

        # 爬取內頁新聞
        req = requests.get(link, headers={"User-Agent": user_agent.random}, timeout=5)
        page = BeautifulSoup(req.text, 'html.parser')

        # 抓取新聞時間
        time_element = page.find('span', class_='GN-lbox3C')

        if time_element:
            text_content = time_element.text.strip()
            parts = text_content.split()
            try:
                news_time = datetime.strptime(parts[-2] + " " + parts[-1], "%Y-%m-%d %H:%M:%S")
                news_date = news_time.strftime("%Y-%m-%d")
                last_valid_date = news_date  # 更新最新的有效日期
            except Exception as e:
                print(f"日期格式錯誤: {e}，使用上一次的日期 {last_valid_date}")
                news_date = last_valid_date  # 使用上一條新聞的日期
        else:
            print(f"未找到日期，使用上一次的日期 {last_valid_date}")
            news_date = last_valid_date  # 使用上一條新聞的日期

        dates.append(news_date)

        # 產生唯一 ID
        item_id.append(url_short_name + "_" + news_date + "_" + str(serial_no))

        # 抓取內容
        filtered_news = [p.get_text(strip=True) for p in page.select(".GN-lbox3B") if p.get_text(strip=True)]
        contents.append("\n".join(filtered_news))  # 轉換成字串，每段用換行符號分隔


        # 限制每個類別最多 4 則新聞
        if serial_no >= 100:
            break

        serial_no += 1  # 遞增計數

Getting categorical news: PC
https://gnn.gamer.com.tw/index.php?k=1
1 -- 多平台
《餓狼傳說 City of the Wolves》足球界傳奇巨星「克里斯蒂亞諾‧羅納度」確定參戰！
 36 人推！
https://gnn.gamer.com.tw/detail.php?sn=282906
https://p2.bahamut.com.tw/B/2KU/58/2fc0d3915aa52d8a260a2df8b41tlru5.WEBP
2 -- 多平台
NEXON 全新 MMORPG《瑪奇 Mobile》於韓國正式上線 以邂逅與冒險為核心的全新體驗
 34 人推！
https://gnn.gamer.com.tw/detail.php?sn=282892
https://p2.bahamut.com.tw/B/2KU/50/416dbc27ab17668a4e1f738b231tlm25.JPG
3 -- 多平台
《維納斯璀璨假期》試玩報導 以最新引擎描繪出美麗女神 感受系列全新可能性
 36 人推！
https://gnn.gamer.com.tw/detail.php?sn=282937
https://p2.bahamut.com.tw/B/2KU/65/2be079bf02216d72f7e9b23f7d1tlxl5.JPG
日期格式錯誤: time data '16:38:13 原文出處' does not match format '%Y-%m-%d %H:%M:%S'，使用上一次的日期 2025-03-27
4 -- 多平台
《塊魂》創作者高橋慶太新作《to a T》試玩體驗與專訪 描寫難過卻又帶點溫馨的少年生活
 0 人推！
https://gnn.gamer.com.tw/detail.php?sn=282958
https://p2.bahamut.com.tw/B/2KU/06/dedabe5829e50445a1824f76b01tm4a5.JPG
日期格式錯誤: time data '18:23:36 原文出處' does not match format '%Y-%m-%d %H:%M:%S'，使用上一次的日期 2025-03-27
5 -- 多平台
《少女魔役》下載版今

# Save data

In [56]:
data = zip(item_id, dates, categories, titles, contents, links, photo_links)
df = pd.DataFrame(list(data), columns=['item_id','date','category','title','content','link','photo_link'])
df.head(2)
df.shape
df.content[0]
df.to_csv("cna_category_news.csv", sep="|", index=False)

# tokenize news and save

In [61]:
import pandas as pd
import numpy as np
from collections import Counter
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

# 讀取 CSV 檔案
df = pd.read_csv('cna_category_news.csv', sep='|')

# 檢查並處理 NaN 值
df['content'] = df['content'].fillna('')  # 將 NaN 填充為空字串

# 初始化 Ckip 斷詞工具（使用較小的 albert-tiny 模型提升速度）
ws = CkipWordSegmenter(model="albert-tiny")
pos = CkipPosTagger(model="albert-tiny")
ner = CkipNerChunker(model="albert-tiny")

# 進行斷詞、詞性標註與命名實體辨識
tokens = ws(df['content'].tolist())
tokens_pos = pos(tokens)
entity_list = ner(df['content'].tolist())

# 建立詞性標註配對
token_pos_pairs = [list(zip(w, p)) for w, p in zip(tokens, tokens_pos)]

# 過濾條件（特定詞性）
allowPOS = ['Na', 'Nb', 'Nc', 'VC']

tokens_v2 = []
for wp in token_pos_pairs:
    tokens_v2.append([w for w, p in wp if (len(w) >= 2) and p in allowPOS])

# 新增斷詞結果到 DataFrame
df['tokens'] = tokens
df['tokens_v2'] = tokens_v2
df['entities'] = entity_list
df['token_pos'] = token_pos_pairs

# 計算詞頻函式
def word_frequency(wp_pair):
    filtered_words = [word for word, pos in wp_pair if (pos in allowPOS) and (len(word) >= 2)]
    return Counter(filtered_words).most_common(200)

# 計算每篇文章的關鍵詞頻率
keyfreqs = [word_frequency(wp) for wp in token_pos_pairs]
df['top_key_freq'] = keyfreqs

# 預設摘要與情緒分析欄位
df['summary'] = "暫無"
df['sentiment'] = "暫無"

# 調整欄位順序
df = df[['item_id', 'date', 'category', 'title', 'content', 'sentiment', 'summary',
         'top_key_freq', 'tokens', 'tokens_v2', 'entities', 'token_pos', 'link', 'photo_link']]

# 儲存預處理後的資料
df.to_csv('cna_news_preprocessed.csv', sep='|', index=False)

print("Tokenize OK!")



Tokenization:   0%|          | 0/391 [07:32<?, ?it/s]

[A
[A
Tokenization: 100%|██████████| 391/391 [00:00<00:00, 518.86it/s]
Inference: 100%|██████████| 4/4 [00:17<00:00,  4.28s/it]
Tokenization: 100%|██████████| 391/391 [00:00<00:00, 1426.70it/s]
Inference: 100%|██████████| 66/66 [05:27<00:00,  4.96s/it]
Tokenization: 100%|██████████| 391/391 [00:00<00:00, 620.33it/s] 
Inference: 100%|██████████| 4/4 [00:16<00:00,  4.09s/it]


Tokenize OK!


# Count top keywords

In [59]:
from collections import Counter
import pandas as pd

# 
df = pd.read_csv('cna_news_preprocessed.csv',sep='|')
news_categories=['PC','動漫畫','電競','活動展覽']
# Filter condition: two words and specified POS
# 過濾條件:兩個字以上 特定的詞性
allowedPOS=['Na','Nb','Nc']

# 
# get topk keyword function
def get_top_words():
    top_cate_words={} # final result
    counter_all = Counter() # counter for category '全部'
    for category in news_categories:

        df_group = df[df.category == category]

        # concatenate all filtered words in the same category
        words_group = []
        for row in df_group.token_pos:

            # filter words for each news
            filtered_words =[]
            for (word, pos) in eval(row):
                if (len(word) >= 2) & (pos in allowedPOS):
                    filtered_words.append(word)

            # concatenate filtered words  
            words_group += filtered_words

        # now we can count word frequency
        counter = Counter( words_group )

        # counter 
        counter_all += counter
        topwords = counter.most_common(100)

        # store topwords
        top_cate_words[category]= topwords

    # Process category '全部'
    top_cate_words['全部'] = counter_all.most_common(100)
    
    # To conveniently save data using pandas, we should convert dict to list.
    return list(top_cate_words.items())

# Save top 200 word frequency for each category
top_group_words = get_top_words()
df_top_group_words = pd.DataFrame(top_group_words, columns = ['category','top_keys'])
df_top_group_words.to_csv('cna_news_topkey_with_category_via_token_pos.csv', index=False)