# 抓取四個種類的新聞（PC、動漫畫、電競、活動展覽）

In [45]:
import requests
import re
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import datetime, timedelta
from fake_useragent import UserAgent
from urllib.parse import urljoin


In [46]:
# Site root; every listing path below is resolved against it.
base_url = 'https://gnn.gamer.com.tw/'

# Listing pages to crawl and the label stored for each one.
# The two lists are parallel: the same index refers to the same category.
news_links = [
    'index.php?k=1',
    'index.php?k=5',
    'index.php?k=13',
    'index.php?k=11',
]
news_categories = [
    'PC',
    '動漫畫',
    '電競',
    '活動展覽',
]


In [47]:
# One shared UserAgent object; the crawl cell reads `user_agent.random`
# to fill the User-Agent header of every request.
user_agent = UserAgent()
# Last expression of the cell: displays one sample User-Agent string.
# presumably a different string on each access -- confirm in fake_useragent docs
user_agent.random

'Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Mobile/15E148 Safari/604.1'

# 存放資料之變數

In [48]:
# Parallel accumulators filled by the crawl cell: position i in every list
# describes the same article.
item_id, dates, categories = [], [], []
titles, contents, links = [], [], []

# Declared for image URLs; nothing in the visible cells appends to it.
photo_links = []

In [49]:
# Crawl each category listing, follow up to MAX_NEWS_PER_CATEGORY article links,
# and append one entry per article to the parallel result lists
# (item_id, dates, categories, titles, contents, links).

MAX_NEWS_PER_CATEGORY = 4  # number of articles kept per category
REQUEST_TIMEOUT = 5        # seconds, applied to every HTTP request


def _fetch_page(url):
    """Download ``url`` and return it parsed with BeautifulSoup.

    Returns ``None`` (after printing the reason) when the request fails or the
    server answers with an HTTP error status, so one bad page cannot abort the
    whole crawl.
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": user_agent.random},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Request failed, skipping {url}: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def _extract_news_date(detail_page):
    """Return the article date as 'YYYY-MM-DD', or "未知" when it cannot be read.

    The date is taken from the last two whitespace-separated tokens of the
    ``span.GN-lbox3C`` element, which are expected to be
    ``YYYY-MM-DD HH:MM:SS``.
    """
    time_element = detail_page.find('span', class_='GN-lbox3C')
    if not time_element:
        return "未知"

    parts = time_element.text.strip().split()
    try:
        news_time = datetime.strptime(parts[-2] + " " + parts[-1], "%Y-%m-%d %H:%M:%S")
    except (IndexError, ValueError) as e:
        # IndexError: fewer than two tokens; ValueError: tokens are not a timestamp.
        print(f"日期格式錯誤: {e}")
        return "未知"
    return news_time.strftime("%Y-%m-%d")


for url_short_name, category in zip(news_links, news_categories):
    category_url = base_url + url_short_name
    print("Getting categorical news:", category)
    print(category_url)

    listing_page = _fetch_page(category_url)
    if listing_page is None:
        continue

    # Each teaser on the listing page is one of these blocks.
    news_items = listing_page.find_all('div', class_='GN-lbox2B')

    serial_no = 1  # 1-based position of the next article saved for this category
    for item in news_items:
        title_tag = item.find('h1', class_='GN-lbox2D')
        title = title_tag.text.strip() if title_tag else "無標題"

        # Without a link there is no detail page to fetch, so skip the teaser
        # instead of sending a request to a placeholder string.
        link_tag = item.find('a', href=True)
        if link_tag is None:
            print("No link found, skipping:", title)
            continue
        link = urljoin(base_url, link_tag['href'])

        print(serial_no, '--', title)
        print(link)

        # Fetch the article page under its own name so the listing page is not
        # overwritten while its teasers are still being iterated.
        detail_page = _fetch_page(link)
        if detail_page is None:
            continue

        news_date = _extract_news_date(detail_page)

        # Non-empty text of every content container on the article page.
        filtered_news = [
            text
            for text in (p.get_text(strip=True) for p in detail_page.select(".GN-lbox3B"))
            if text
        ]

        # Append to all lists together, only after everything succeeded, so the
        # parallel lists always have the same length.
        categories.append(category)
        titles.append(title)
        links.append(link)
        dates.append(news_date)
        item_id.append(url_short_name + "_" + news_date + "_" + str(serial_no))
        contents.append(filtered_news)

        if serial_no >= MAX_NEWS_PER_CATEGORY:
            break
        serial_no += 1

Getting categorical news: PC
https://gnn.gamer.com.tw/index.php?k=1
1 -- 多平台
網石公開全新開放世界收集型 RPG《七大罪：起源》官方預告網站
 9 人推！
https://gnn.gamer.com.tw/detail.php?sn=282748
2 -- 多平台
《新 VR 快打專案》製作人台灣獨家專訪 打造讓老粉絲與新玩家都覺得超厲害的全新作品
 40 人推！
https://gnn.gamer.com.tw/detail.php?sn=282736
3 -- 多平台
經典電影改編好評動作遊戲《印第安納瓊斯：古老之圈》PS5 版確定 4/17 推出
 4 人推！
https://gnn.gamer.com.tw/detail.php?sn=282750
4 -- PC
前 Blizzard 創辦人暨總裁 Mike Morhaime 的新公司 Dreamhaven 將於 26 日凌晨公開新作
 1 人推！
https://gnn.gamer.com.tw/detail.php?sn=282746
Getting categorical news: 動漫畫
https://gnn.gamer.com.tw/index.php?k=5
1 -- 多平台
網石公開全新開放世界收集型 RPG《七大罪：起源》官方預告網站
 9 人推！
https://gnn.gamer.com.tw/detail.php?sn=282748
2 -- 動漫
新作劇場版《魔法少女小圓 瓦爾普吉斯之迴天》釋出首波視覺圖！
 4 人推！
https://gnn.gamer.com.tw/detail.php?sn=282765
3 -- 手機
《【我推的孩子】》官方益智類手機遊戲公開前導網站 確定於全球推出
 4 人推！
https://gnn.gamer.com.tw/detail.php?sn=282739
4 -- 其他
「吉伊卡哇燒」常設店鋪 4/7 起橫濱開幕 推出可愛甜點與原創周邊商品
 2 人推！
https://gnn.gamer.com.tw/detail.php?sn=282738
Getting categorical news: 電競
https://gnn.gamer.com.tw/index.

# Save data

In [50]:
# Assemble the scraped lists into one DataFrame and write it to disk.
# zip() stops at the shortest list, so warn when the lists disagree in length
# instead of silently dropping (or mispairing) entries.
scraped_lengths = {
    'item_id': len(item_id), 'date': len(dates), 'category': len(categories),
    'title': len(titles), 'content': len(contents), 'link': len(links),
}
if len(set(scraped_lengths.values())) > 1:
    print("Warning: result lists differ in length:", scraped_lengths)

data = zip(item_id, dates, categories, titles, contents, links)
df = pd.DataFrame(list(data), columns=['item_id','date','category','title','content','link'])

# Only the last expression of a cell is displayed, so print the sanity checks
# explicitly; guard the content preview because the frame may be empty.
print(df.shape)
print(df.head(2))
if not df.empty:
    print(df.content.iloc[0])

# '|' is used as the separator; `content` holds a list per row and is written
# as that list's string representation.
df.to_csv("cna_category_news.csv", sep="|", index=False)

# Tokenize news and save

In [51]:
%%time
import pandas as pd
import numpy
from collections import Counter
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

import ast  # used only here, to undo the CSV round-trip of the `content` column

df = pd.read_csv('cna_category_news.csv', sep='|')


def _content_to_text(raw):
    """Turn one `content` cell back into plain article text.

    The scraping cell stores `content` as a list of strings, so after the CSV
    round-trip each cell is the *repr* of that list (brackets, quotes and
    escape sequences included). Parse it back and join the parts so the
    models see the article text itself.

    Returns "" for missing values and the raw string when it is not a list
    literal.
    """
    if not isinstance(raw, str):
        return ""
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return raw
    if isinstance(parsed, (list, tuple)):
        return "\n".join(str(part) for part in parsed)
    return raw


# One plain-text string per article, in the same row order as df.
content_texts = [_content_to_text(raw) for raw in df.content]

# CKIP (Academia Sinica) drivers. Downloading the models takes time on first use.
# Omitting `model` selects the default "bert-base"; "albert-tiny" is smaller and
# faster at the cost of some accuracy.
ws = CkipWordSegmenter(model="albert-tiny")
pos = CkipPosTagger(model="albert-tiny")
ner = CkipNerChunker(model="albert-tiny")

# Word segmentation
tokens = ws(content_texts)

# Part-of-speech tags, aligned with `tokens`
tokens_pos = pos(tokens)

# (word, POS) pairs per article
word_pos_pair = [list(zip(w, p)) for w, p in zip(tokens, tokens_pos)]

# Named-entity recognition
entity_list = ner(content_texts)

# Keyword filter: keep words of at least two characters whose POS tag is listed
# here. (No stop-word list is applied.)
allowPOS = ['Na', 'Nb', 'Nc', 'VC']

tokens_v2 = [
    [w for w, p in wp if len(w) >= 2 and p in allowPOS]
    for wp in word_pos_pair
]

# Add the tokenization results as new columns; `content` itself is left as read.
df['tokens'] = tokens
df['tokens_v2'] = tokens_v2
df['entities'] = entity_list
df['token_pos'] = word_pos_pair
# Calculate word count (frequency) 計算字頻(次數)


def word_frequency(wp_pair):
    """Count the keywords of one article.

    Args:
        wp_pair: iterable of (word, pos_tag) pairs.

    Returns:
        Up to 200 (word, count) tuples, most frequent first, counting only
        words of at least two characters whose tag is in the module-level
        `allowPOS` list.
    """
    kept = [word for word, tag in wp_pair if len(word) >= 2 and tag in allowPOS]
    return Counter(kept).most_common(200)


# Keyword frequencies per article, in the same row order as df.
keyfreqs = [word_frequency(wp) for wp in word_pos_pair]
df['top_key_freq'] = keyfreqs

# Summary and sentiment are not computed yet; every row gets a placeholder.
row_count = len(df.content)
summary = ["暫無"] * row_count
sentiment = ["暫無"] * row_count
df['summary'] = summary
df['sentiment'] = sentiment

# Reorder the columns for readability.
column_order = [
    'item_id', 'date', 'category', 'title', 'content',
    'sentiment', 'summary', 'top_key_freq',
    'tokens', 'tokens_v2', 'entities', 'token_pos',
    'link',
]
df = df[column_order]

# Write the preprocessed frame to disk.
df.to_csv('cna_news_preprocessed.csv', sep='|', index=False)

print("Tokenize OK!")

Tokenization: 100%|██████████| 16/16 [00:00<00:00, 530.58it/s]
Inference: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it]
Tokenization: 100%|██████████| 16/16 [00:00<00:00, 388.29it/s]
Inference: 100%|██████████| 4/4 [00:20<00:00,  5.13s/it]
Tokenization: 100%|██████████| 16/16 [00:00<00:00, 386.43it/s]
Inference: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]

Tokenize OK!
CPU times: total: 2min 48s
Wall time: 26.3 s





# Count top keywords

In [52]:
from collections import Counter
import pandas as pd

# 
# Reload the preprocessed news written by the tokenizing cell. After the CSV
# round-trip, list-valued columns such as `token_pos` are plain strings.
df = pd.read_csv('cna_news_preprocessed.csv',sep='|')
# Categories to aggregate; same values as the list defined in the crawl setup.
news_categories=['PC','動漫畫','電競','活動展覽']
# Filter condition: words of two or more characters with one of these POS tags.
# NOTE(review): unlike `allowPOS` in the tokenizing cell, 'VC' is not included here.
allowedPOS=['Na','Nb','Nc']

# 
# get topk keyword function
def get_top_words(top_n=100):
    """Collect the most frequent keywords per news category.

    Reads the module-level `df` (columns `category` and `token_pos`),
    `news_categories` and `allowedPOS`. A word is counted when it has at
    least two characters and its POS tag is in `allowedPOS`.

    Args:
        top_n: maximum number of (word, count) pairs kept per category.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        A list of (category, [(word, count), ...]) tuples: one per entry of
        `news_categories`, followed by '全部', which aggregates all of them.
        A list rather than a dict so it can be passed straight to
        `pd.DataFrame`.
    """
    import ast  # local import: parse stored literals without executing them

    top_cate_words = {}      # category -> top keywords
    counter_all = Counter()  # running total for the '全部' category

    for category in news_categories:
        df_group = df[df.category == category]

        # Gather the filtered words of every article in this category.
        words_group = []
        for row in df_group.token_pos:
            # `row` is the string form of a list of (word, pos) tuples read
            # from CSV. literal_eval replaces the former eval(): it accepts
            # the same literal but cannot run arbitrary code from the file.
            words_group.extend(
                word
                for word, pos in ast.literal_eval(row)
                if len(word) >= 2 and pos in allowedPOS
            )

        counter = Counter(words_group)
        counter_all += counter
        top_cate_words[category] = counter.most_common(top_n)

    # Aggregate over all listed categories.
    top_cate_words['全部'] = counter_all.most_common(top_n)

    return list(top_cate_words.items())

# Save the top keyword frequencies of each category (get_top_words keeps the
# 100 most common words per category, plus the aggregate '全部' row).
top_group_words = get_top_words()
df_top_group_words = pd.DataFrame(top_group_words, columns = ['category','top_keys'])
df_top_group_words.to_csv('cna_news_topkey_with_category_via_token_pos.csv', index=False)