In [7]:
import requests
from bs4 import BeautifulSoup
import re
import os
import time
import logging
from urllib.parse import urlencode

# Configure logging: INFO-and-above records go to a log file in the
# current working directory, each stamped with time and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='mops_scraper.log',
)
# Module-wide logger; getLogger() with no name returns the root logger.
logger = logging.getLogger()

def setup_session():
    """Create a ``requests.Session`` preloaded with browser-like headers.

    Returns:
        requests.Session: A session whose default headers carry a desktop
        Chrome ``User-Agent`` and a ``Referer`` pointing at the MOPS
        ``t51sb10`` page.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
    )
    session = requests.Session()
    session.headers.update({
        'User-Agent': user_agent,
        'Referer': 'https://mops.twse.com.tw/mops/web/t51sb10',
    })
    return session

def build_initial_url(co_id, search_keyword, year, month, begin_day, end_day):
    """Build the URL of the initial MOPS keyword query for one company.

    Args:
        co_id: Company code placed in the ``co_id`` query field.
        search_keyword: Keyword placed in the ``keyWord`` query field.
            ``None`` falls back to ``'自結'``, the value this function
            previously hard-coded regardless of the argument.
        year: Value for the ``year`` field (the caller in this file passes
            '113' -- presumably an ROC calendar year; confirm).
        month: Value for the ``month1`` field.
        begin_day: Value for the ``begin_day`` field.
        end_day: Value for the ``end_day`` field.

    Returns:
        str: The ``ajax_t51sb10`` endpoint followed by the URL-encoded
        query string.
    """
    base_url = "https://mops.twse.com.tw/mops/web/ajax_t51sb10"
    # Honour the caller's keyword; the parameter used to be silently ignored.
    keyword = '自結' if search_keyword is None else search_keyword
    params = {
        'encodeURIComponent': '1',
        'step': '1',
        'firstin': 'true',
        'Stp': '4',
        'go': 'false',
        'r1': '2',
        'co_id': co_id,
        'KIND': 'L',
        'keyWord': keyword,
        'Condition2': '1',
        'year': year,
        'month1': month,
        'begin_day': begin_day,
        'end_day': end_day,
    }
    return f"{base_url}?{urlencode(params)}"

def create_download_dir(directory_name):
    """Ensure the download directory exists and return its path.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Args:
        directory_name: Path of the directory to create.

    Returns:
        The same ``directory_name`` that was passed in.

    Raises:
        FileExistsError: If the path exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first, and still fails loudly when the path is
    # occupied by a regular file.
    os.makedirs(directory_name, exist_ok=True)
    return directory_name

def extract_post_parameters(soup):
    """Collect the form-field assignments attached to each result button.

    Every ``<input type="button">`` with an ``onclick`` attribute is
    scanned for ``document.fm.<name>.value="<value>"`` assignments. A
    button contributes one dict (field name -> value) only when all of
    ``seq_no``, ``spoke_time``, ``spoke_date``, ``i``, ``co_id`` and
    ``TYPEK`` are present.

    Args:
        soup: Parsed HTML of the initial query page; only ``find_all`` is
            used.

    Returns:
        list[dict]: One dict per qualifying button, in document order.
    """
    assignment_re = re.compile(r'document\.fm\.(\w+)\.value="([^"]+)"')
    mandatory = ('seq_no', 'spoke_time', 'spoke_date', 'i', 'co_id', 'TYPEK')
    extracted_params = []

    for button in soup.find_all('input', {'type': 'button', 'onclick': True}):
        # findall yields (name, value) pairs; later duplicates win.
        fields = dict(assignment_re.findall(button.get('onclick', '')))
        if fields and all(name in fields for name in mandatory):
            extracted_params.append(fields)

    logger.info(f"成功提取到 {len(extracted_params)} 筆POST參數")
    return extracted_params

def build_detail_url(params_dict):
    """Build the URL of an announcement detail page.

    Args:
        params_dict: Extracted form fields to merge into the query. On a
            key clash these values replace the fixed defaults below.

    Returns:
        str: The ``ajax_t05st01`` endpoint followed by the URL-encoded
        query string (fixed fields first, then the extracted ones).
    """
    endpoint = "https://mops.twse.com.tw/mops/web/ajax_t05st01"
    # Fixed fields sent with every detail request.
    query = dict(step='2', colorchg='1', off='1', firstin='1')
    query.update(params_dict)
    return endpoint + "?" + urlencode(query)

def save_html(content, directory, stock_code, announce_date):
    """Write an HTML page to ``<stock_code>_detail_<announce_date>.html``.

    The file is written as UTF-8 inside ``directory``; an existing file
    with the same name is overwritten. The saved path is logged at INFO.

    Args:
        content: HTML text to write.
        directory: Existing directory that receives the file.
        stock_code: Company code used as the filename prefix.
        announce_date: Announcement date used as the filename suffix.
    """
    target = os.path.join(directory, f"{stock_code}_detail_{announce_date}.html")
    with open(target, mode='w', encoding='utf-8') as out:
        out.write(content)
    logger.info(f"成功儲存HTML內容到 {target}")

def fetch_detail_page(session, detail_url, timeout=30):
    """POST to a detail URL and return the response body as text.

    The query parameters travel in ``detail_url`` itself; the POST is sent
    without a request body.

    Args:
        session: Session object used to send the request.
        detail_url: Fully built detail-page URL.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the scraper
            indefinitely.

    Returns:
        str | None: The response text, or ``None`` when the request fails
        (connection error, timeout, or non-2xx status). Failures are
        logged at ERROR level rather than raised.
    """
    try:
        response = session.post(detail_url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"發送POST請求失敗: {e}")
        return None

def main():
    """Download the matching announcement detail pages for every company.

    For each company code the initial keyword query is fetched, the
    per-announcement form fields are extracted from it, and each detail
    page is downloaded and saved into a single download directory.
    A company whose initial page fails to download, or yields no usable
    fields, is logged and skipped.
    """
    # Company codes to query.
    co_ids = ['2880', '2881', '2882', '2883', '2884', '2885',
              '2886', '2887', '2888', '2889', '2890', '2891',
              '2892', '5880']

    # Material-information query settings.
    search_key_word = "自結"
    search_year = '113'
    search_month = '1'
    search_begin_day = '11'
    search_end_day = '12'

    # Upper bound (seconds) for the initial GET so a stalled server cannot
    # hang the whole run.
    request_timeout = 30

    # All pages go into one directory.
    # NOTE(review): '202409' does not obviously match the query period
    # above (year '113', month '1') -- confirm the intended folder name.
    download_dir = create_download_dir('202409')

    # The context manager closes the session's pooled connections on exit,
    # including when an unexpected exception escapes the loop.
    with setup_session() as session:
        for co_id in co_ids:
            logger.info(f"開始處理公司代碼: {co_id}")
            print(f"目前處理公司代碼: {co_id}")

            initial_url = build_initial_url(
                co_id=co_id,
                search_keyword=search_key_word,
                year=search_year,
                month=search_month,
                begin_day=search_begin_day,
                end_day=search_end_day
            )

            try:
                response_initial = session.get(initial_url, timeout=request_timeout)
                response_initial.raise_for_status()
            except requests.RequestException as e:
                logger.error(f"下載初始頁面失敗（{co_id}）: {e}")
                continue
            logger.info(f"成功下載 {co_id} 的初始頁面")

            soup_initial = BeautifulSoup(response_initial.text, 'html.parser')
            post_params_list = extract_post_parameters(soup_initial)

            if not post_params_list:
                logger.error(f"{co_id} 未提取到任何有效的POST參數，跳過此公司")
                continue

            total = len(post_params_list)
            for idx, params_dict in enumerate(post_params_list, start=1):
                logger.info(f"處理第 {idx} 筆資料: {params_dict['co_id']}")
                print(f"目前進度: {idx} / {total}")

                detail_url = build_detail_url(params_dict)
                html_detail = fetch_detail_page(session, detail_url)

                if html_detail:
                    save_html(
                        html_detail,
                        download_dir,
                        params_dict['co_id'],
                        params_dict['spoke_date'],
                    )
                else:
                    logger.warning(f"無法下載詳細頁面，跳過: {params_dict['co_id']}")

                # Throttle after every attempt -- failed ones included -- so
                # a run of errors does not turn into rapid-fire requests.
                time.sleep(1)

    logger.info("所有公司資料下載完成")

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

目前處理公司代碼: 2880
目前進度: 1 / 1
目前處理公司代碼: 2881
目前進度: 1 / 1
目前處理公司代碼: 2882
目前進度: 1 / 1
目前處理公司代碼: 2883
目前進度: 1 / 1
目前處理公司代碼: 2884
目前進度: 1 / 1
目前處理公司代碼: 2885
目前進度: 1 / 1
目前處理公司代碼: 2886
目前進度: 1 / 1
目前處理公司代碼: 2887
目前進度: 1 / 1
目前處理公司代碼: 2888
目前進度: 1 / 1
目前處理公司代碼: 2889
目前進度: 1 / 1
目前處理公司代碼: 2890
目前進度: 1 / 1
目前處理公司代碼: 2891
目前進度: 1 / 1
目前處理公司代碼: 2892
目前進度: 1 / 1
目前處理公司代碼: 5880
目前進度: 1 / 1
