In [1]:
import requests
from bs4 import BeautifulSoup
import re
import os
import time
import logging
from urllib.parse import urlencode

# 設置日誌
logging.basicConfig(
    filename='mops_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger()

def setup_session():
    """建立並返回一個設定好的Session物件。"""
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
        ),
        'Referer': 'https://mops.twse.com.tw/mops/web/t51sb10'
    }
    session = requests.Session()
    session.headers.update(headers)
    return session

def build_initial_url(co_id, search_keyword, year, month, begin_day, end_day):
    """根據搜尋參數建立初始查詢的URL。"""
    base_url = "https://mops.twse.com.tw/mops/web/ajax_t51sb10"
    params = {
        'encodeURIComponent': '1',
        'step': '1',
        'firstin': 'true',
        'Stp': '4',
        'go': 'false',
        'r1': '2',
        'co_id': co_id,
        'KIND': 'L',
        'keyWord': '自結',
        'Condition2': '1',
        'year': year,
        'month1': month,
        'begin_day': begin_day,
        'end_day': end_day
    }
    return f"{base_url}?{urlencode(params)}"

def extract_post_parameters(soup):
    """從初始HTML中提取詳細資料的POST參數。"""
    regex = re.compile(r'document\.fm\.(\w+)\.value="([^"]+)"')
    post_buttons = soup.find_all('input', {'type': 'button', 'onclick': True})
    extracted_params = []

    for button in post_buttons:
        matches = regex.findall(button.get('onclick', ''))
        if matches:
            params_dict = {param: value for param, value in matches}
            required_keys = ['seq_no', 'spoke_time', 'spoke_date', 'i', 'co_id', 'TYPEK']
            if all(key in params_dict for key in required_keys):
                extracted_params.append(params_dict)

    logger.info(f"成功提取到 {len(extracted_params)} 筆POST參數")
    return extracted_params

def create_company_dir(base_directory='各金控重訊', co_id=''):
    """建立公司代碼的資料夾，若已存在則不執行任何操作。"""
    directory = os.path.join(base_directory, co_id)
    os.makedirs(directory, exist_ok=True)
    return directory

def build_detail_url(params_dict):
    """根據提取的參數建立詳細資料的URL。"""
    base_url = "https://mops.twse.com.tw/mops/web/ajax_t05st01"
    query_params = {
        'step': '2',
        'colorchg': '1',
        'off': '1',
        'firstin': '1'
    }
    query_params.update(params_dict)
    return f"{base_url}?{urlencode(query_params)}"

def save_html(content, directory, stock_code, announce_date):
    """將HTML內容保存為公司資料夾中的文件。"""
    filename = f"{stock_code}_detail_{announce_date}.html"
    file_path = os.path.join(directory, filename)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)
    logger.info(f"成功儲存HTML內容到 {file_path}")

def fetch_detail_page(session, detail_url):
    """發送POST請求並返回詳細頁面的HTML內容。"""
    try:
        response = session.post(detail_url)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"發送POST請求失敗: {e}")
        return None

def main():
    # 要查詢的公司代碼列表'2880', '2881', '2882', '2883', '2884', '2885',  '2886', '2887', '2888', '2889', '2890', '2891', '2892', '5880'
    
    co_ids = ['2880']

    # 重大資訊查詢設定
    search_keyword = "自結"  
    search_year = '112'
    search_month = '12'
    search_begin_day = '05'
    search_end_day = '12'

    session = setup_session()

    # 遍歷所有公司代碼
    for co_id in co_ids:
        logger.info(f"開始處理公司代碼: {co_id}")
        print(f"目前處理公司代碼: {co_id}")

        # 建立該公司的資料夾
        company_dir = create_company_dir(co_id=co_id)

        # 建立初始查詢的URL
        initial_url = build_initial_url(
            co_id, search_keyword, search_year, search_month, 
            search_begin_day, search_end_day
        )

        try:
            response_initial = session.get(initial_url)
            response_initial.raise_for_status()
            logger.info(f"成功下載 {co_id} 的初始頁面")
        except requests.RequestException as e:
            logger.error(f"下載初始頁面失敗（{co_id}）: {e}")
            continue

        soup_initial = BeautifulSoup(response_initial.text, 'html.parser')
        post_params_list = extract_post_parameters(soup_initial)

        if not post_params_list:
            logger.error(f"{co_id} 未提取到任何有效的POST參數，跳過此公司")
            continue

        # 遍歷該公司代碼的所有POST參數
        for idx, params_dict in enumerate(post_params_list, start=1):
            logger.info(f"處理第 {idx} 筆資料: {params_dict['co_id']}")
            print(f"目前進度: {idx} / {len(post_params_list)}")

            detail_url = build_detail_url(params_dict)
            html_detail = fetch_detail_page(session, detail_url)

            if not html_detail:
                logger.warning(f"無法下載詳細頁面，跳過: {params_dict['co_id']}")
                continue

            announce_date = params_dict['spoke_date']
            stock_code = params_dict['co_id']
            save_html(html_detail, company_dir, stock_code, announce_date)

            time.sleep(5)  # 避免伺服器封鎖

    logger.info("所有資料下載完成")
    print("所有資料下載完成")

if __name__ == "__main__":
    main()

目前處理公司代碼: 2880
目前進度: 1 / 1
所有資料下載完成


In [17]:
import requests
from bs4 import BeautifulSoup
import re
import os
import time
from urllib.parse import urlencode

def setup_session():
    """建立並返回一個設定好的Session物件。"""
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
        ),
        'Referer': 'https://mops.twse.com.tw/mops/web/t51sb10'
    }
    session = requests.Session()
    session.headers.update(headers)
    return session

def build_initial_url(co_id, search_keyword, year, month, begin_day, end_day):
    """根據搜尋參數建立初始查詢的URL。"""
    base_url = "https://mops.twse.com.tw/mops/web/ajax_t51sb10"
    params = {
        'encodeURIComponent': '1',
        'step': '1',
        'firstin': 'true',
        'Stp': '4',
        'go': 'false',
        'r1': '2',
        'co_id': co_id,
        'KIND': 'L',
        'keyWord': '自結',
        'Condition2': '1',
        'year': year,
        'month1': month,
        'begin_day': begin_day,
        'end_day': end_day
    }
    return f"{base_url}?{urlencode(params)}"

def extract_post_parameters(soup):
    """從初始HTML中提取詳細資料的POST參數。"""
    regex = re.compile(r'document\.fm\.(\w+)\.value="([^"]+)"')
    post_buttons = soup.find_all('input', {'type': 'button', 'onclick': True})
    extracted_params = []

    for button in post_buttons:
        matches = regex.findall(button.get('onclick', ''))
        if matches:
            params_dict = {param: value for param, value in matches}
            required_keys = ['seq_no', 'spoke_time', 'spoke_date', 'i', 'co_id', 'TYPEK']
            if all(key in params_dict for key in required_keys):
                extracted_params.append(params_dict)

    return extracted_params

def create_company_dir(base_directory='各金控重訊', co_id=''):
    """建立公司代碼的資料夾，若已存在則不執行任何操作。"""
    directory = os.path.join(base_directory, co_id)
    os.makedirs(directory, exist_ok=True)
    return directory

def build_detail_url(params_dict):
    """根據提取的參數建立詳細資料的URL。"""
    base_url = "https://mops.twse.com.tw/mops/web/ajax_t05st01"
    query_params = {
        'step': '2',
        'colorchg': '1',
        'off': '1',
        'firstin': '1'
    }
    query_params.update(params_dict)
    return f"{base_url}?{urlencode(query_params)}"

def fetch_detail_page(session, detail_url):
    """發送POST請求並返回詳細頁面的HTML內容。"""
    try:
        response = session.post(detail_url)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        return None

def main():
    co_ids = ['2880']

    # 重大資訊查詢設定
    search_keyword = "自結"  
    search_year = '112'
    search_month = '11'
    search_begin_day = '05'
    search_end_day = '12'

    session = setup_session()

    # 遍歷所有公司代碼
    for co_id in co_ids:
        print(f"目前處理公司代碼: {co_id}")

        # 建立該公司的資料夾
        company_dir = create_company_dir(co_id=co_id)

        # 建立初始查詢的URL
        initial_url = build_initial_url(
            co_id, search_keyword, search_year, search_month, 
            search_begin_day, search_end_day
        )

        try:
            response_initial = session.get(initial_url)
            response_initial.raise_for_status()
        except requests.RequestException as e:
            continue

        soup_initial = BeautifulSoup(response_initial.text, 'html.parser')
        post_params_list = extract_post_parameters(soup_initial)

        if not post_params_list:
            continue

        # 遍歷該公司代碼的所有POST參數
        for idx, params_dict in enumerate(post_params_list, start=1):
            print(f"目前進度: {idx} / {len(post_params_list)}")

            detail_url = build_detail_url(params_dict)
            html_detail = fetch_detail_page(session, detail_url)

            if not html_detail:
                continue
            
            time.sleep(5)  # 避免伺服器封鎖

    print("所有資料下載完成")

if __name__ == "__main__":
    main()

目前處理公司代碼: 2880
目前進度: 1 / 1
所有資料下載完成


In [16]:
from lxml import etree
import pandas as pd
import re

for page in html_detail:
    print(html = file.read(page))

htm = etree.HTML(html)
data = htm.xpath("//pre/text()")[0]

# 用正則表達式提取表格數據
pattern = re.compile(r'(\S+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')
matches = pattern.findall(data)

# 將匹配結果轉換為 DataFrame
columns = ['公司', '自結稅前淨利(億元)', '自結稅後淨利(億元)', '累計稅前淨利(億元)', '累計稅後淨利(億元)', '每股稅前盈餘(元)', '每股稅後盈餘(元)']
df = pd.DataFrame(matches, columns=columns)#取第一行，第三列.iloc[0, 3] 
df

NameError: name 'html_detail' is not defined

In [8]:
import yfinance as yf

# Downloading the ticker data
ticker = yf.Ticker("2880.tw")

# Getting the quarterly balance sheet, financials, and cashflow
qbs = ticker.financials
selected_items = ["Net Income", "Diluted EPS"]
df_income = qbs.loc[selected_items].T
df_income
#selected_items = ["Total Revenue", "Gross Profit", "Net Income", "Diluted EPS"]

Unnamed: 0,Net Income,Diluted EPS
2023-12-31,21618294000.0,1.564356
2022-12-31,17308343000.0,1.257426
2021-12-31,17206199000.0,1.247525
2020-12-31,8653353000.0,0.628002
