決算資料のpdfをダウンロード

In [None]:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import os
import re
import time
import sys


def create_filename(target_pdf_link):
    """Build the local file name used to save a linked PDF.

    The result has the form ``<stem>_<label>.pdf``:

    * ``<stem>`` is the base name of the link's ``href`` cut at its first
      ``'_'``, or with its extension removed when it contains no ``'_'``.
    * ``<label>`` is the anchor's own (non-nested) text. When the anchor has
      no text node of its own, or that text is blank, the full text of the
      element (including nested tags) is used instead.

    Whitespace runs are replaced by ``'_'`` and every remaining character
    that is not a word character is removed.

    Args:
        target_pdf_link: Anchor element exposing ``get``, ``find`` and
            ``get_text`` (presumably a BeautifulSoup ``Tag``).

    Returns:
        The sanitized file name, always ending in ``.pdf``.
    """
    original = os.path.basename(target_pdf_link.get('href', ''))
    if '_' in original:
        stem = original.split('_')[0]          # drop everything after the first '_'
    else:
        stem = os.path.splitext(original)[0]   # drop the '.pdf' extension

    # find() returns None when the anchor only contains nested tags, so the
    # result must be checked before calling .strip() on it.
    direct_text = target_pdf_link.find(string=True, recursive=False)
    label = direct_text.strip() if direct_text is not None else ''
    if not label:
        label = target_pdf_link.get_text().strip()

    filename = re.sub(r'\s+', '_', f'{stem}_{label}'.strip())
    filename = re.sub(r'[^\w\u3040-\u30ff\u4e00-\u9fff_]', '', filename)
    return filename + '.pdf'

def dl_pdf(base_url, pdf_links, save_dir, timeout=30):
    """Download every linked PDF into ``save_dir``.

    A failed download is reported on stdout and does not stop the remaining
    downloads.

    Args:
        base_url: URL against which each link's ``href`` is resolved.
        pdf_links: Iterable of anchor elements; each must expose
            ``get('href', default)`` and be accepted by ``create_filename``.
        save_dir: Directory that receives the files.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the crawl indefinitely.

    Returns:
        None.
    """
    for target_pdf_link in pdf_links:
        target_url = urljoin(base_url, target_pdf_link.get('href', ''))
        filename = create_filename(target_pdf_link)
        print(filename)
        save_path = os.path.join(save_dir, filename)
        try:
            res = requests.get(target_url, timeout=timeout)
            res.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(res.content)
        except (requests.RequestException, OSError) as e:
            # Network/HTTP errors and file-system errors are expected here;
            # report them and carry on with the next link.
            print(f"DL失敗 {filename}: {e}")
        else:
            print(f'DLファイル: {filename}')

def download_irdeta(page_path, DLfile_keyword):
    """Download the matching PDFs linked from a Nintendo IR page.

    The page is opened in headless Chrome. Every option of the page's first
    ``<select>`` element is selected in turn, except options with an empty
    value and the ``'latest'`` option (each value is printed as a fiscal
    year). After each selection the rendered HTML is parsed and every
    ``.pdf`` link whose text contains ``DLfile_keyword`` or ``'説明会資料'``
    is handed to ``dl_pdf``. Files are saved under ``./irdata`` (created if
    missing). Links to ``.html`` pages are not downloaded.

    Args:
        page_path: Path of the page to crawl, relative to
            ``https://www.nintendo.co.jp``.
        DLfile_keyword: Text that a link must contain to be downloaded.

    Returns:
        None. Returns early, after printing a message, when the page has no
        ``<select>`` element.

    Raises:
        SystemExit: With status 1 when the page cannot be loaded.
    """
    save_dir = os.path.join(os.getcwd(), 'irdata')
    os.makedirs(save_dir, exist_ok=True)
    base_url = 'https://www.nintendo.co.jp'
    full_url = urljoin(base_url, page_path)

    options = Options()
    options.add_argument('--headless')
    # -- for google colab --
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=options)
    # The finally clause guarantees the browser process is shut down on every
    # exit path, including exceptions raised while iterating the options.
    try:
        try:
            driver.get(full_url)
        except Exception as e:
            print(f"driver.getが失敗: {full_url}")
            print(e)
            sys.exit(1)  # non-zero status: the crawl did not succeed
        time.sleep(1)  # give the page time to render

        try:
            select = Select(driver.find_element(By.TAG_NAME, 'select'))
        except NoSuchElementException as e:
            print("selectタグが見つかりません")
            print(e)
            return

        # Read all option values before changing the selection, so the loop
        # does not depend on option elements that may be re-rendered.
        values = [op.get_attribute('Value') for op in select.options]
        keywords = (DLfile_keyword, '説明会資料')

        for val in values:
            if not val or val == 'latest':
                continue
            select.select_by_value(val)
            print(f'アクセス年度: {val}年度')
            time.sleep(1)  # wait for the list of documents to refresh
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            pdf_links = [
                link for link in soup.find_all('a')
                if link.get('href', '').endswith('.pdf')
                and any(keyword in link.get_text() for keyword in keywords)
            ]
            dl_pdf(base_url, pdf_links, save_dir)
    finally:
        driver.quit()
    

In [None]:
# Notebook entry point. Only the last call is active; the commented-out calls
# are alternative page/keyword combinations kept for reference. Uncomment
# them, together with the one-second pauses between calls, to crawl those
# pages as well.
# download_irdeta(page_path='/ir/events/index.html', DLfile_keyword='説明資料')
# time.sleep(1)
# download_irdeta(page_path='/ir/events/index.html', DLfile_keyword='説明会資料')
# time.sleep(1)
# download_irdeta(page_path='/ir/library/earnings/index.html', DLfile_keyword='説明会資料')
# time.sleep(1)
download_irdeta(page_path='/ir/library/earnings/index.html', DLfile_keyword='参考資料')
