In [1]:
import json
import requests
from bs4 import BeautifulSoup
import psycopg2
from datetime import datetime
import time

In [2]:
# ① DB接続情報をJSON形式のconfig.jsonから読み込む
with open("config.json", "r") as f:
    config = json.load(f)
DB_NAME = config["DB_NAME"]
DB_USER = config["DB_USER"]
DB_PASSWORD = config["DB_PASSWORD"]
DB_HOST = config["DB_HOST"]
DB_PORT = config.get("DB_PORT", "5432")

In [5]:
# ② 選手一覧のインデックスページ一覧（50音順：あ い う え お … わ）
letter_pages = [
    "index_a.html", "index_i.html", "index_u.html", "index_e.html", "index_o.html",
    "index_ka.html", "index_ki.html", "index_ku.html", "index_ke.html", "index_ko.html",
    "index_sa.html", "index_si.html", "index_su.html", "index_se.html", "index_so.html",
    "index_ta.html", "index_ti.html", "index_tu.html", "index_te.html", "index_to.html",
    "index_na.html", "index_ni.html", "index_nu.html", "index_ne.html", "index_no.html",
    "index_ha.html", "index_hi.html", "index_hu.html", "index_he.html", "index_ho.html",
    "index_ma.html", "index_mi.html", "index_mu.html", "index_me.html", "index_mo.html",
    "index_ya.html", "index_yu.html", "index_yo.html",
    "index_ra.html", "index_ri.html", "index_ru.html", "index_re.html", "index_ro.html",
    "index_wa.html"
]
base_index_url = "https://npb.jp/bis/players/all/"

# 全選手の個別ページURLを保持するリスト
all_player_urls = []

In [6]:
# ③ 各インデックスページを巡回して個別ページのURLを収集
for page in letter_pages:
    index_url = base_index_url + page
    try:
        response = requests.get(index_url)
        response.raise_for_status()
    except Exception as e:
        print(f"Indexページ取得エラー: {index_url} - {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    # aタグのhrefが'/bis/players/'で始まるものを抽出
    links = soup.find_all('a', href=True)
    page_player_urls = ["https://npb.jp" + link['href'] for link in links if link['href'].startswith('/bis/players/')]
    for player_url in page_player_urls:
        if player_url not in all_player_urls:
            all_player_urls.append(player_url)
    # インデックスページ間は1秒待機
    time.sleep(1)

print(f"全{len(all_player_urls)}件の選手個別ページURLを収集しました。")

全7776件の選手個別ページURLを収集しました。


In [11]:
import csv

# CSV ファイルに出力
csv_filename = "all_player_urls.csv"
with open(csv_filename, mode="w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    # ヘッダー行を書き込む
    writer.writerow(["url"])
    # 各URLを書き込む
    for url in all_player_urls:
        writer.writerow([url])
        
print(f"{csv_filename} に全選手URLを出力しました。")

# CSV ファイルから読み込み
urls_from_csv = []
with open(csv_filename, mode="r", newline="", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        urls_from_csv.append(row["url"])

print("CSVから読み込んだURLリスト:", urls_from_csv)


all_player_urls.csv に全選手URLを出力しました。
CSVから読み込んだURLリスト: ['https://npb.jp/bis/players/', 'https://npb.jp/bis/players/all/index.html', 'https://npb.jp/bis/players/31133846.html', 'https://npb.jp/bis/players/93595153.html', 'https://npb.jp/bis/players/17313868.html', 'https://npb.jp/bis/players/31133867.html', 'https://npb.jp/bis/players/51653882.html', 'https://npb.jp/bis/players/43545157.html', 'https://npb.jp/bis/players/91895118.html', 'https://npb.jp/bis/players/53555159.html', 'https://npb.jp/bis/players/31133825.html', 'https://npb.jp/bis/players/71675137.html', 'https://npb.jp/bis/players/63565157.html', 'https://npb.jp/bis/players/31133829.html', 'https://npb.jp/bis/players/23925150.html', 'https://npb.jp/bis/players/43345114.html', 'https://npb.jp/bis/players/83985132.html', 'https://npb.jp/bis/players/81685116.html', 'https://npb.jp/bis/players/63763868.html', 'https://npb.jp/bis/players/31133864.html', 'https://npb.jp/bis/players/31135134.html', 'https://npb.jp/bis/players/71173

In [8]:
# ④ DB接続
conn = psycopg2.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT
)
cursor = conn.cursor()

# ⑤ 各選手個別ページにアクセスしてDBへ登録
for url in all_player_urls:
    try:
        response = requests.get(url)
        response.raise_for_status()
    except Exception as e:
        print(f"URL取得エラー: {url} - {e}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    # URL末尾の数字を選手IDとして抽出
    player_id = url.rstrip(".html").split("/")[-1]

    try:
        # ■ 選手基本情報（pc_vitalsセクション）を抽出
        vitals = soup.find("section", id="pc_vitals")
        if vitals:
            jersey_number = vitals.find("li", id="pc_v_no").text.strip() if vitals.find("li", id="pc_v_no") else ""
            player_name = vitals.find("li", id="pc_v_name").text.strip() if vitals.find("li", id="pc_v_name") else ""
            player_name_kana = vitals.find("li", id="pc_v_kana").text.strip() if vitals.find("li", id="pc_v_kana") else ""
        else:
            jersey_number = player_name = player_name_kana = ""
        
        # ■ 個人情報（pc_bioセクション内のテーブル）を抽出
        bio_section = soup.find("section", id="pc_bio")
        info = {}
        if bio_section:
            rows = bio_section.find("table").find_all("tr")
            for row in rows:
                th = row.find("th")
                td = row.find("td")
                if th and td:
                    label = th.text.strip()
                    value = " ".join(td.get_text(strip=True).split())
                    info[label] = value

        # 各項目が存在しない場合は空文字にする
        position = info.get("ポジション", "")
        throwing_batting = info.get("投打", "")
        if len(throwing_batting) >= 4:
            throwing_hand = throwing_batting[:2]
            batting_hand = throwing_batting[2:]
        else:
            throwing_hand = throwing_batting
            batting_hand = ""
        physical_info = info.get("身長／体重", "")
        height_cm, weight_kg = None, None
        if "／" in physical_info:
            height_part, weight_part = [s.strip() for s in physical_info.split("／")]
            try:
                height_cm = int(height_part.replace("cm", ""))
            except:
                height_cm = None
            try:
                weight_kg = int(weight_part.replace("kg", ""))
            except:
                weight_kg = None
        birthdate_str = info.get("生年月日", "")
        try:
            birthdate = datetime.strptime(birthdate_str, "%Y年%m月%d日").date() if birthdate_str else None
        except Exception as e:
            birthdate = None
        career = info.get("経歴", "")
        draft = info.get("ドラフト", "")
    except Exception as e:
        print(f"情報抽出エラー: {url} - {e}")
        continue

    # 既に登録済みの場合は削除（再実行を考慮）
    try:
        cursor.execute("DELETE FROM player_personal_info WHERE player_id = %s", (player_id,))
        conn.commit()
    except Exception as e:
        conn.rollback()
        print(f"レコード削除エラー: {player_id} - {e}")
        continue

    # DBへの登録
    sql = """
    INSERT INTO player_personal_info 
      (player_id, player_name, player_name_kana, jersey_number, position, batting_hand, throwing_hand, height_cm, weight_kg, birthdate, career, draft)
    VALUES 
      (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    try:
        cursor.execute(sql, (
            player_id, player_name, player_name_kana, jersey_number,
            position, batting_hand, throwing_hand,
            height_cm, weight_kg, birthdate, career, draft
        ))
        conn.commit()
        print(f"登録完了: {player_id}")
    except Exception as e:
        conn.rollback()
        print(f"DB登録エラー: {player_id} - {e}")

    # 各選手ページの処理後に1秒待機
    time.sleep(1)

cursor.close()
conn.close()

登録完了: 
登録完了: index
登録完了: 31133846
登録完了: 93595153
登録完了: 17313868
登録完了: 31133867
登録完了: 51653882
登録完了: 43545157
登録完了: 91895118
登録完了: 53555159
登録完了: 31133825
登録完了: 71675137
登録完了: 63565157
登録完了: 31133829
登録完了: 23925150
登録完了: 43345114
登録完了: 83985132
登録完了: 81685116
登録完了: 63763868
登録完了: 31133864
登録完了: 31135134
登録完了: 71173862
登録完了: 73075150
登録完了: 13315116
登録完了: 91695137
登録完了: 03105137
登録完了: 73575159
登録完了: 11413889
登録完了: 31133849
登録完了: 23325132
登録完了: 03305134
登録完了: 41445111
登録完了: 51153849
登録完了: 03505153
登録完了: 83585155
登録完了: 03305136
登録完了: 31133826
登録完了: 33535150
登録完了: 23325136
登録完了: 61865133
登録完了: 73075153
登録完了: 23325118
登録完了: 33535132
登録完了: 33735155
登録完了: 63365113
登録完了: 23125110
登録完了: 43345136
登録完了: 93595159
登録完了: 31133808
登録完了: 53355117
登録完了: 31133865
登録完了: 03403868
登録完了: 33135114
登録完了: 23525135
登録完了: 83183886
登録完了: 63365139
登録完了: 51353883
登録完了: 11815139
登録完了: 53755155
登録完了: 73775155
登録完了: 31133840
登録完了: 83585157
登録完了: 01605113
登録完了: 51153860
登録完了: 51153809
登録完了: 43145137
登録完了: 31133822
登録完了: 