In [9]:
import json
import requests
from bs4 import BeautifulSoup
import psycopg2
from datetime import datetime
import time
import sys
import logging

# Helper functions: numeric conversion (None when the value cannot be converted)
def safe_int(text):
    """Convert ``text`` to an ``int``, or return ``None`` if it is not convertible.

    Only conversion failures are absorbed. A bare ``except:`` would also
    swallow ``KeyboardInterrupt`` and ``SystemExit``, making a long scrape
    impossible to interrupt cleanly.

    Args:
        text: Value to convert, normally the stripped text of a table cell.

    Returns:
        The integer value, or ``None`` when ``int(text)`` fails.
    """
    try:
        return int(text)
    except (TypeError, ValueError, OverflowError):
        return None

def safe_float(text):
    """Convert ``text`` to a ``float``, or return ``None`` if it is not convertible.

    Only conversion failures are absorbed; a bare ``except:`` would also hide
    ``KeyboardInterrupt`` and ``SystemExit``.

    Args:
        text: Value to convert, normally the stripped text of a table cell.

    Returns:
        The float value, or ``None`` when ``float(text)`` fails.
    """
    try:
        return float(text)
    except (TypeError, ValueError, OverflowError):
        return None

# ① Load the DB connection settings from config.json.
# The encoding is given explicitly: open() otherwise falls back to the
# locale's preferred encoding, which can fail on non-ASCII values in the file.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
DB_NAME = config["DB_NAME"]
DB_USER = config["DB_USER"]
DB_PASSWORD = config["DB_PASSWORD"]
DB_HOST = config["DB_HOST"]
# DB_PORT is the only optional key; it falls back to "5432".
DB_PORT = config.get("DB_PORT", "5432")

# ② Connect to the database and open the cursor used by the steps below.
conn = psycopg2.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT
)
cursor = conn.cursor()

# ③ Page URL of the target player (sample)
url = "https://npb.jp/bis/players/81985133.html"
# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# ④ Extract the player's identity
# player_id is the page file name without its ".html" suffix.
# str.rstrip(".html") strips any trailing run of the characters ".", "h", "t",
# "m", "l" rather than the suffix, so the suffix is removed explicitly instead.
page_name = url.rsplit("/", 1)[-1]
player_id = page_name[:-len(".html")] if page_name.endswith(".html") else page_name



# ⑥ Extract and register the pitching stats of the page held in `soup`.
# Each stats row replaces any existing DB row with the same
# (player_id, year, team): the DELETE and the INSERT run in one transaction,
# so a failed INSERT no longer leaves the old row deleted.
pitching_div = soup.find("div", id="stats_p")
if pitching_div:
    table_p = pitching_div.find("table", id="tablefix_p")
    if table_p:
        sql_pitch_delete = (
            "DELETE FROM player_pitching_stats "
            "WHERE player_id = %s AND year = %s AND team = %s"
        )
        sql_pitch = """
        INSERT INTO player_pitching_stats
          (player_id, year, team, appearances, wins, losses, saves, H, HP, complete_games, shutouts, no_walk_games, win_rate, batters_faced, innings_pitched, hits_allowed, home_runs_allowed, walks, hit_by_pitch, strikeouts, wild_pitches, balks, runs_allowed, earned_runs, ERA)
        VALUES
          (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """

        rows_p = []
        # Regular season rows live in <tbody>.
        tbody_p = table_p.find("tbody")
        if tbody_p:
            rows_p.extend(tbody_p.find_all("tr", class_="registerStats"))
        # The career-total row lives in <tfoot>. This search is recursive, so
        # rows of nested tables are returned too; they are dropped by the
        # cell-count check below.
        tfoot_p = table_p.find("tfoot")
        if tfoot_p:
            rows_p.extend(tfoot_p.find_all("tr"))

        for row in rows_p:
            # Take both <td> and <th>, direct children only.
            cells = row.find_all(["td", "th"], recursive=False)
            if len(cells) < 24:
                continue

            # An empty year cell marks the career-total row (stored as 9999).
            year_text = cells[0].get_text(strip=True)
            if not year_text:
                p_year = 9999
            else:
                try:
                    p_year = int(year_text)
                except ValueError:
                    # Not a stats row (non-numeric year); skip it.
                    continue

            # The team cell of the career-total row reads "通　算"; store "".
            team_text = cells[1].get_text(strip=True)
            p_team = "" if team_text == "通　算" else team_text

            appearances = safe_int(cells[2].get_text(strip=True))
            wins = safe_int(cells[3].get_text(strip=True))
            losses = safe_int(cells[4].get_text(strip=True))
            saves = safe_int(cells[5].get_text(strip=True))
            p_H = safe_int(cells[6].get_text(strip=True))
            p_HP = safe_int(cells[7].get_text(strip=True))
            complete_games = safe_int(cells[8].get_text(strip=True))
            shutouts = safe_int(cells[9].get_text(strip=True))
            no_walk_games = safe_int(cells[10].get_text(strip=True))
            win_rate = safe_float(cells[11].get_text(strip=True))
            batters_faced = safe_int(cells[12].get_text(strip=True))

            # Innings pitched sit in a nested table inside cell 13: <th> holds
            # the whole part and <td> the fractional part. The two texts are
            # concatenated as-is, so the <td> text presumably carries its own
            # leading "." -- TODO confirm against the page markup.
            ip_table = cells[13].find("table", class_="table_inning")
            tr_ip = ip_table.find("tr") if ip_table is not None else None
            if tr_ip is not None:
                ip_th = tr_ip.find("th")
                ip_td = tr_ip.find("td")
                ip_int = ip_th.get_text(strip=True) if ip_th is not None else "0"
                # A missing fraction must contribute nothing: appending "0"
                # here would turn "12" into "120".
                ip_frac = ip_td.get_text(strip=True) if ip_td is not None else ""
                innings_pitched = safe_float(ip_int + ip_frac)
            else:
                innings_pitched = None

            hits_allowed = safe_int(cells[14].get_text(strip=True))
            home_runs_allowed = safe_int(cells[15].get_text(strip=True))
            walks = safe_int(cells[16].get_text(strip=True))
            hit_by_pitch = safe_int(cells[17].get_text(strip=True))
            strikeouts = safe_int(cells[18].get_text(strip=True))
            wild_pitches = safe_int(cells[19].get_text(strip=True))
            balks = safe_int(cells[20].get_text(strip=True))
            runs_allowed = safe_int(cells[21].get_text(strip=True))
            earned_runs = safe_int(cells[22].get_text(strip=True))
            ERA = safe_float(cells[23].get_text(strip=True))

            # Replace the existing record: delete + insert, committed together.
            try:
                cursor.execute(sql_pitch_delete, (player_id, p_year, p_team))
                cursor.execute(sql_pitch, (
                    player_id, p_year, p_team, appearances, wins, losses, saves, p_H, p_HP, complete_games,
                    shutouts, no_walk_games, win_rate, batters_faced, innings_pitched, hits_allowed, home_runs_allowed,
                    walks, hit_by_pitch, strikeouts, wild_pitches, balks, runs_allowed, earned_runs, ERA
                ))
                conn.commit()
            except psycopg2.Error as e:
                # Roll back so the connection stays usable for the next row.
                conn.rollback()
                print(f"投手成績登録エラー: {player_id} 年度 {p_year} - {e}")
                continue
            print(f"投手成績登録完了: {player_id} 年度 {p_year}")
    else:
        print("投手成績テーブルが見つかりませんでした。")
else:
    print("投手成績セクションは存在しません。")

# ⑦ Extract and register the batting stats of the page held in `soup`.
# Each stats row replaces any existing DB row with the same
# (player_id, year, team): the DELETE and the INSERT run in one transaction,
# so a failed INSERT no longer leaves the old row deleted.
batting_div = soup.find("div", id="stats_b")
if batting_div:
    table_b = batting_div.find("table", id="tablefix_b")
    if table_b:
        sql_bat_delete = (
            "DELETE FROM player_batting_stats "
            "WHERE player_id = %s AND year = %s AND team = %s"
        )
        sql_bat = """
        INSERT INTO player_batting_stats
          (player_id, year, team, games, plate_appearances, at_bats, runs, hits, doubles, triples, home_runs, total_bases, RBI, stolen_bases, caught_stealing, sacrifice_hit, sacrifice_fly, walks, hit_by_pitch, strikeouts, grounded_into_dp, batting_average, slugging_percentage, on_base_percentage)
        VALUES
          (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """

        rows_b = []
        # Regular season rows: direct <tr class="registerStats"> children of <tbody>.
        tbody_b = table_b.find("tbody")
        if tbody_b:
            rows_b.extend(tbody_b.find_all("tr", class_="registerStats", recursive=False))
        # Career-total row: direct <tr> children of <tfoot>.
        tfoot_b = table_b.find("tfoot")
        if tfoot_b:
            rows_b.extend(tfoot_b.find_all("tr", recursive=False))

        for row in rows_b:
            # Take both <td> and <th>, direct children only.
            cells = row.find_all(["td", "th"], recursive=False)
            if len(cells) < 23:
                continue

            # An empty year cell marks the career-total row (stored as 9999).
            year_text = cells[0].get_text(strip=True)
            if not year_text:
                b_year = 9999
            else:
                try:
                    b_year = int(year_text)
                except ValueError:
                    # Not a stats row (non-numeric year); skip it.
                    continue

            # The team cell of the career-total row reads "通　算"; store "".
            team_text = cells[1].get_text(strip=True)
            b_team = "" if team_text == "通　算" else team_text

            games = safe_int(cells[2].get_text(strip=True))
            plate_appearances = safe_int(cells[3].get_text(strip=True))
            at_bats = safe_int(cells[4].get_text(strip=True))
            runs = safe_int(cells[5].get_text(strip=True))
            hits = safe_int(cells[6].get_text(strip=True))
            doubles = safe_int(cells[7].get_text(strip=True))
            triples = safe_int(cells[8].get_text(strip=True))
            home_runs = safe_int(cells[9].get_text(strip=True))
            total_bases = safe_int(cells[10].get_text(strip=True))
            RBI = safe_int(cells[11].get_text(strip=True))
            stolen_bases = safe_int(cells[12].get_text(strip=True))
            caught_stealing = safe_int(cells[13].get_text(strip=True))
            sacrifice_hit = safe_int(cells[14].get_text(strip=True))
            sacrifice_fly = safe_int(cells[15].get_text(strip=True))
            walks_b = safe_int(cells[16].get_text(strip=True))
            hit_by_pitch_b = safe_int(cells[17].get_text(strip=True))
            strikeouts_b = safe_int(cells[18].get_text(strip=True))
            grounded_into_dp = safe_int(cells[19].get_text(strip=True))
            batting_average = safe_float(cells[20].get_text(strip=True))
            slugging_percentage = safe_float(cells[21].get_text(strip=True))
            on_base_percentage = safe_float(cells[22].get_text(strip=True))

            # Replace the existing record: delete + insert, committed together.
            try:
                cursor.execute(sql_bat_delete, (player_id, b_year, b_team))
                cursor.execute(sql_bat, (
                    player_id, b_year, b_team, games, plate_appearances, at_bats, runs, hits, doubles,
                    triples, home_runs, total_bases, RBI, stolen_bases, caught_stealing, sacrifice_hit,
                    sacrifice_fly, walks_b, hit_by_pitch_b, strikeouts_b, grounded_into_dp, batting_average,
                    slugging_percentage, on_base_percentage
                ))
                conn.commit()
            except psycopg2.Error as e:
                # Roll back so the connection stays usable for the next row.
                conn.rollback()
                print(f"打撃成績登録エラー: {player_id} 年度 {b_year} - {e}")
                continue
            print(f"打撃成績登録完了: {player_id} 年度 {b_year}")
    else:
        print("打撃成績テーブルが見つかりませんでした。")
else:
    print("打撃成績セクションは存在しません。")

# ⑧ Finally, release the database resources: the cursor first, then the
# connection it was created from.
for db_resource in (cursor, conn):
    db_resource.close()


投手成績登録完了: 81985133 年度 2011
投手成績登録完了: 81985133 年度 2012
投手成績登録完了: 81985133 年度 2013
投手成績登録完了: 81985133 年度 2014
投手成績登録完了: 81985133 年度 2015
投手成績登録完了: 81985133 年度 2016
投手成績登録完了: 81985133 年度 2018
投手成績登録完了: 81985133 年度 2019
投手成績登録完了: 81985133 年度 2020
投手成績登録完了: 81985133 年度 2020
投手成績登録完了: 81985133 年度 2023
投手成績登録完了: 81985133 年度 2024
投手成績登録完了: 81985133 年度 9999
打撃成績登録完了: 81985133 年度 2011
打撃成績登録完了: 81985133 年度 2012
打撃成績登録完了: 81985133 年度 2013
打撃成績登録完了: 81985133 年度 2014
打撃成績登録完了: 81985133 年度 2015
打撃成績登録完了: 81985133 年度 2016
打撃成績登録完了: 81985133 年度 2018
打撃成績登録完了: 81985133 年度 2019
打撃成績登録完了: 81985133 年度 2020
打撃成績登録完了: 81985133 年度 2020
打撃成績登録完了: 81985133 年度 2023
打撃成績登録完了: 81985133 年度 2024
打撃成績登録完了: 81985133 年度 9999


In [2]:
import json
import requests
from bs4 import BeautifulSoup
import psycopg2
from datetime import datetime
import time
import sys
import logging

In [3]:
# ② File names of the player-list index pages, one per kana syllable in
# gojuon order (a, i, u, e, o ... wa). Each name is "index_<syllable>.html".
letter_pages = [
    f"index_{syllable}.html"
    for syllable in (
        *"aiueo",                                             # vowel row
        *(c + v for c in "kstnhm" for v in "aiueo"),          # ka, sa, ta, na, ha, ma rows
        "ya", "yu", "yo",                                     # ya row has three pages
        *("r" + v for v in "aiueo"),                          # ra row
        "wa",                                                 # wa row has one page
    )
]
# Directory that the index page file names are appended to.
base_index_url = "https://npb.jp/bis/players/all/"

# Accumulator for the individual page URL of every player.
all_player_urls = []

In [4]:
# ③ Visit every index page and collect the individual player-page URLs.
# `all_player_urls` keeps first-seen order; the set mirrors it so that the
# duplicate check is O(1) instead of a scan over the whole list per link.
seen_player_urls = set(all_player_urls)
for page in letter_pages:
    index_url = base_index_url + page
    try:
        # A timeout keeps one stalled request from hanging the whole crawl.
        response = requests.get(index_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Indexページ取得エラー: {index_url} - {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    # Keep the <a> tags whose href starts with '/bis/players/'.
    # NOTE(review): this prefix would also match non-player links under
    # /bis/players/ if the index pages contain any -- confirm.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not href.startswith('/bis/players/'):
            continue
        player_url = "https://npb.jp" + href
        if player_url not in seen_player_urls:
            seen_player_urls.add(player_url)
            all_player_urls.append(player_url)
    # Wait one second between index pages.
    time.sleep(1)

print(f"全{len(all_player_urls)}件の選手個別ページURLを収集しました。")

全7776件の選手個別ページURLを収集しました。


In [5]:
# Logging setup: records of level INFO and above are written to the file
# "process.log" (UTF-8). The file is opened in "w" mode, so it is truncated
# when the handler is created.
process_log_handler = logging.FileHandler("process.log", mode="w", encoding="utf-8")
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    level=logging.INFO,
    handlers=[process_log_handler],
)

In [6]:
# Helper functions: blank or unparsable text becomes 0 (int) or 0.0 (float)
def safe_int(text):
    """Convert cell text to an ``int``, mapping blank or unparsable text to 0.

    Args:
        text: Cell text (a ``str``); surrounding whitespace is ignored.

    Returns:
        The integer value, or ``0`` when the text is empty or not an integer.
    """
    text = text.strip()
    if text == "":
        return 0
    try:
        return int(text)
    except ValueError:
        # `text` is a str here, so ValueError is the only conversion failure;
        # a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return 0

def safe_float(text):
    """Convert cell text to a ``float``, mapping blank or unparsable text to 0.0.

    Args:
        text: Cell text (a ``str``); surrounding whitespace is ignored.

    Returns:
        The float value, or ``0.0`` when the text is empty or not a number.
    """
    text = text.strip()
    if text == "":
        return 0.0
    try:
        return float(text)
    except ValueError:
        # `text` is a str here, so ValueError is the only conversion failure;
        # a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return 0.0

In [10]:
# Crawl every collected player page and store its pitching and batting stats.
#
# Fixes relative to the previous version of this cell:
#   * The DELETE statements used the stale `player_id` left over from the
#     single-player cell, so the wrong player's rows were removed and the
#     current player's rows piled up as duplicates. The key is now always the
#     id of the page being processed.
#   * DELETE and INSERT for a row are committed as one transaction and a DB
#     error is logged and rolled back instead of aborting the whole crawl.
#   * The cursor and connection are closed even when the crawl is interrupted.

# DB connection used for the whole crawl.
conn = psycopg2.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT
)
cursor = conn.cursor()


def _player_id_from_url(page_url):
    """Return the last path segment of ``page_url`` without a ``.html`` suffix."""
    page_name = page_url.rsplit("/", 1)[-1]
    # str.rstrip(".html") strips a character set, not a suffix, so the suffix
    # is cut off explicitly.
    if page_name.endswith(".html"):
        page_name = page_name[:-len(".html")]
    return page_name


def _stat_rows(table, recursive):
    """Return the season rows (<tbody>) followed by the total rows (<tfoot>)."""
    rows = []
    body = table.find("tbody")
    if body is not None:
        rows.extend(body.find_all("tr", class_="registerStats", recursive=recursive))
    foot = table.find("tfoot")
    if foot is not None:
        rows.extend(foot.find_all("tr", recursive=recursive))
    return rows


def _year_and_team(cells):
    """Return ``(year, team)`` for a row, or ``None`` if the year is not numeric."""
    year_text = cells[0].get_text(strip=True)
    if not year_text:
        # An empty year cell marks the career-total row.
        year = 9999
    else:
        try:
            year = int(year_text)
        except ValueError:
            return None
    team = cells[1].get_text(strip=True)
    # The team cell of the career-total row reads "通　算"; store "" instead.
    if team == "通　算":
        team = ""
    return year, team


def _innings_pitched(cell):
    """Read innings pitched from the nested ``table_inning`` table in ``cell``."""
    inning_table = cell.find("table", class_="table_inning")
    inning_row = inning_table.find("tr") if inning_table is not None else None
    if inning_row is None:
        return 0.0
    whole_cell = inning_row.find("th")
    fraction_cell = inning_row.find("td")
    whole = whole_cell.get_text(strip=True) if whole_cell is not None else "0"
    # The two texts are concatenated as-is, so the <td> text presumably
    # carries its own leading "." -- TODO confirm against the page markup.
    # A missing fraction contributes nothing ("12" + "0" would read as 120).
    fraction = fraction_cell.get_text(strip=True) if fraction_cell is not None else ""
    return safe_float(whole + fraction)


def _pitching_values(cells):
    """Convert cells[2..23] of a pitching row, in INSERT column order."""
    values = []
    for index in range(2, 24):
        if index == 13:
            # innings_pitched lives in a nested table
            values.append(_innings_pitched(cells[index]))
        elif index in (11, 23):
            # win_rate and ERA are fractional
            values.append(safe_float(cells[index].get_text()))
        else:
            values.append(safe_int(cells[index].get_text()))
    return values


def _batting_values(cells):
    """Convert cells[2..22] of a batting row, in INSERT column order."""
    values = []
    for index in range(2, 23):
        # The last three cells (batting average, slugging, on-base) are fractional.
        convert = safe_float if index >= 20 else safe_int
        values.append(convert(cells[index].get_text()))
    return values


# Per-table settings. `label` is the prefix of every log message for the
# table; `recursive` is the row-search mode the original code used for it.
_PITCHING_SPEC = {
    "label": "投手成績",
    "div_id": "stats_p",
    "table_id": "tablefix_p",
    "min_cells": 24,
    "recursive": True,
    "parse": _pitching_values,
    "delete_sql": (
        "DELETE FROM player_pitching_stats "
        "WHERE player_id = %s AND year = %s AND team = %s"
    ),
    "insert_sql": """
        INSERT INTO player_pitching_stats
          (player_id, year, team, appearances, wins, losses, saves, H, HP, complete_games, shutouts, no_walk_games, win_rate, batters_faced, innings_pitched, hits_allowed, home_runs_allowed, walks, hit_by_pitch, strikeouts, wild_pitches, balks, runs_allowed, earned_runs, ERA)
        VALUES
          (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """,
}
_BATTING_SPEC = {
    "label": "打撃成績",
    "div_id": "stats_b",
    "table_id": "tablefix_b",
    "min_cells": 23,
    "recursive": False,
    "parse": _batting_values,
    "delete_sql": (
        "DELETE FROM player_batting_stats "
        "WHERE player_id = %s AND year = %s AND team = %s"
    ),
    "insert_sql": """
        INSERT INTO player_batting_stats
          (player_id, year, team, games, plate_appearances, at_bats, runs, hits, doubles, triples, home_runs, total_bases, RBI, stolen_bases, caught_stealing, sacrifice_hit, sacrifice_fly, walks, hit_by_pitch, strikeouts, grounded_into_dp, batting_average, slugging_percentage, on_base_percentage)
        VALUES
          (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """,
}


def _register_stats(db_conn, db_cursor, page_soup, page_url, player_key, spec):
    """Store every stats row of one table (pitching or batting) of a page.

    Rows with too few cells or a non-numeric year are skipped. Each remaining
    row replaces the DB row with the same (player_id, year, team).
    """
    label = spec["label"]
    section = page_soup.find("div", id=spec["div_id"])
    if section is None:
        logging.info(f"{label}セクションは存在しません: {page_url}")
        return
    table = section.find("table", id=spec["table_id"])
    if table is None:
        logging.info(f"{label}テーブルが見つかりませんでした: {page_url}")
        return

    for row in _stat_rows(table, spec["recursive"]):
        # Direct <td>/<th> children only, so nested tables are not counted.
        cells = row.find_all(["td", "th"], recursive=False)
        if len(cells) < spec["min_cells"]:
            continue
        year_team = _year_and_team(cells)
        if year_team is None:
            continue
        year, team = year_team
        row_key = (player_key, year, team)
        stats = spec["parse"](cells)
        try:
            # Delete + insert in a single transaction: either both happen or
            # the previously stored row is kept.
            db_cursor.execute(spec["delete_sql"], row_key)
            db_cursor.execute(spec["insert_sql"], (*row_key, *stats))
            db_conn.commit()
        except psycopg2.Error as e:
            # Roll back so the connection stays usable for the next row.
            db_conn.rollback()
            logging.error(f"{label}登録エラー: {player_key} 年度 {year} - {e}")
            continue
        logging.info(f"{label}登録完了: {player_key} 年度 {year}")


total = len(all_player_urls)
logging.info(f"全 {total} 件の選手ページの処理を開始します。")

try:
    for idx, url in enumerate(all_player_urls, start=1):
        # Show progress on a single console line, overwriting the previous one.
        progress_msg = f"処理中: {idx}/{total} - {url}"
        sys.stdout.write("\r" + progress_msg)
        sys.stdout.flush()

        try:
            # A timeout keeps one stalled request from hanging the whole crawl.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logging.error(f"URL取得エラー: {url} - {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        # player_id is the page file name without ".html".
        current_player_id = _player_id_from_url(url)

        _register_stats(conn, cursor, soup, url, current_player_id, _PITCHING_SPEC)
        _register_stats(conn, cursor, soup, url, current_player_id, _BATTING_SPEC)

        # Wait one second after each player page.
        time.sleep(1)
finally:
    # Release the DB resources even if the crawl is interrupted.
    cursor.close()
    conn.close()
    sys.stdout.write("\n")  # end the progress line

logging.info("全選手の成績登録が完了しました。")

処理中: 7776/7776 - https://npb.jp/bis/players/23525138.html
