In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json
import warnings
from tqdm import tqdm
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the 2020s trade history.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the read does
# not depend on the platform's default codec.
with open("player_moves/trade_list_2020s.json", encoding="utf-8") as f:
    trades = json.load(f)

# Load the 2020s draft-pick trade results.
# keep_default_na=False keeps empty cells as '' instead of converting them to NaN;
# later cells compare statizId against '' to detect missing ids.
draft_tickets = pd.read_csv("player_moves/draft_tickets_2020s.csv", keep_default_na=False)

In [3]:
# Preview the first three rows of the draft-pick table.
draft_tickets.head(3)

Unnamed: 0,id,statizId,팀,지명라운드,선수,포지션
0,9,15049,롯데,22-2-3,김세민,내야수
1,10,15027,키움,22-2-4,노운현,투수
2,15,15055,NC,22-2-4,조효원,내야수


#### 선수 기본 정보 크롤링

In [4]:
# Selector prefix of the bio box on a player's profile page; both lookups below extend it.
bio_box_selector = "body > div.warp > div.container > section > div.player_info_header > div.bio.type01 > div.in_box"

# Scrape the basic profile of every player that appears in a trade.
# One row is appended per traded asset; cash ("money") assets are skipped.
player_info_df = []

for trade in tqdm(trades):
    for asset in trade["playerA"] + trade["playerB"]:
        if asset["type"] == "money":
            continue

        if asset["type"] == "draft":
            # Resolve the traded pick to the player recorded for it.
            # .item() raises ValueError unless exactly one row matches.
            pick = draft_tickets.loc[
                (draft_tickets["id"] == trade["id"])
                & (draft_tickets["지명라운드"] == asset["round"])
            ]
            player_id = pick["statizId"].item()
            name = pick["선수"].item()

            # Picks without a statizId are skipped.
            if player_id == '':
                continue
        else:
            name = asset["name"]
            player_id = asset["statizId"]

        player_url = f"https://statiz.sporki.com/player/?m=playerinfo&p_no={player_id}"
        driver = webdriver.Chrome()
        try:
            driver.get(player_url)
            time.sleep(0.5)

            # Three <span> elements: team, position, bats/throws.
            # Unpacking raises ValueError if the page yields a different count.
            summary_spans = driver.find_elements(By.CSS_SELECTOR, f"{bio_box_selector} > div.p_info > div.con > span")
            team, position, tuta = tuple(span.text for span in summary_spans)

            # Five <li> elements; only the text after the first newline of each is kept.
            detail_items = driver.find_elements(By.CSS_SELECTOR, f"{bio_box_selector} > ul > li")
            detail_texts = [item.text for item in detail_items]
            birth, school, draft, career, team_history = tuple(
                text[text.find('\n') + 1:] for text in detail_texts
            )
        finally:
            # Always close the browser, even when the page layout is unexpected,
            # so a failed scrape does not leave a Chrome process behind.
            driver.quit()

        player_info_df.append([player_id, name, team, position, tuta, birth, school, draft, career, team_history])

100%|██████████| 40/40 [16:07<00:00, 24.18s/it]


In [5]:
# Turn the scraped rows into a table, drop rows that are exact duplicates,
# and persist the result (the DataFrame index is written as the first CSV column).
player_info_df = pd.DataFrame(
    player_info_df,
    columns=["statizId", "이름", "팀", "주포지션", "투타", "생년월일", "출신학교", "신인지명", "활약년도", "활약팀"],
).drop_duplicates()
player_info_df.to_csv("player_stats/player_basic_info.csv")

# Alternative to re-running the crawl: reload the saved table and cast the ids to str.
# player_info_df = pd.read_csv("player_stats/player_basic_info.csv")
# player_info_df["statizId"] = player_info_df["statizId"].apply(lambda x : str(x))

#### 연도별 정규시즌 스탯 크롤링

In [6]:
batter_header_url = "https://statiz.sporki.com/player/?m=year&m2=batting&m3=default&p_no=10475&lt=10100&gc="
pitcher_header_url = "https://statiz.sporki.com/player/?m=year&m2=pitching&m3=default&p_no=10475&lt=10100&gc="
fielder_header_url = "https://statiz.sporki.com/player/?m=year&m2=fielding&m3=default&p_no=10475&lt=10100&gc="


def _read_annual_header_rows(url):
    """Return the cell texts of the first and second header rows of the stats table at `url`.

    A fresh Chrome session is opened for the request and is always closed,
    even if loading the page or locating the cells raises.

    Returns:
        (first_row, second_row): two lists of str; a row that is absent from
        the page comes back as an empty list.
    """
    thead_selector = "body > div.warp > div.container > section > div.table_type02.transverse_scroll.cbox > table > thead:nth-child(1)"
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Texts must be read while the session is alive; elements are unusable after quit().
        first_row = [th.text for th in driver.find_elements(By.CSS_SELECTOR, f"{thead_selector} > tr:nth-child(1) > th")]
        second_row = [th.text for th in driver.find_elements(By.CSS_SELECTOR, f"{thead_selector} > tr:nth-child(2) > th")]
    finally:
        driver.quit()
    return first_row, second_row


# Annual batting header: the second-row cells take the place of the
# second-to-last first-row cell (presumably a grouped column - TODO confirm).
batter_top_row, batter_sub_row = _read_annual_header_rows(batter_header_url)
batter_header = batter_top_row[:-2] + batter_sub_row + [batter_top_row[-1]]

# Annual pitching header: only the first header row is used.
pitcher_header = _read_annual_header_rows(pitcher_header_url)[0]

# Annual fielding header: the second-row cells take the place of the
# fifth-from-last first-row cell.
fielder_top_row, fielder_sub_row = _read_annual_header_rows(fielder_header_url)
fielder_header = fielder_top_row[:-5] + fielder_sub_row + fielder_top_row[-4:]

In [7]:
# Main positions treated as position players; every other position takes the pitching path.
batter = ["1B", "2B", "3B", "SS", "LF", "CF", "RF", "C"]

stats_table_selector = "body > div.warp > div.container > section > div.table_type02.transverse_scroll.cbox > table"

# Scrape the year-by-year regular-season tables of every traded player:
# batting OR pitching (chosen by main position), followed by fielding.
for trade in tqdm(trades):
    for asset in trade["playerA"] + trade["playerB"]:
        if asset["type"] == "money":
            continue

        if asset["type"] == "draft":
            # Resolve the traded pick to the player recorded for it.
            # .item() raises ValueError unless exactly one row matches.
            pick = draft_tickets.loc[
                (draft_tickets["id"] == trade["id"])
                & (draft_tickets["지명라운드"] == asset["round"])
            ]
            player_id = pick["statizId"].item()
            name = pick["선수"].item()

            # Picks without a statizId are skipped.
            if player_id == '':
                continue
        else:
            player_id = asset["statizId"]
            name = asset["name"]

        main_position = player_info_df.loc[player_info_df["statizId"] == player_id, "주포지션"].item()

        # Each job: (stat kind used in the URL and output folder, column header,
        # number of <td> cells in a row that lacks its leading Year cell).
        if main_position in batter:
            primary_job = ("batting", batter_header, 32)
        else:
            primary_job = ("pitching", pitcher_header, 35)

        for stat_kind, header, short_row_len in (primary_job, ("fielding", fielder_header, 26)):
            stats_url = f"https://statiz.sporki.com/player/?m=year&m2={stat_kind}&m3=default&p_no={player_id}&lt=10100&gc="

            driver = webdriver.Chrome()
            try:
                driver.get(stats_url)
                stats_table = driver.find_element(By.CSS_SELECTOR, stats_table_selector)

                records = []
                for table_row in stats_table.find_elements(By.CSS_SELECTOR, "table > tbody:nth-child(2) > tr"):
                    cells = [td.text for td in table_row.find_elements(By.CSS_SELECTOR, "td")]
                    # Rows one cell short get an empty Year placeholder so columns line up.
                    if len(cells) == short_row_len:
                        cells = [''] + cells
                    records.append(cells)

                stat_df = pd.DataFrame(records, columns=header)
                # Blank Year cells inherit the year of the closest row above.
                stat_df.loc[stat_df["Year"] == "", "Year"] = None
                # .ffill() replaces the deprecated fillna(method="ffill").
                stat_df["Year"] = stat_df["Year"].ffill()
                stat_df.to_csv(f"player_stats/annual_stats/regular/{stat_kind}_stats/{name}_{player_id}_annualStats.csv")
                time.sleep(0.5)
            finally:
                # Close the browser even if the table is missing or malformed.
                driver.quit()

100%|██████████| 40/40 [37:51<00:00, 56.79s/it]


#### 연도별 포스트시즌 스탯 크롤링

In [11]:
# Headers for the post-season frames: the annual headers with an extra
# "Series" column inserted after the first two columns.
batter_header_post = [*batter_header[:2], "Series", *batter_header[2:]]
pitcher_header_post = [*pitcher_header[:2], "Series", *pitcher_header[2:]]
fielder_header_post = [*fielder_header[:2], "Series", *fielder_header[2:]]

In [15]:
# Main positions treated as position players; every other position takes the pitching path.
batter = ["1B", "2B", "3B", "SS", "LF", "CF", "RF", "C"]

stats_table_selector = "body > div.warp > div.container > section > div.table_type02.transverse_scroll.cbox > table"

# Scrape the year-by-year post-season tables of every traded player:
# batting OR pitching (chosen by main position), followed by fielding.
for trade in tqdm(trades):
    for asset in trade["playerA"] + trade["playerB"]:
        if asset["type"] == "money":
            continue

        if asset["type"] == "draft":
            # Resolve the traded pick to the player recorded for it.
            # .item() raises ValueError unless exactly one row matches.
            pick = draft_tickets.loc[
                (draft_tickets["id"] == trade["id"])
                & (draft_tickets["지명라운드"] == asset["round"])
            ]
            player_id = pick["statizId"].item()
            name = pick["선수"].item()

            # Picks without a statizId are skipped.
            if player_id == '':
                continue
        else:
            player_id = asset["statizId"]
            name = asset["name"]

        main_position = player_info_df.loc[player_info_df["statizId"] == player_id, "주포지션"].item()

        # Each job: (stat kind used in the URL and output folder, column header,
        # number of <td> cells in a row that lacks its leading Year cell).
        if main_position in batter:
            primary_job = ("batting", batter_header_post, 33)
        else:
            primary_job = ("pitching", pitcher_header_post, 36)

        for stat_kind, header, short_row_len in (primary_job, ("fielding", fielder_header_post, 27)):
            stats_url = f"https://statiz.sporki.com/player/?m=year&m2={stat_kind}&m3=default&p_no={player_id}&lt=102%&gc="

            driver = webdriver.Chrome()
            try:
                driver.get(stats_url)
                stats_table = driver.find_element(By.CSS_SELECTOR, stats_table_selector)

                records = []
                for table_row in stats_table.find_elements(By.CSS_SELECTOR, "table > tbody:nth-child(2) > tr"):
                    cells = [td.text for td in table_row.find_elements(By.CSS_SELECTOR, "td")]
                    # Rows one cell short get an empty Year placeholder so columns line up.
                    if len(cells) == short_row_len:
                        cells = [''] + cells
                    records.append(cells)

                stat_df = pd.DataFrame(records, columns=header)
                # Blank Year cells inherit the year of the closest row above.
                stat_df.loc[stat_df["Year"] == "", "Year"] = None
                # .ffill() replaces the deprecated fillna(method="ffill").
                stat_df["Year"] = stat_df["Year"].ffill()
                stat_df.to_csv(f"player_stats/annual_stats/postseason/{stat_kind}_stats/{name}_{player_id}_annualStats.csv")
                time.sleep(0.5)
            finally:
                # Close the browser even if the table is missing or malformed.
                driver.quit()

100%|██████████| 40/40 [34:28<00:00, 51.71s/it]


#### 날짜별 스탯 크롤링

In [100]:
batter_header_url = "https://statiz.sporki.com/player/?m=day&p_no=10475&pos=batting&year=2024"
pitcher_header_url = "https://statiz.sporki.com/player/?m=day&p_no=10652&pos=pitching&year=2024"


def _read_daily_header_cells(url):
    """Return the texts of the header cells of the daily stats table at `url`.

    A fresh Chrome session is opened for the request and is always closed,
    even if loading the page or locating the cells raises.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        header_cells = driver.find_elements(By.CSS_SELECTOR, "body > div.warp > div.container > section > div.table_type02.transverse_scroll.cbox > table > thead:nth-child(1) > tr > th")
        # Texts must be read while the session is alive.
        return [th.text for th in header_cells]
    finally:
        driver.quit()


# Columns built by the daily crawl in place of the first two scraped header cells:
# date, game type, home-game flag, opponent, win/loss.
daily_leading_columns = ["날짜", "경기타입", "홈경기", "상대", "승패"]

# NOTE: this rebinds batter_header / pitcher_header, which up to this point held
# the annual-table headers; cells that need the annual headers must run before this one.
# Daily batting header
batter_header = daily_leading_columns + _read_daily_header_cells(batter_header_url)[2:]

# Daily pitching header
pitcher_header = daily_leading_columns + _read_daily_header_cells(pitcher_header_url)[2:]

In [123]:
# Main positions treated as position players; every other position takes the pitching path.
batter = ["1B", "2B", "3B", "SS", "LF", "CF", "RF", "C"]

daily_table_selector = "body > div.warp > div.container > section > div.table_type02.transverse_scroll.cbox > table"

# Scrape the game-by-game log of every traded player for every season listed on
# the player's page, then split it into regular-season ("SS") and other games.
# tqdm wraps the list directly so the progress bar knows the total.
for trade in tqdm(trades):
    for asset in trade["playerA"] + trade["playerB"]:
        if asset["type"] == "money":
            continue

        if asset["type"] == "draft":
            # Resolve the traded pick to the player recorded for it.
            # .item() raises ValueError unless exactly one row matches.
            pick = draft_tickets.loc[
                (draft_tickets["id"] == trade["id"])
                & (draft_tickets["지명라운드"] == asset["round"])
            ]
            player_id = pick["statizId"].item()
            name = pick["선수"].item()

            # Picks without a statizId are skipped.
            if player_id == '':
                continue
        else:
            player_id = asset["statizId"]
            name = asset["name"]

        main_position = player_info_df.loc[player_info_df["statizId"] == player_id, "주포지션"].item()
        if main_position in batter:
            stat_kind, header = "batting", batter_header
        else:
            stat_kind, header = "pitching", pitcher_header

        game_records = []

        # One browser session serves all of this player's pages and is always closed.
        # Previously a new Chrome was started per season and only the last one was quit.
        driver = webdriver.Chrome()
        try:
            # The 2024 page is only used to read the list of available seasons.
            driver.get(f"https://statiz.sporki.com/player/?m=day&p_no={player_id}&pos={stat_kind}&year=2024")
            seasons = [li.get_attribute('innerText') for li in driver.find_elements(By.CSS_SELECTOR, "#select_year > ul > li")]

            for year in seasons:
                driver.get(f"https://statiz.sporki.com/player/?m=day&p_no={player_id}&pos={stat_kind}&year={year}")
                season_table = driver.find_element(By.CSS_SELECTOR, daily_table_selector)

                # Direct children of the table, in document order: each <thead>
                # sets the game type that applies to the <tbody> sections after it.
                for section in season_table.find_elements(By.XPATH, "./*"):
                    if section.tag_name == "thead":
                        label = section.find_element(By.CSS_SELECTOR, "tr > th:nth-child(1)").text
                        # Keep what follows the first "-" plus one more character.
                        game_type = label[label.find("-")+2:]
                    elif section.tag_name == "tbody":
                        for game_row in section.find_elements(By.CSS_SELECTOR, "tr"):
                            cells = [td.text for td in game_row.find_elements(By.CSS_SELECTOR, "td")]
                            opponent, result = cells[1], cells[2]
                            # A leading "@" on the opponent is stored as home flag "0".
                            is_away = opponent[0] == "@"
                            # NOTE(review): any result that does not start with "W" is stored
                            # as a loss - confirm how the site reports draws.
                            game_records.append(
                                [
                                    str(year) + '-' + str(cells[0]),
                                    game_type,
                                    "0" if is_away else "1",
                                    opponent[1:] if is_away else opponent,
                                    "승리" if result[0] == "W" else "패배",
                                    result[1:],
                                ]
                                + cells[3:]
                            )
        finally:
            driver.quit()

        stat_df = pd.DataFrame(game_records, columns=header).sort_values(by=["날짜"])
        regular_df = stat_df[stat_df["경기타입"] == "SS"].reset_index(drop=True)
        postseason_df = stat_df[stat_df["경기타입"] != "SS"].reset_index(drop=True)
        regular_df.to_csv(f"player_stats/daily_stats/regular/{stat_kind}_stats/{name}_{player_id}_dailyStats.csv")
        postseason_df.to_csv(f"player_stats/daily_stats/postseason/{stat_kind}_stats/{name}_{player_id}_dailyStats.csv")

3it [22:36, 452.27s/it]
