# 라이브러리 설치 및 Import

In [None]:
!pip3 install selenium webdriver_manager beautifulsoup4 pandas
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd

# 함수 설정

In [None]:
# Initialise the Chrome web driver.
options = webdriver.ChromeOptions()
# The Service is built from whatever ChromeDriverManager().install() returns.
service = Service(ChromeDriverManager().install())
# Module-level driver instance; get_soup() navigates with it.
driver = webdriver.Chrome(service=service, options=options)

# Crawl a page and return it as a soup.
def get_soup(url, wait_seconds=3):
    """Load ``url`` with the module-level ``driver`` and parse the page.

    Args:
        url: Address passed to ``driver.get``.
        wait_seconds: Seconds to sleep after a successful navigation so the
            page can finish loading. Defaults to 3.

    Returns:
        A ``BeautifulSoup`` built from ``driver.page_source`` using the
        ``html.parser`` backend.

    Navigation errors are printed and not re-raised (best effort); the
    returned soup then reflects whatever ``driver.page_source`` holds at
    that moment, so callers should validate the elements they look up.
    """
    try:
        driver.get(url)
    except Exception as e:
        # Name the failing URL so the log line is actionable on its own.
        print(f'Failed to load {url}: {e}')
    else:
        # Only wait when navigation actually started successfully.
        sleep(wait_seconds)

    return BeautifulSoup(driver.page_source, "html.parser")

# Load the CSV at file_path into a DataFrame.
def reading_csv(file_path):
    """Read a CSV file, trying several text encodings in turn.

    UTF-8 is tried first because it is strict: text in another encoding
    almost never decodes as valid UTF-8, whereas UTF-8 bytes can decode
    "successfully" as cp949 and produce garbled text. ISO-8859-1 accepts any
    byte sequence, so it is kept last as the catch-all.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame from the first encoding that decodes the file, or
        ``None`` if the file cannot be opened or parsed.
    """
    encodings = ['utf-8', 'cp949', 'cp1252', 'ISO-8859-1']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeError:
            # Wrong encoding guess; move on to the next candidate.
            continue
        except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError):
            # Missing, unreadable or malformed file: another encoding will
            # not help, so report failure the way callers already expect.
            return None
    return None


# 선수별 ID 조회

In [None]:
# Load the player IDs saved by earlier runs.
PATH_ID_TXT = "txt_data/id.txt"
players = set()
# On a first run the ID file does not exist yet; start from an empty set
# instead of failing with FileNotFoundError.
if os.path.exists(PATH_ID_TXT):
    with open(PATH_ID_TXT, 'r', encoding='utf-8') as f:
        # Ignore blank lines so an empty string never becomes a player ID.
        players.update(line.strip() for line in f if line.strip())

# Look up player IDs by crawling.
def get_player_id():
    """Collect player IDs from the pitching stats listing page.

    For every table row, the first ``<td style="text-align:left;">`` cell is
    inspected; if it holds a link whose ``href`` contains ``p_no=``, the text
    following that marker is taken as the ID. Rows without such a cell or
    link (e.g. header rows) are skipped.

    Returns:
        set[str]: The IDs found on the page.

    Raises:
        RuntimeError: If the page contains no ``<table>`` at all.
    """
    url = 'https://statiz.sporki.com/stats/?m=main&m2=pitching&m3=default&so=&ob=&year=2024&sy=2013&ey=2023&te=&po=&lt=10100&reg=R&pe=&ds=&de=&we=&hr=&ha=&ct=&st=&vp=&bo=&pt=&pp=&ii=&vc=&um=&oo=&rr=&sc=&bc=&ba=&li=&as=&ae=&pl=&gc=&lr=&pr=1000&ph=&hs=&us=&na=&ls=0&sf1=G&sk1=&sv1=&sf2=G&sk2=&sv2=-25'

    soup = get_soup(url)
    table = soup.find('table')
    if table is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise RuntimeError(f'No <table> found on the stats page: {url}')

    id_collection = set()
    for tr in table.find_all('tr'):
        cells = tr.find_all("td", {"style": "text-align:left;"})
        if not cells:
            continue
        link = cells[0].find('a')
        href = link.get('href') if link is not None else None
        if not href:
            continue
        parts = href.split('p_no=')
        if len(parts) < 2:
            # The link does not carry a player number.
            continue
        id_collection.add(parts[1])

    return id_collection

# Save the IDs.
crawling_id = get_player_id()
# IDs seen for the first time in this crawl.
new_ids = crawling_id - players
# Make sure the target directory exists before appending to the file.
os.makedirs(os.path.dirname(PATH_ID_TXT) or '.', exist_ok=True)
with open(PATH_ID_TXT, 'a', encoding='utf-8') as f:
    # Sorted so the appended order is deterministic between runs.
    f.writelines(f'{player_id}\n' for player_id in sorted(new_ids))
# NOTE(review): from here on `players` holds only the IDs crawled in this
# run; previously stored IDs that were not crawled again are dropped from
# the in-memory set (the file keeps them). Confirm this is intended.
players = crawling_id


# 선수 정보 조회

In [None]:
BASE_URL = 'https://statiz.sporki.com/player/?m=analysis&p_no='


# Look up a player's profile from the ID.
def get_player_info(id):
    """Crawl the player page for ``id`` and return its basic profile.

    Args:
        id: Player number appended to ``BASE_URL``.

    Returns:
        list: ``[id, name, birth_year, throwingHand, begin, end]`` where
        ``throwingHand`` is 0 when the '투타' entry starts with '좌' (left)
        and 1 otherwise, and ``begin``/``end`` are the two years of the
        '활약연도' entry.

    Raises:
        AttributeError, KeyError, ValueError: If an expected element or
            profile entry is missing or not in the expected format.
    """
    soup = get_soup(f'{BASE_URL}{id}')

    name = soup.find('div', {'class': 't_name'}).text.strip()

    # Each <li> of the profile list pairs a <span> label with an <em> value.
    profile = {}
    for item in soup.find('ul', {'class': 'profile'}).find_all('li'):
        label = item.find('span').text.strip()
        profile[label] = item.find('em').text.strip()

    def leading_year(text):
        # The year is whatever precedes the first '년' marker.
        return int(text.split('년')[0])

    birth_year = leading_year(profile['생년월일'])
    throwingHand = 1 if profile['투타'][0] != '좌' else 0

    first, last = profile['활약연도'].split(' ~ ')
    begin, end = leading_year(first), leading_year(last)

    return [id, name, birth_year, throwingHand, begin, end]


In [None]:
# Player info per ID: load the cached CSV if present, otherwise crawl it.
PATH_PLAYER_INFO_CSV = 'csv_data/player_info.csv'
if os.path.exists(PATH_PLAYER_INFO_CSV):
    df_data = reading_csv(PATH_PLAYER_INFO_CSV)
else:
    # Create the output directory up front so the save at the end cannot
    # fail after the whole crawl has already been done.
    os.makedirs(os.path.dirname(PATH_PLAYER_INFO_CSV) or '.', exist_ok=True)

    data_box = [['ID', 'NAME', 'BIRTH_YEAR', 'THROWING_HAND', 'BEGIN', 'END']]
    total = len(players)
    for i, id in enumerate(players):
        progress = f'{100*(i/total):.2f}%'
        print(f'\rProgress: {progress}...', end = '')
        try:
            data_box.append(get_player_info(id))
        except Exception as e:
            # Best effort: skip players whose page cannot be parsed, but say
            # so instead of dropping them silently.
            print(f'\nSkipping player {id}: {e}')

    df_data = pd.DataFrame(data_box[1:], columns=data_box[0])
    # index=False keeps the cached file free of an extra unnamed index
    # column, so a reloaded frame has the same columns as a fresh one.
    df_data.to_csv(PATH_PLAYER_INFO_CSV, index=False)


# Build one dict per player from the info frame; only the fields listed
# here are copied (BIRTH_YEAR is not).
player_infos = [
    {key: data[key] for key in ('ID', 'NAME', 'THROWING_HAND', 'BEGIN', 'END')}
    for data in (df_data.iloc[i] for i in range(len(df_data)))
]

df_data

# 시즌별 성적 및 투구 유형 조회

In [None]:
# Pitch-type lookup by season and ID (pitch composition).
def pitch_composition(id, year):
    """Crawl the pitch composition table of player ``id`` for ``year``.

    The first table on the page is used: its first row supplies the headers
    and its second row the values. Cells that parse as numbers become
    ``float``; all others are kept as stripped text.

    Args:
        id: Player number used in the URL.
        year: Season used in the URL.

    Returns:
        dict | None: Header -> value mapping with the '상대팀' column
        removed, or ``None`` when the page has no table, the table has no
        data row, or the header and value counts differ.

    Raises:
        KeyError: If the table has no '상대팀' column.
    """
    # Crawl the page.
    url = f'https://statiz.sporki.com/player/?m=analysis&p_no={id}&pos=pitching&year={year}&si1=1&si2=1'
    soup = get_soup(url)

    # Read the composition table.
    table = soup.find('table')
    if table is None:
        return None
    tr_box = table.find_all('tr')
    if len(tr_box) < 2:
        # Header row only (or nothing): no data for this season.
        return None

    headers = [th.text.strip() for th in tr_box[0].find_all('th')]
    values = []
    for td in tr_box[1].find_all('td'):
        text = td.text.strip()
        try:
            values.append(float(text))
        except ValueError:
            # Non-numeric cell: keep the raw text.
            values.append(text)

    if len(headers) != len(values):
        return None

    composition = dict(zip(headers, values))
    composition.pop('상대팀')
    return composition

# Pitch-type lookup by season and ID (pitch speed).
def pitch_speed(id, year):
    """Crawl the per-pitch speed table of player ``id`` for ``year``.

    The first table on the page is used: its first row supplies the headers
    and its second row the values. Cells that parse as numbers become
    ``float``; any other cell is recorded as ``0``.

    Args:
        id: Player number used in the URL.
        year: Season used in the URL.

    Returns:
        dict | None: Header -> value mapping with the '상대팀' column
        removed, or ``None`` when the page has no table, the table has no
        data row, or the header and value counts differ.

    Raises:
        KeyError: If the table has no '상대팀' column.
    """
    # Crawl the page.
    url = f'https://statiz.sporki.com/player/?m=analysis&p_no={id}&pos=pitching&year={year}&si1=1&si2=2'
    soup = get_soup(url)

    # Read the speed table.
    table = soup.find('table')
    if table is None:
        return None
    tr_box = table.find_all('tr')
    if len(tr_box) < 2:
        # Header row only (or nothing): no data for this season.
        return None

    headers = [th.text.strip() for th in tr_box[0].find_all('th')]
    values = []
    for td in tr_box[1].find_all('td'):
        try:
            values.append(float(td.text.strip()))
        except ValueError:
            # Non-numeric cell: recorded as 0.
            values.append(0)

    if len(headers) != len(values):
        return None

    composition = dict(zip(headers, values))
    composition.pop('상대팀')
    return composition

# WAR lookup per season for a player ID.
def pitch_stats(id):
    """Crawl the yearly pitching table of player ``id``.

    For each body row of the first table, the first cell is read as the
    year and the last cell as the value stored for it (labelled WAR by the
    original code). Rows that are empty or whose first cell is not a number
    are skipped, so one such row no longer discards the whole player.

    Args:
        id: Player number used in the URL.

    Returns:
        dict[int, float | str]: Year -> last-cell value; the value is a
        ``float`` when it parses as one, otherwise the stripped text. If a
        year appears in several rows, the last row wins.

    Raises:
        AttributeError: If the page has no table or the table has no
            ``<tbody>``.
    """
    # Crawl the page.
    url = f'https://statiz.sporki.com/player/?m=year&m2=pitching&m3=default&p_no={id}&lt=10100&gc='
    soup = get_soup(url)

    # WAR
    table = soup.find('table')
    tbody = table.find('tbody')
    stats = {}
    for tr in tbody.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        if not cells:
            continue
        try:
            year = int(float(cells[0]))
        except (ValueError, OverflowError):
            # Not a season row; skip it.
            continue
        try:
            war = float(cells[-1])
        except ValueError:
            war = cells[-1]
        stats[year] = war
    return stats

# 시즌별 선수 정보 데이터 CSV 파일 생성

In [None]:
# Per-season stats: load the cached CSV if present, otherwise crawl it.
PATH_PLAYER_STAT_CSV = "csv_data/player_stats.csv"
if os.path.exists(PATH_PLAYER_STAT_CSV):
    baseball_data = reading_csv(PATH_PLAYER_STAT_CSV)
else:
    # Create the output directory up front so the save at the end cannot
    # fail after the whole crawl has already been done.
    os.makedirs(os.path.dirname(PATH_PLAYER_STAT_CSV) or '.', exist_ok=True)

    # One dict per (player, season). Rows are keyed by column name so
    # seasons with different column sets still line up in the DataFrame.
    season_rows = []
    # Visit every player.
    for player in player_infos:
        # ID, first and last season, clamped to the 2013-2023 window.
        id = player['ID']
        begin, end = max(player['BEGIN'], 2013), min(player['END'], 2023)
        # NOTE(review): `hand` is read here but never written to the output
        # rows, and neither is the player ID - confirm this is intended.
        hand = player['THROWING_HAND']
        try:
            stats = pitch_stats(id)
        except Exception as e:
            print(f'Skipping player {id}: {e}')
            continue

        # Visit every season.
        for year in range(begin, end + 1):
            # Check the WAR entry first: without it the row is discarded, so
            # the two page loads below would be wasted.
            if year not in stats:
                print(f'No WAR entry for player {id} in {year}')
                continue

            try:
                composition = pitch_composition(id, year)
                speed = pitch_speed(id, year)
            except Exception as e:
                print(e)
                continue
            if composition is None or speed is None:
                print(f'No pitch data for player {id} in {year}')
                continue

            row = {'YEAR': year}
            # Pitch composition.
            row.update({
                'RATE_' + head: value
                for head, value in composition.items()
                if head != '시즌'
            })
            # Speed per pitch type.
            row.update({
                'SPEED_' + head: value
                for head, value in speed.items()
                if head != '시즌'
            })
            # WAR
            row['WAR'] = stats[year]

            print(list(row))
            print(list(row.values()))
            season_rows.append(row)

    # Save.
    baseball_data = pd.DataFrame(season_rows)
    if season_rows:
        baseball_data.to_csv(PATH_PLAYER_STAT_CSV, index=False)
    else:
        # Do not cache an empty file: it would be loaded on the next run
        # instead of triggering a new crawl.
        print('No season data collected; nothing saved.')

baseball_data

