## 경기결과 데이터 수집

In [1]:
#!pip install selenium
#!pip install bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

### 1. Selenium 사용을 위한 준비

- Chrome Driver 설치 [Link to download](https://sites.google.com/a/chromium.org/chromedriver/downloads)

In [2]:
# 설치한 Chrome driver 경로 (exe파일)
driver = webdriver.Chrome('./chromedriver')

# open website
driver.get('https://www.koreabaseball.com/Schedule/Schedule.aspx')

### 2. Selenium을 이용하여 메뉴 선택
- Year (option value : 2020, 2019, 2018, ... , 2001)
- Month (option value : 01, 02, 03, ... , 12)
- Series (option value : 1 (시범경기), 0,9 (정규시즌), 3,4,5,7 (포스트시즌)

In [3]:
# series가 시범경기 or 포스트시즌 일 경우 month값 없음 (error 발생함)
# 순서는 year → series → month

def setYear(year) :
    year = driver.find_element_by_xpath("//select[@id='ddlYear']/option[@value='" + str(year) + "']")
    year.click()
    
def setMonth(month) :
    month = driver.find_element_by_xpath("//select[@id='ddlMonth']/option[@value='" + str(month) + "']")
    month.click()
    
def setSeries(series) :
    series = driver.find_element_by_xpath("//select[@id='ddlSeries']/option[@value='" + str(series) + "']")
    series.click()

### 3. Data crawling
1. Driver page source 받아오기
2. Column 값 crawling function
3. Data 값 crawling function
4. Crawling

In [4]:
# driver page source 받아오기
html = driver.page_source

soup = BeautifulSoup(html, 'html.parser')
#print(soup.prettify())

In [5]:
# column 값 crawling
def crawlingCol(soup) :
    ths = soup.select('table > thead > tr > th')

    match_col = []
    for th in ths:
        match_col.append(th.text)

    match_col.insert(0,"연도")
    match_col.append("시리즈")
    
    return match_col

In [6]:
# data 값 crawling
def crawlingData(soup,yy,ss) :
    trs = soup.select('table > tbody > tr')

    match_data_all = [] # 전체 데이터

    for tr in trs:
        match_data = [] # match_data 초기화 (1 row)
        
        tds = tr.find_all("td")
        
        for td in tds:
            if len(tds) > 7 : # 데이터가 없을 경우 crawling 안함
                # data crawling
                if td.text != "리뷰" :
                    match_data.append(td.text)
                else : # 선수엔트리 href 추출
                    match_data.append(td.text)
                    #print(td)

        if len(tds) == 8 : # 날짜 데이터가 없을 경우 날짜를 None으로
            match_data.insert(0,None)

        match_data.insert(0,yy)
        match_data.append(ss)
        
        match_data_all.append(match_data) # 2D list
        
    return match_data_all

In [7]:
# crawling
year = ["2018", "2019", "2020"]
month = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
series = ["3,4,5,7", "1", "0,9"]

match_col = []
match_data_all = []

match_col = crawlingCol(soup)

# 초기 메뉴 설정 (미설정시 error발생)
setYear("2019")
setSeries("0,9")
setMonth("10")


for yy in year :
    for ss in series :
        if ss == "0,9" : # 정규시즌
            for mm in month :
                setYear(yy)
                setSeries(ss)
                setMonth(mm)
                
                # driver page source 받아오기
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
                
                for match_data in crawlingData(soup,yy,ss) :
                    match_data_all.append(match_data)
                
        # 2020년은 포스트시즌 & 시범경기 미실신
        elif yy != "2020" : # 시범경기 ("1") & 포스트시즌 ("3,4,5,7")
            
            setYear(yy)
            setSeries(ss)
            
            #driver page source 받아오기
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            
            for match_data in crawlingData(soup,yy,ss) :
                match_data_all.append(match_data)
            

In [8]:
# pandas의 dataframe함수 사용
# pd.DataFrame(columns=columnlist, data=datalist)
df = pd.DataFrame(columns = match_col, data = match_data_all)

df

Unnamed: 0,연도,날짜,시간,경기,게임센터,하이라이트,TV,라디오,구장,비고,시리즈
0,2018,10.03(목),14:00,NC1vs3LG,리뷰,하이라이트,S-T,,잠실,-,3457
1,2018,10.05(토),,,,,,,,이동일,3457
2,2018,10.06(일),14:00,LG0vs1키움,리뷰,하이라이트,M-T,,고척,-,3457
3,2018,10.07(월),18:30,LG4vs5키움,리뷰,하이라이트,M-T,,고척,-,3457
4,2018,10.08(화),,,,,,,,이동일,3457
...,...,...,...,...,...,...,...,...,...,...,...
2475,2020,,18:30,KIAvs롯데,,,,,사직,-,09
2476,2020,,18:30,NCvs삼성,,,,,대구,-,09
2477,2020,,18:30,KTvs한화,,,,,대전,-,09
2478,2020,09,,,,,,,,,


### 4. CSV 파일로 저장

In [9]:
df.to_csv("match_data.csv")