In [44]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

from bs4 import BeautifulSoup
import io

import time

import pandas as pd
import numpy as np


In [131]:
def extract_data(driver, year):
    """Parse the table shown on the driver's current page into one DataFrame.

    The first two HTML tables found in ``driver.page_source`` are joined side
    by side (column-wise). The ``순위`` column is renamed to ``연도`` and every
    row of it is overwritten with ``year``.

    Args:
        driver: Object exposing the rendered page HTML as ``page_source``
            (a selenium WebDriver in this notebook).
        year: Value written into the ``연도`` column of every returned row.

    Returns:
        The combined table, or an empty ``pd.DataFrame()`` when the table has
        no rows or its first ``선수`` cell is the '데이터가 없습니다.' placeholder.
    """
    dfs = pd.read_html(io.StringIO(driver.page_source))
    df = pd.concat([dfs[0], dfs[1]], axis=1)

    # A header-only table has no row 0, so test for emptiness before reading
    # the first cell (the label lookup would otherwise raise KeyError).
    if df.empty or df['선수'].iloc[0] == '데이터가 없습니다.':
        return pd.DataFrame()

    # The rank column is not kept; its slot is reused to store the season.
    df = df.rename(columns={"순위": "연도"})
    df['연도'] = year
    return df

# pitcher data crawling

In [134]:
# 2i tracking website url
url = "https://m.2itracking.com/Default.aspx"

driver = webdriver.Chrome()
# One frame per (season, pitch kind) query; concatenated once after the crawl
# instead of re-copying the growing result on every iteration.
pitcher_frames = []
try:
    # implicitly_wait() does not pause the script: it sets the session-wide
    # timeout that find_element() polls for, so it is set once here rather
    # than after every interaction.
    driver.implicitly_wait(3)

    driver.get(url)
    time.sleep(3)

    toggle_box = driver.find_element(By.XPATH, '//*[@id="btnHamburger"]')
    toggle_box.click()
    time.sleep(3)

    player_rank_box = driver.find_element(By.XPATH, '//*[@id="dvMenu"]/ul/li[2]/a')
    player_rank_box.click()

    for year_idx in range(1, 4):
        # Drop-down entries 1..3 are labelled 2023..2021 here, i.e. the season
        # list is assumed to start at 2024 -- TODO confirm against the live page.
        year = 2024 - year_idx
        # select year
        year_box = Select(driver.find_element(By.XPATH, '//*[@id="ddlSeason"]'))
        year_box.select_by_index(year_idx)

        team_box = Select(driver.find_element(By.XPATH, '//*[@id="ddlTeam"]'))
        team_box.select_by_index(1) # all teams

        # Query drop-down entries 2..11 one at a time (one pitch kind each);
        # entries without data come back from extract_data() as empty frames.
        for pitch_kind_idx in range(2, 12):
            pitch_kind_box = Select(driver.find_element(By.XPATH, '//*[@id="ddlPitKind"]'))
            pitch_kind_box.select_by_index(pitch_kind_idx)

            pitch_count_box = Select(driver.find_element(By.XPATH, '//*[@id="ddlPitchCount"]'))
            pitch_count_box.select_by_index(3) # pitched the ball more than 100 times

            # find the pitcher data
            search_box = driver.find_element(By.XPATH, '//*[@id="btnSearch"]')
            search_box.click()
            time.sleep(3)  # give the result table time to render before parsing

            pitcher_frames.append(extract_data(driver, year))
finally:
    # Always close the browser, even when a lookup or parse step fails.
    driver.quit()

# Skip the empty "no data" frames and stack the rest row-wise; the per-query
# row labels are kept, exactly as repeated concatenation in the loop would.
non_empty_pitcher_frames = [frame for frame in pitcher_frames if not frame.empty]
final_extracted_pitcher_df = (
    pd.concat(non_empty_pitcher_frames, axis=0)
    if non_empty_pitcher_frames
    else pd.DataFrame()
)



In [136]:
# Swap the row labels inherited from concatenation for one continuous
# 0..n-1 index (old labels are discarded), then show the frame.
final_extracted_pitcher_df = final_extracted_pitcher_df.reset_index(drop=True)
final_extracted_pitcher_df

Unnamed: 0,연도,선수,팀,구종,구사%,구속,상하 무브먼트,좌우 무브먼트,분당 회전수,좌우 릴리스포인트,상하 릴리스포인트,타구 속도,상하 각도,피안타율,피장타율,땅볼%,라인드라이브%,뜬공%,팝플라이%,강한 타구%
0,2023,김서준,삼성,직구,78.2,141.6,24.9,-24.9,2622.8,-60.3,163.5,134.8,27.2,0.320,0.400,13.0,52.2,4.3,30.4,17.4
1,2023,김동규,키움,직구,76.2,144.4,31.0,-12.1,2510.5,-11.7,184.1,131.3,21.3,0.364,0.636,27.3,9.1,54.5,9.1,18.2
2,2023,김승현,KIA,직구,74.7,144.9,22.7,-3.5,1753.2,-58.9,170.4,129.2,24.2,0.257,0.400,30.8,26.9,19.2,23.1,19.2
3,2023,김기훈,KIA,직구,74.2,143.7,33.7,14.1,2722.3,46.4,180.1,136.1,27.4,0.222,0.278,30.2,19.0,22.2,28.6,22.2
4,2023,김재영,한화,직구,71.7,137.4,6.2,-26.4,1973.6,-90.3,111.3,129.5,21.0,0.370,0.630,25.0,35.0,25.0,15.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,2021,쿠에바스,KT,커터,24.3,138.6,18.5,4.1,1483.2,-53.0,171.6,128.5,15.4,0.289,0.471,35.2,29.5,28.4,6.8,20.5
1636,2021,몽고메리,삼성,커터,23.9,138.2,13.0,10.1,1258.7,76.7,191.2,131.8,16.7,0.164,0.200,34.8,34.8,13.0,17.4,4.3
1637,2021,곽빈,두산,커터,17.7,135.9,12.8,-2.3,998.1,-48.9,178.7,132.2,17.6,0.268,0.439,32.3,35.5,16.1,16.1,22.6
1638,2021,김대우,롯데,커터,15.2,139.0,15.4,-10.6,1435.3,-61.6,169.4,133.5,9.4,0.267,0.267,35.7,42.9,14.3,7.1,14.3


In [137]:
# final_extracted_pitcher_df.to_csv("./datasets/pitcher_more_detailed_stas.csv")

In [145]:
# Replace every space in the column labels with an underscore.
new_col_name_list = [
    col_name.replace(" ", "_")
    for col_name in final_extracted_pitcher_df.columns
]
final_extracted_pitcher_df.columns = new_col_name_list

In [155]:
# Rows whose pitch type ('구종') is '직구', renumbered from 0.
fast_ball_data = (
    final_extracted_pitcher_df
    .loc[final_extracted_pitcher_df['구종'] == '직구']
    .reset_index(drop=True)
)
# The first four columns keep their names; every later column gets the
# pitch-type prefix.
fast_ball_data.columns = [
    label if position < 4 else "직구_" + label
    for position, label in enumerate(fast_ball_data.columns)
]
fast_ball_data

Unnamed: 0,연도,선수,팀,구종,직구_구사%,직구_구속,직구_상하_무브먼트,직구_좌우_무브먼트,직구_분당_회전수,직구_좌우_릴리스포인트,직구_상하_릴리스포인트,직구_타구_속도,직구_상하_각도,직구_피안타율,직구_피장타율,직구_땅볼%,직구_라인드라이브%,직구_뜬공%,직구_팝플라이%,직구_강한_타구%
0,2023,김서준,삼성,직구,78.2,141.6,24.9,-24.9,2622.8,-60.3,163.5,134.8,27.2,0.320,0.400,13.0,52.2,4.3,30.4,17.4
1,2023,김동규,키움,직구,76.2,144.4,31.0,-12.1,2510.5,-11.7,184.1,131.3,21.3,0.364,0.636,27.3,9.1,54.5,9.1,18.2
2,2023,김승현,KIA,직구,74.7,144.9,22.7,-3.5,1753.2,-58.9,170.4,129.2,24.2,0.257,0.400,30.8,26.9,19.2,23.1,19.2
3,2023,김기훈,KIA,직구,74.2,143.7,33.7,14.1,2722.3,46.4,180.1,136.1,27.4,0.222,0.278,30.2,19.0,22.2,28.6,22.2
4,2023,김재영,한화,직구,71.7,137.4,6.2,-26.4,1973.6,-90.3,111.3,129.5,21.0,0.370,0.630,25.0,35.0,25.0,15.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,2021,브룩스,KIA,직구,18.1,148.5,23.5,-17.9,2292.3,-65.4,187.9,135.3,13.6,0.347,0.429,34.4,37.5,21.9,6.3,25.0
570,2021,정찬헌,키움,직구,12.0,138.3,23.9,-21.8,2381.1,-23.4,173.2,139.7,22.7,0.347,0.600,23.4,42.2,14.1,20.3,25.0
571,2021,소형준,KT,직구,10.6,141.0,23.3,-13.3,2033.7,-38.2,171.4,140.6,13.2,0.375,0.600,48.4,22.6,16.1,12.9,32.3
572,2021,고영표,KT,직구,10.6,134.8,-3.4,-23.7,1729.1,-75.8,109.5,139.9,1.8,0.284,0.343,62.8,30.2,4.7,2.3,32.6


In [156]:
# Write the '직구' table to disk (to_csv keeps the row index by default).
fast_ball_data.to_csv(path_or_buf="./datasets/pitcher_fast_ball_tracking_data.csv")

In [158]:
# Rows whose pitch type ('구종') is '커브', renumbered from 0.
curve_data = (
    final_extracted_pitcher_df
    .loc[final_extracted_pitcher_df['구종'] == '커브']
    .reset_index(drop=True)
)
# The first four columns keep their names; every later column gets the
# pitch-type prefix.
curve_data.columns = [
    label if position < 4 else "커브_" + label
    for position, label in enumerate(curve_data.columns)
]
curve_data

Unnamed: 0,연도,선수,팀,구종,커브_구사%,커브_구속,커브_상하_무브먼트,커브_좌우_무브먼트,커브_분당_회전수,커브_좌우_릴리스포인트,커브_상하_릴리스포인트,커브_타구_속도,커브_상하_각도,커브_피안타율,커브_피장타율,커브_땅볼%,커브_라인드라이브%,커브_뜬공%,커브_팝플라이%,커브_강한_타구%
0,2023,우규민,삼성,커브,42.6,116.5,0.0,14.4,912.4,-99.8,109.1,128.8,27.5,0.226,0.345,19.4,25.8,27.4,27.4,6.5
1,2023,양현,키움,커브,41.6,118.1,11.0,11.2,1041.9,-94.0,95.2,131.6,22.1,0.333,0.462,22.6,38.7,19.4,19.4,22.6
2,2023,한현희,롯데,커브,37.4,126.3,-0.6,7.8,624.7,-96.0,132.5,130.7,20.5,0.244,0.325,31.0,31.0,23.0,15.0,11.0
3,2023,이채호,KT,커브,36.4,116.2,8.2,20.2,1364.4,-67.6,78.7,126.3,25.5,0.138,0.345,22.7,27.3,36.4,13.6,13.6
4,2023,신정락,롯데,커브,33.9,117.6,5.0,24.7,1556.1,-73.1,117.0,119.5,34.2,0.139,0.389,15.0,10.0,45.0,30.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,2021,이태양,SSG,커브,6.5,113.9,-14.8,7.7,1024.6,-21.7,199.9,125.8,20.1,0.154,0.231,18.2,36.4,36.4,9.1,0.0
192,2021,원태인,삼성,커브,6.0,115.6,-6.0,3.9,597.3,-33.3,182.6,132.3,20.2,0.303,0.515,41.4,24.1,13.8,20.7,13.8
193,2021,백정현,삼성,커브,6.0,114.3,-3.8,-3.6,575.7,53.5,182.0,134.4,18.1,0.387,0.871,26.9,34.6,34.6,3.8,34.6
194,2021,스트레일리,롯데,커브,5.9,122.2,-18.1,15.3,1545.5,-72.3,190.6,134.7,12.5,0.167,0.367,42.9,28.6,21.4,7.1,28.6


In [159]:
# Write the '커브' table to disk (to_csv keeps the row index by default).
curve_data.to_csv(path_or_buf="./datasets/pitcher_curve_tracking_data.csv")

In [160]:
# Rows whose pitch type ('구종') is '슬라이더', renumbered from 0.
slider_data = (
    final_extracted_pitcher_df
    .loc[final_extracted_pitcher_df['구종'] == '슬라이더']
    .reset_index(drop=True)
)
# The first four columns keep their names; every later column gets the
# pitch-type prefix.
slider_data.columns = [
    label if position < 4 else "슬라이더_" + label
    for position, label in enumerate(slider_data.columns)
]
slider_data

Unnamed: 0,연도,선수,팀,구종,슬라이더_구사%,슬라이더_구속,슬라이더_상하_무브먼트,슬라이더_좌우_무브먼트,슬라이더_분당_회전수,슬라이더_좌우_릴리스포인트,슬라이더_상하_릴리스포인트,슬라이더_타구_속도,슬라이더_상하_각도,슬라이더_피안타율,슬라이더_피장타율,슬라이더_땅볼%,슬라이더_라인드라이브%,슬라이더_뜬공%,슬라이더_팝플라이%,슬라이더_강한_타구%
0,2023,이준영,KIA,슬라이더,73.1,131.1,8.3,-0.2,674.5,31.6,185.5,127.5,24.6,0.218,0.267,22.6,38.7,17.7,21.0,9.7
1,2023,임준섭,SSG,슬라이더,54.4,127.3,-1.2,-3.3,430.7,36.8,192.1,137.5,6.8,0.296,0.338,51.3,35.9,12.8,0.0,25.6
2,2023,고효준,SSG,슬라이더,51.7,129.9,2.2,-0.5,443.0,51.9,172.8,133.9,19.1,0.265,0.345,31.0,31.0,26.8,11.3,18.3
3,2023,김영현,KT,슬라이더,49.4,130.4,11.4,5.3,946.7,-31.5,167.6,136.3,17.6,0.200,0.277,31.4,28.6,31.4,8.6,20.0
4,2023,진해수,LG,슬라이더,47.3,126.7,2.7,-3.0,525.1,55.5,194.2,134.8,18.6,0.267,0.367,23.5,47.1,23.5,5.9,17.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,2021,미란다,두산,슬라이더,7.0,130.4,10.3,3.8,844.1,33.2,187.0,135.1,18.7,0.233,0.267,30.0,35.0,20.0,15.0,20.0
388,2021,고영표,KT,슬라이더,6.6,127.8,7.0,-4.8,742.4,-77.6,107.4,126.2,20.1,0.267,0.467,31.8,22.7,31.8,13.6,9.1
389,2021,김민우,한화,슬라이더,6.5,125.9,15.9,8.3,1288.0,-33.6,187.2,130.9,26.9,0.209,0.395,17.9,28.6,35.7,17.9,14.3
390,2021,요키시,키움,슬라이더,6.1,136.8,16.5,7.1,1361.8,68.3,177.8,137.4,13.6,0.379,0.793,42.9,28.6,23.8,4.8,38.1


In [162]:
# Write the '슬라이더' table to disk (to_csv keeps the row index by default).
slider_data.to_csv(path_or_buf="./datasets/pitcher_slider_tracking_data.csv")

In [167]:
# Rows whose pitch type ('구종') is '체인지업', renumbered from 0.
changeup_data = (
    final_extracted_pitcher_df
    .loc[final_extracted_pitcher_df['구종'] == '체인지업']
    .reset_index(drop=True)
)
# The first four columns keep their names; every later column gets the
# pitch-type prefix.
changeup_data.columns = [
    label if position < 4 else "체인지업_" + label
    for position, label in enumerate(changeup_data.columns)
]
changeup_data

Unnamed: 0,연도,선수,팀,구종,체인지업_구사%,체인지업_구속,체인지업_상하_무브먼트,체인지업_좌우_무브먼트,체인지업_분당_회전수,체인지업_좌우_릴리스포인트,체인지업_상하_릴리스포인트,체인지업_타구_속도,체인지업_상하_각도,체인지업_피안타율,체인지업_피장타율,체인지업_땅볼%,체인지업_라인드라이브%,체인지업_뜬공%,체인지업_팝플라이%,체인지업_강한_타구%
0,2023,주권,KT,체인지업,56.0,126.2,17.6,-18.2,1723.6,-37.2,165.3,131.3,19.1,0.204,0.255,28.0,40.0,20.0,12.0,13.3
1,2023,엄상백,KT,체인지업,53.5,128.5,12.2,-25.7,1938.9,-64.4,143.4,132.1,22.7,0.256,0.361,30.6,26.8,22.9,19.7,12.7
2,2023,이재학,NC,체인지업,46.1,121.6,6.7,-29.1,1906.0,-61.0,164.3,130.6,23.9,0.178,0.297,24.4,36.0,19.8,19.8,11.6
3,2023,신민혁,NC,체인지업,41.1,126.4,19.2,-23.9,2040.8,-21.3,174.8,126.1,27.3,0.230,0.375,23.0,22.2,34.9,19.8,11.9
4,2023,김동혁,키움,체인지업,40.0,120.9,-1.9,-29.0,1842.6,-71.1,115.0,135.5,13.6,0.265,0.338,39.5,36.8,10.5,13.2,18.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,2021,켈리,LG,체인지업,8.3,132.4,17.6,-21.9,2009.8,-42.3,190.8,130.5,11.4,0.225,0.250,34.5,34.5,24.1,6.9,13.8
184,2021,최민준,SSG,체인지업,7.2,133.4,21.8,-16.2,1953.4,-11.2,176.7,138.7,12.8,0.333,0.433,26.3,52.6,10.5,10.5,26.3
185,2021,미란다,두산,체인지업,6.9,131.1,26.9,21.0,2384.9,31.8,186.6,141.2,25.5,0.294,0.412,30.8,15.4,15.4,38.5,15.4
186,2021,안우진,키움,체인지업,5.8,133.7,23.5,-20.8,2209.1,-32.4,173.3,120.2,4.0,0.103,0.103,50.0,50.0,0.0,0.0,0.0


In [168]:
# Write the '체인지업' table to disk (to_csv keeps the row index by default).
changeup_data.to_csv(path_or_buf="./datasets/pitcher_changeup_tracking_data.csv")

In [169]:
# Rows whose pitch type ('구종') is '포크', renumbered from 0.
fork_data = (
    final_extracted_pitcher_df
    .loc[final_extracted_pitcher_df['구종'] == '포크']
    .reset_index(drop=True)
)
# The first four columns keep their names; every later column gets the
# pitch-type prefix.
fork_data.columns = [
    label if position < 4 else "포크_" + label
    for position, label in enumerate(fork_data.columns)
]
fork_data

Unnamed: 0,연도,선수,팀,구종,포크_구사%,포크_구속,포크_상하_무브먼트,포크_좌우_무브먼트,포크_분당_회전수,포크_좌우_릴리스포인트,포크_상하_릴리스포인트,포크_타구_속도,포크_상하_각도,포크_피안타율,포크_피장타율,포크_땅볼%,포크_라인드라이브%,포크_뜬공%,포크_팝플라이%,포크_강한_타구%
0,2023,서진용,SSG,포크,49.4,129.4,5.1,-14.9,1125.5,-36.0,183.0,135.5,16.0,0.213,0.250,36.4,36.4,18.2,9.1,28.6
1,2023,조민석,NC,포크,47.9,133.7,9.0,-17.7,1425.9,-40.1,170.9,141.9,13.3,0.259,0.352,31.8,45.5,13.6,9.1,45.5
2,2023,구승민,롯데,포크,45.6,132.1,10.2,-23.5,1812.8,-44.4,171.4,132.3,18.4,0.268,0.330,30.2,33.3,23.8,12.7,27.0
3,2023,이용찬,NC,포크,45.2,129.8,2.3,-20.2,1408.1,-57.1,173.2,131.8,14.4,0.267,0.376,43.4,30.2,18.9,7.5,17.0
4,2023,김진성,LG,포크,44.3,125.5,13.1,-15.1,1381.2,1.9,198.3,130.3,19.9,0.143,0.188,31.6,31.6,19.3,17.5,15.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,2021,박세웅,롯데,포크,10.9,130.6,6.4,-11.1,1003.8,-23.6,170.5,123.0,15.2,0.172,0.184,39.0,26.8,24.4,9.8,9.8
117,2021,장현식,KIA,포크,10.8,138.0,16.6,-13.7,1582.9,-42.1,179.5,129.5,12.5,0.120,0.120,50.0,25.0,8.3,16.7,16.7
118,2021,가빌리오,SSG,포크,9.2,130.5,14.6,-19.6,1731.6,-20.6,178.3,133.9,13.4,0.314,0.371,33.3,37.0,25.9,3.7,25.9
119,2021,루친스키,NC,포크,9.0,135.7,10.5,-13.4,1306.2,-73.0,172.3,132.6,13.6,0.226,0.290,48.6,20.0,20.0,11.4,22.9


In [170]:
# Write the '포크' table to disk (to_csv keeps the row index by default).
fork_data.to_csv(path_or_buf="./datasets/pitcher_fork_tracking_data.csv")

In [171]:
# Rows whose pitch type ('구종') is '투심', renumbered from 0.
two_seam_data = (
    final_extracted_pitcher_df
    .loc[final_extracted_pitcher_df['구종'] == '투심']
    .reset_index(drop=True)
)
# The first four columns keep their names; every later column gets the
# pitch-type prefix.
two_seam_data.columns = [
    label if position < 4 else "투심_" + label
    for position, label in enumerate(two_seam_data.columns)
]
two_seam_data

Unnamed: 0,연도,선수,팀,구종,투심_구사%,투심_구속,투심_상하_무브먼트,투심_좌우_무브먼트,투심_분당_회전수,투심_좌우_릴리스포인트,투심_상하_릴리스포인트,투심_타구_속도,투심_상하_각도,투심_피안타율,투심_피장타율,투심_땅볼%,투심_라인드라이브%,투심_뜬공%,투심_팝플라이%,투심_강한_타구%
0,2023,정우영,LG,투심,77.6,148.2,-2.7,-27.6,2178.4,-78.7,133.2,140.0,9.5,0.310,0.381,45.8,38.6,12.0,3.6,33.7
1,2023,박준표,KIA,투심,73.1,140.4,6.3,-25.3,1933.5,-43.2,146.1,139.9,13.8,0.260,0.377,54.2,25.0,8.3,12.5,37.5
2,2023,곽도규,KIA,투심,72.2,144.6,12.2,31.3,2514.8,79.2,152.3,135.7,11.1,0.273,0.333,37.5,37.5,25.0,0.0,12.5
3,2023,이재익,삼성,투심,71.2,137.4,16.9,20.6,1942.3,44.9,178.1,134.2,15.1,0.272,0.413,38.9,33.3,18.5,9.3,20.4
4,2023,박종훈,SSG,투심,59.1,131.3,-18.8,-13.8,1662.0,-46.1,59.9,139.5,9.3,0.312,0.434,48.0,38.0,12.0,2.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,2021,윤중현,KIA,투심,9.8,133.4,-3.9,-26.6,1871.0,-104.3,121.1,134.9,-0.3,0.300,0.300,80.0,20.0,0.0,0.0,6.7
118,2021,프랑코,롯데,투심,9.8,146.0,20.6,-22.5,2374.0,-51.9,182.0,140.0,21.0,0.260,0.440,30.3,24.2,33.3,12.1,33.3
119,2021,뷰캐넌,삼성,투심,7.6,143.3,22.8,-18.9,2286.1,-56.7,186.2,130.6,12.7,0.360,0.460,39.5,34.2,15.8,10.5,18.4
120,2021,임기영,KIA,투심,5.1,133.7,7.4,-26.3,1929.0,-82.3,141.2,130.9,17.4,0.378,0.541,21.4,53.6,21.4,3.6,25.0


In [172]:
# Write the '투심' table to disk (to_csv keeps the row index by default).
two_seam_data.to_csv(path_or_buf="./datasets/pitcher_two_seam_tracking_data.csv")

In [173]:
# Rows whose pitch type ('구종') is '커터', renumbered from 0.
cutter_data = (
    final_extracted_pitcher_df
    .loc[final_extracted_pitcher_df['구종'] == '커터']
    .reset_index(drop=True)
)
# The first four columns keep their names; every later column gets the
# pitch-type prefix.
cutter_data.columns = [
    label if position < 4 else "커터_" + label
    for position, label in enumerate(cutter_data.columns)
]
cutter_data

Unnamed: 0,연도,선수,팀,구종,커터_구사%,커터_구속,커터_상하_무브먼트,커터_좌우_무브먼트,커터_분당_회전수,커터_좌우_릴리스포인트,커터_상하_릴리스포인트,커터_타구_속도,커터_상하_각도,커터_피안타율,커터_피장타율,커터_땅볼%,커터_라인드라이브%,커터_뜬공%,커터_팝플라이%,커터_강한_타구%
0,2023,파노니,KIA,커터,46.2,138.3,19.7,3.7,1467.1,52.4,163.2,126.4,22.0,0.222,0.285,36.4,21.5,23.4,18.7,18.7
1,2023,뷰캐넌,삼성,커터,37.8,141.3,13.0,1.0,1089.4,-61.5,185.1,131.3,15.9,0.23,0.266,35.4,36.8,17.4,10.4,18.1
2,2023,최민준,SSG,커터,37.2,138.8,16.5,-0.8,1292.8,-44.9,176.8,135.1,18.0,0.34,0.474,28.1,46.9,9.4,15.6,26.6
3,2023,맥카티,SSG,커터,31.5,137.1,13.7,-3.5,1134.7,53.9,178.2,132.6,20.0,0.253,0.319,28.7,34.0,22.3,14.9,20.2
4,2023,쿠에바스,KT,커터,27.7,138.1,16.1,5.0,1314.1,-50.7,170.0,134.1,22.7,0.25,0.327,25.3,28.0,30.7,16.0,20.0
5,2023,신민혁,NC,커터,27.5,135.5,17.0,0.7,1277.2,-30.7,170.3,133.2,23.4,0.238,0.357,23.4,28.7,30.9,17.0,22.3
6,2023,브랜든,두산,커터,27.1,138.0,10.7,-1.6,915.1,64.9,184.5,131.0,16.7,0.268,0.28,29.8,38.3,23.4,8.5,14.9
7,2023,메디나,KIA,커터,24.0,136.4,7.9,3.0,684.3,-72.9,173.5,122.3,21.5,0.196,0.304,28.6,32.1,14.3,25.0,10.7
8,2023,벤자민,KT,커터,23.7,136.1,11.8,5.2,1030.7,71.9,187.7,137.8,19.6,0.224,0.32,35.5,27.6,22.4,14.5,30.3
9,2023,이건욱,SSG,커터,21.3,135.6,15.8,-3.9,1257.1,-18.9,177.5,133.0,20.4,0.217,0.217,41.7,8.3,33.3,16.7,25.0


In [174]:
# Write the '커터' table to disk (to_csv keeps the row index by default).
cutter_data.to_csv(path_or_buf="./datasets/pitcher_cutter_tracking_data.csv")

In [164]:
# Distinct pitch-type labels, in order of first appearance.
pd.unique(final_extracted_pitcher_df['구종'])

array(['직구', '커브', '슬라이더', '체인지업', '포크', '투심', '커터'], dtype=object)

# hitter data crawling

In [183]:
# 2i tracking website url
url = "https://m.2itracking.com/Default.aspx"

driver = webdriver.Chrome()
# One frame per season query; concatenated once after the crawl instead of
# re-copying the growing result on every iteration.
hitter_frames = []
try:
    # implicitly_wait() does not pause the script: it sets the session-wide
    # timeout that find_element() polls for, so it is set once here rather
    # than after every interaction.
    driver.implicitly_wait(3)

    driver.get(url)
    time.sleep(3)

    toggle_box = driver.find_element(By.XPATH, '//*[@id="btnHamburger"]')
    toggle_box.click()
    time.sleep(3)

    player_rank_box = driver.find_element(By.XPATH, '//*[@id="dvMenu"]/ul/li[2]/a')
    player_rank_box.click()

    # select hitter
    hitter_box = driver.find_element(By.XPATH, '//*[@id="contents"]/section/div/ul/li[2]/a')
    hitter_box.click()

    for year_idx in range(1, 4):
        # Drop-down entries 1..3 are labelled 2023..2021 here, i.e. the season
        # list is assumed to start at 2024 -- TODO confirm against the live page.
        year = 2024 - year_idx
        # select year
        year_box = Select(driver.find_element(By.XPATH, '//*[@id="ddlSeason"]'))
        year_box.select_by_index(year_idx)

        team_box = Select(driver.find_element(By.XPATH, '//*[@id="ddlTeam"]'))
        team_box.select_by_index(1) # all teams

        pitch_kind_box = Select(driver.find_element(By.XPATH, '//*[@id="ddlPitKind"]'))
        pitch_kind_box.select_by_index(1) # against every pitch kind

        # Minimum-sample filter for hitters (entry 3 of ddlHitRowCount); the
        # exact threshold is not visible in this notebook -- TODO confirm.
        hit_count_box = Select(driver.find_element(By.XPATH, '//*[@id="ddlHitRowCount"]'))
        hit_count_box.select_by_index(3)

        # find the hitter data
        search_box = driver.find_element(By.XPATH, '//*[@id="btnSearch"]')
        search_box.click()
        time.sleep(3)  # give the result table time to render before parsing

        # Parse the page once; a second identical call only repeats the work.
        hitter_frames.append(extract_data(driver, year))
finally:
    # Always close the browser, even when a lookup or parse step fails.
    driver.quit()

# Skip the empty "no data" frames and stack the rest row-wise; the per-query
# row labels are kept, exactly as repeated concatenation in the loop would.
non_empty_hitter_frames = [frame for frame in hitter_frames if not frame.empty]
final_extracted_hitter_df = (
    pd.concat(non_empty_hitter_frames, axis=0)
    if non_empty_hitter_frames
    else pd.DataFrame()
)


In [184]:
# Swap the row labels inherited from concatenation for one continuous
# 0..n-1 index (old labels are discarded), then show the frame.
final_extracted_hitter_df = final_extracted_hitter_df.reset_index(drop=True)
final_extracted_hitter_df

Unnamed: 0,연도,선수,팀,구종,상대%,타구 속도,상하 각도,타율,장타율,땅볼%,라인드라이브%,뜬공%,팝플라이%,강한 타구%,안타,안타 타구 속도,홈런,홈런 타구 속도,장타,장타 타구 속도
0,2023,강현우,KT,전체,100.0,134.9,16.9,0.194,0.282,32.8,36.2,19.0,12.1,22.4,20,143.1,1,154.5,7,147.6
1,2023,이주형,키움,전체,100.0,137.9,19.7,0.326,0.507,28.6,34.1,24.6,12.7,34.9,70,143.0,6,156.3,23,154.6
2,2023,안권수,롯데,전체,100.0,130.5,16.6,0.272,0.325,30.6,40.0,20.6,8.8,9.4,72,133.0,2,151.0,9,138.5
3,2023,양찬열,두산,전체,100.0,120.6,22.6,0.250,0.321,26.9,28.8,28.8,15.4,9.6,21,129.9,0,0.0,4,140.6
4,2023,러셀,키움,전체,100.0,137.2,18.6,0.286,0.400,30.2,35.3,24.5,10.1,24.5,63,140.7,4,153.1,16,148.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,2021,허경민,두산,전체,100.0,132.7,12.5,0.278,0.365,41.4,32.3,17.7,8.6,21.2,130,138.9,5,147.9,30,143.6
384,2021,박동원,키움,전체,100.0,137.9,18.9,0.245,0.458,34.0,28.5,19.8,17.8,35.6,100,147.1,22,156.7,43,153.4
385,2021,김상수,삼성,전체,100.0,131.5,17.1,0.235,0.301,39.5,27.8,17.9,14.8,11.4,101,137.8,3,154.2,21,145.4
386,2021,이지영,키움,전체,100.0,127.3,8.0,0.276,0.303,47.5,33.1,14.4,5.0,5.5,63,133.9,0,0.0,5,142.2


In [185]:
# Write the hitter table to disk (to_csv keeps the row index by default).
final_extracted_hitter_df.to_csv(path_or_buf="./datasets/hitter_tracking_data.csv")

유의미한 타구 속도 수치의 기준
https://premium.sbs.co.kr/article/pTfAdX8RfJA