In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os
import threading

In [2]:
chromedriver = "./chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

In [3]:
def get_data(year, player_type, data_type):
  """
  Scrapes data from baseball-reference.com
  player_type is either batting or pitching
  data_type indicates whether it is individual or team data
  """
  driver = webdriver.Chrome(chromedriver)
  url = f"https://www.baseball-reference.com/leagues/majors/{year}-standard-{player_type}.shtml"
  driver.get(url)
  if data_type == "player":
    driver.execute_script("window.scrollTo(0, 1500);")
  else:
    driver.execute_script("window.scrollTo(0, 750);")

  driver.find_element_by_xpath(f"//*[@id=\"{data_type}_standard_{player_type}_sh\"]/div/ul/li[1]/span").click()
  # time.sleep(1)
  driver.find_element_by_xpath(f"//*[@id=\"{data_type}_standard_{player_type}_sh\"]/div/ul/li[1]/div/ul/li[3]/button").click()
  soup = BeautifulSoup(driver.page_source, "html.parser")
  text_data = "\n".join(soup.find("pre", id=f"csv_{data_type}_standard_{player_type}").text.split("\n")[4:-1])
  with open(f"data/{data_type}_data/{player_type}{year}.csv", "w") as f:
    f.write(text_data)
  driver.quit()


In [None]:
threads = []
year_ranges = ((1997, 2000), (2000, 2004), (2004, 2008), (2008, 2012), (2012, 2014))
for year_range in year_ranges:
  for year in range(*year_range):
    thread = threading.Thread(target=get_data, args=(year, "batting", "player",))
    threads.append(thread)
    thread.start()

  for thread in threads:
    thread.join()

In [None]:
threads = []
year_ranges = ((1997, 2000), (2000, 2004), (2004, 2008), (2008, 2012), (2012, 2014))
for year_range in year_ranges:
  for year in range(*year_range):
    thread = threading.Thread(target=get_data, args=(year, "pitching", "player", ))
    threads.append(thread)
    thread.start()

  for thread in threads:
    thread.join()

In [None]:
threads = []
year_ranges = ((1997, 2000), (2000, 2003), (2003, 2006), (2006, 2009), (2009, 2012), (2012, 2015), (2015, 2017))
for player_type in ["pitching", "batting"]:
  for year_range in year_ranges:
    for year in range(*year_range):
      thread = threading.Thread(target=get_data, args=(year, player_type, "teams", ))
      threads.append(thread)
      thread.start()

    for thread in threads:
      thread.join()

In [5]:
def get_fa_data(year):
  """
  Scrapes free agent data from baseball-reference.com
  """
  driver = webdriver.Chrome(chromedriver)
  url = f"https://www.baseball-reference.com/leagues/majors/{year}-free-agents.shtml"
  driver.get(url)
  driver.execute_script("window.scrollTo(0, 750);")

  driver.find_element_by_xpath(f"//*[@id=\"fa_signings_sh\"]/div/ul/li[1]/span").click()
  # time.sleep(1)
  driver.find_element_by_xpath(f"//*[@id=\"fa_signings_sh\"]/div/ul/li[1]/div/ul/li[3]/button").click()
  soup = BeautifulSoup(driver.page_source, "html.parser")
  text_data = "\n".join(soup.find("pre", id=f"csv_fa_signings").text.split("\n")[4:-1])
  with open(f"data/fa_data/fa{year}.csv", "w") as f:
    f.write(text_data)
  driver.quit()


In [8]:
threads = []
year_ranges = ((2000, 2004), (2004, 2008), (2008, 2012), (2012, 2016), (2016, 2020), (2020, 2022))
for year_range in year_ranges:
  for year in range(*year_range):
    thread = threading.Thread(target=get_fa_data, args=(year,))
    threads.append(thread)
    thread.start()

  for thread in threads:
    thread.join()

Exception in thread Thread-24:
Traceback (most recent call last):
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-5-37b9f2bc88a6>", line 12, in get_fa_data
    driver.find_element_by_xpath(f"//*[@id=\"fa_signings_sh\"]/div/ul/li[1]/div/ul/li[3]/button").click()
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 80, in click
    self._execute(Command.CLICK_ELEMENT)
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 633, in _execute
    return self._parent.execute(command, params)
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execu

In [22]:
get_fa_data(2014)

In [17]:
def get_fielding_data(year, data_type):
  """
  Scrapes data from baseball-reference.com
  data_type indicates whether it is individual or team data
  """
  driver = webdriver.Chrome(chromedriver)
  url = f"https://www.baseball-reference.com/leagues/majors/{year}-standard-fielding.shtml"
  driver.get(url)
  if data_type == "player":
    driver.execute_script("window.scrollTo(0, 1500);")
  else:
    driver.execute_script("window.scrollTo(0, 750);")
  path_dt = "teams" if data_type == "teams" else "players_players"
  path_pt = "fielding" if data_type == "teams" else "fielding_fielding"

  driver.find_element_by_xpath(f"//*[@id=\"{path_dt}_standard_{path_pt}_sh\"]/div/ul/li[1]/span").click()
  # time.sleep(1)
  driver.find_element_by_xpath(f"//*[@id=\"{path_dt}_standard_{path_pt}_sh\"]/div/ul/li[1]/div/ul/li[3]/button").click()
  soup = BeautifulSoup(driver.page_source, "html.parser")
  text_data = "\n".join(soup.find("pre", id=f"csv_{path_dt}_standard_{path_pt}").text.split("\n")[4:-1])
  with open(f"data/{data_type}_data/fielding{year}.csv", "w") as f:
    f.write(text_data)
  driver.quit()

In [20]:
threads = []
year_ranges = ((1997, 2001), (2001, 2005), (2005, 2009), (2009, 2013), (2013, 2017), (2017, 2021))
for data_type in ["player", "teams"]:
  for year_range in year_ranges:
    for year in range(*year_range):
      thread = threading.Thread(target=get_fielding_data, args=(year, data_type, ))
      threads.append(thread)
      thread.start()

    for thread in threads:
      thread.join()

Exception in thread Thread-39:
Traceback (most recent call last):
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-17-438a3cbd10e5>", line 18, in get_fielding_data
    driver.find_element_by_xpath(f"//*[@id=\"{path_dt}_standard_{path_pt}_sh\"]/div/ul/li[1]/div/ul/li[3]/button").click()
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 80, in click
    self._execute(Command.CLICK_ELEMENT)
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 633, in _execute
    return self._parent.execute(command, params)
  File "/Users/chrisunjae/anaconda3/envs/cs109a/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver

In [21]:
get_fielding_data(2000, "teams")