In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the base URLs for the different stat categories
base_urls = {
    'batting_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=bat&year={}&group=College',
    'pitching_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=pitch&year={}&group=College',
    'fielding_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year={}&group=College',
    'team_batting': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year={}&group=College',
    'team_pitching': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year={}&group=College',
    'drafted_players': 'https://www.baseball-reference.com/draft/?query_type=year_round&year_ID={}&draft_round=1&draft_type=college'
}

# Function to scrape data from a specific URL and save it as a CSV file
def scrape_data(url, filename):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table with the class 'stats_table'
    table = soup.find('table', {'class': 'stats_table'})
    if table:
        # Extract headers
        headers = [th.text.strip() for th in table.find_all('th')]
        
        # Debugging: Print the number of columns in headers
        print(f'Number of columns in headers: {len(headers)}')

        # Extract rows
        rows = []
        for tr in table.find_all('tr')[1:]:  # Skip the header row
            cols = tr.find_all('td')
            if cols:
                rows.append([td.text.strip() for td in cols])

        # Debugging: Print the number of columns in the first row
        if rows:
            print(f'Number of columns in first row: {len(rows[0])}')

        # Adjust the number of columns if there's a mismatch
        if len(headers[1:]) != len(rows[0]):  # headers[1:] skips the empty first header
            headers = headers[:1] + headers[1:len(rows[0])+1]  # Match header length to row length

        # Create a DataFrame and save it as a CSV file
        df = pd.DataFrame(rows, columns=headers[1:])  # Avoid first empty header
        df.to_csv(f'{filename}.csv', index=False)
        print(f'{filename}.csv saved.')
    else:
        print(f'No table found at {url}')

# Function to scrape data for a range of years
def scrape_data_for_years(start_year, end_year):
    for year in range(start_year, end_year + 1):
        print(f'Scraping data for {year}...')

        # Scrape data for each category
        scrape_data(base_urls['batting_leaders'].format(year), f'college_batting_leaders_{year}')
        scrape_data(base_urls['pitching_leaders'].format(year), f'college_pitching_leaders_{year}')
        scrape_data(base_urls['fielding_leaders'].format(year), f'college_fielding_leaders_{year}')
        scrape_data(base_urls['team_batting'].format(year), f'college_team_batting_{year}')
        scrape_data(base_urls['team_pitching'].format(year), f'college_team_pitching_{year}')
        scrape_data(base_urls['drafted_players'].format(year), f'college_drafted_players_{year}')

# Specify the range of years (e.g., from 2015 to 2024)
scrape_data_for_years(2015, 2024)


In [None]:
# Different version of the scraper above that ignores the 2020 season entirely due to issues with collecting data after that year 

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URLs
base_urls = {
    'batting_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=bat&year={}&group=College',
    'pitching_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=pitch&year={}&group=College',
    'fielding_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year={}&group=College',
    'team_batting': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year={}&group=College',
    'team_pitching': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year={}&group=College',
    'drafted_players': 'https://www.baseball-reference.com/draft/?query_type=year_round&year_ID={}&draft_round=1&draft_type=college'
}

def scrape_data(url, filename):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table')
        
        if not table:
            print(f'No table found at {url}')
            return
        
        headers = [th.get_text() for th in table.find_all('th')]
        rows = []
        for row in table.find_all('tr'):
            rows.append([td.get_text() for td in row.find_all('td')])
        
        # Check if the headers and data match up
        if len(headers) > 1 and len(headers) != len(rows[0]):
            print(f"Number of columns in headers: {len(headers)}")
            print(f"Number of columns in first row: {len(rows[0])}")
        
        # Convert to DataFrame and save as CSV
        df = pd.DataFrame(rows, columns=headers)
        df.to_csv(f'{filename}.csv', index=False)
        print(f'{filename}.csv saved.')
        
    except requests.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'An error occurred: {err}')

def scrape_data_for_years(start_year, end_year):
    for year in range(start_year, end_year + 1):
        # Skip the 2020 season entirely
        if year == 2020:
            print(f'Skipping the 2020 season...')
            continue
        
        print(f'Scraping data for {year}...')
        
        # Scrape each category
        scrape_data(base_urls['batting_leaders'].format(year), f'college_batting_leaders_{year}')
        scrape_data(base_urls['pitching_leaders'].format(year), f'college_pitching_leaders_{year}')
        scrape_data(base_urls['fielding_leaders'].format(year), f'college_fielding_leaders_{year}')
        scrape_data(base_urls['team_batting'].format(year), f'college_team_batting_{year}')
        scrape_data(base_urls['team_pitching'].format(year), f'college_team_pitching_{year}')
        scrape_data(base_urls['drafted_players'].format(year), f'college_drafted_players_{year}')

# Run the scraper for 2015-2019 and 2021-2024, skipping 2020
scrape_data_for_years(2015, 2024)


In [None]:
# Another adjusted version that deals with the "too many requests error", basically adding in a delay between requests

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Base URLs
base_urls = {
    'batting_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=bat&year={}&group=College',
    'pitching_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=pitch&year={}&group=College',
    'fielding_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year={}&group=College',
    'team_batting': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year={}&group=College',
    'team_pitching': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year={}&group=College',
    'drafted_players': 'https://www.baseball-reference.com/draft/?query_type=year_round&year_ID={}&draft_round=1&draft_type=college'
}

def scrape_data(url, filename):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table')
        
        if not table:
            print(f'No table found at {url}')
            return
        
        headers = [th.get_text() for th in table.find_all('th')]
        rows = []
        for row in table.find_all('tr'):
            rows.append([td.get_text() for td in row.find_all('td')])
        
        # Convert to DataFrame and save as CSV
        df = pd.DataFrame(rows, columns=headers)
        df.to_csv(f'{filename}.csv', index=False)
        print(f'{filename}.csv saved.')
        
    except requests.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'An error occurred: {err}')

def scrape_data_for_years(start_year, end_year, delay=5):
    for year in range(start_year, end_year + 1):
        if year == 2020:
            print(f'Skipping the 2020 season...')
            continue
        
        print(f'Scraping data for {year}...')
        
        scrape_data(base_urls['batting_leaders'].format(year), f'college_batting_leaders_{year}')
        time.sleep(delay)
        
        scrape_data(base_urls['pitching_leaders'].format(year), f'college_pitching_leaders_{year}')
        time.sleep(delay)
        
        scrape_data(base_urls['fielding_leaders'].format(year), f'college_fielding_leaders_{year}')
        time.sleep(delay)
        
        scrape_data(base_urls['team_batting'].format(year), f'college_team_batting_{year}')
        time.sleep(delay)
        
        scrape_data(base_urls['team_pitching'].format(year), f'college_team_pitching_{year}')
        time.sleep(delay)
        
        scrape_data(base_urls['drafted_players'].format(year), f'college_drafted_players_{year}')
        time.sleep(delay)

# Run the scraper for 2015-2019 and 2021-2024, with a 5-second delay between requests
scrape_data_for_years(2015, 2024, delay=5)


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Base URLs
base_urls = {
    'batting_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=bat&year={}&group=College',
    'pitching_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=pitch&year={}&group=College',
    'fielding_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year={}&group=College',
    'team_batting': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year={}&group=College',
    'team_pitching': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year={}&group=College',
    'drafted_players': 'https://www.baseball-reference.com/draft/?query_type=year_round&year_ID={}&draft_round=1&draft_type=college'
}

# List of user agents to rotate
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Function to scrape data with retry logic and exponential backoff
def scrape_data(url, filename, retries=5, backoff_factor=2):
    attempt = 0
    delay = 5
    
    while attempt < retries:
        try:
            headers = {
                "User-Agent": random.choice(user_agents)
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            
            # If the response is successful, scrape the data
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table')

            if not table:
                print(f'No table found at {url}')
                return
            
            headers = [th.get_text() for th in table.find_all('th')]
            rows = []
            for row in table.find_all('tr'):
                rows.append([td.get_text() for td in row.find_all('td')])
            
            df = pd.DataFrame(rows, columns=headers)
            df.to_csv(f'{filename}.csv', index=False)
            print(f'{filename}.csv saved.')
            break  # Break out of the loop if successful
        
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 429:
                print(f'HTTP 429 error: Too many requests. Retrying in {delay} seconds...')
                time.sleep(delay)
                delay *= backoff_factor  # Exponential backoff
                attempt += 1
            else:
                print(f'HTTP error occurred: {http_err}')
                break
        except Exception as err:
            print(f'An error occurred: {err}')
            break

def scrape_data_for_years(start_year, end_year):
    for year in range(start_year, end_year + 1):
        if year == 2020:
            print(f'Skipping the 2020 season...')
            continue
        
        print(f'Scraping data for {year}...')
        
        scrape_data(base_urls['batting_leaders'].format(year), f'college_batting_leaders_{year}')
        scrape_data(base_urls['pitching_leaders'].format(year), f'college_pitching_leaders_{year}')
        scrape_data(base_urls['fielding_leaders'].format(year), f'college_fielding_leaders_{year}')
        scrape_data(base_urls['team_batting'].format(year), f'college_team_batting_{year}')
        scrape_data(base_urls['team_pitching'].format(year), f'college_team_pitching_{year}')
        scrape_data(base_urls['drafted_players'].format(year), f'college_drafted_players_{year}')

# Run the scraper for 2015-2019 and 2021-2024
scrape_data_for_years(2015, 2024)


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Base URLs
base_urls = {
    'batting_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=bat&year={}&group=College',
    'pitching_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=pitch&year={}&group=College',
    'fielding_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year={}&group=College',
    'team_batting': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year={}&group=College',
    'team_pitching': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year={}&group=College',
    'drafted_players': 'https://www.baseball-reference.com/draft/?query_type=year_round&year_ID={}&draft_round=1&draft_type=college'
}

# List of user agents to rotate
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Function to scrape data with retry logic and exponential backoff
def scrape_data(url, filename, retries=5, backoff_factor=2):
    attempt = 0
    delay = 5
    
    while attempt < retries:
        try:
            headers = {
                "User-Agent": random.choice(user_agents)
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            
            # If the response is successful, scrape the data
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table')

            if not table:
                print(f'No table found at {url}')
                return
            
            headers = [th.get_text() for th in table.find_all('th')]
            rows = []
            for row in table.find_all('tr'):
                rows.append([td.get_text() for td in row.find_all('td')])
            
            df = pd.DataFrame(rows, columns=headers)
            df.to_csv(f'{filename}.csv', index=False)
            print(f'{filename}.csv saved.')
            break  # Break out of the loop if successful
        
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 429:
                print(f'HTTP 429 error: Too many requests. Retrying in {delay} seconds...')
                time.sleep(delay)
                delay *= backoff_factor  # Exponential backoff
                attempt += 1
            else:
                print(f'HTTP error occurred: {http_err}')
                break
        except Exception as err:
            print(f'An error occurred: {err}')
            break

def scrape_data_for_years(start_year, end_year):
    for year in range(start_year, end_year + 1):
        if year == 2020:
            print(f'Skipping the 2020 season...')
            continue
        
        print(f'Scraping data for {year}...')
        
        scrape_data(base_urls['batting_leaders'].format(year), f'college_batting_leaders_{year}')
        scrape_data(base_urls['pitching_leaders'].format(year), f'college_pitching_leaders_{year}')
        scrape_data(base_urls['fielding_leaders'].format(year), f'college_fielding_leaders_{year}')
        scrape_data(base_urls['team_batting'].format(year), f'college_team_batting_{year}')
        scrape_data(base_urls['team_pitching'].format(year), f'college_team_pitching_{year}')
        scrape_data(base_urls['drafted_players'].format(year), f'college_drafted_players_{year}')

# Run the scraper for 2015-2019 and 2021-2024
scrape_data_for_years(2015, 2024)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Base URLs
base_urls = {
    'batting_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=bat&year={}&group=College',
    'pitching_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=pitch&year={}&group=College',
    'fielding_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year={}&group=College',
    'team_batting': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year={}&group=College',
    'team_pitching': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year={}&group=College',
    'drafted_players': 'https://www.baseball-reference.com/draft/?query_type=year_round&year_ID={}&draft_round=1&draft_type=college'
}

# List of user agents to rotate
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# List of proxies to rotate
proxies = [
    "http://proxy1.example.com:8080",
    "http://proxy2.example.com:8080",
    "http://proxy3.example.com:8080"
]

# Function to scrape data with retry logic and exponential backoff
def scrape_data(url, filename, retries=5, backoff_factor=2):
    attempt = 0
    delay = 5
    
    while attempt < retries:
        try:
            headers = {
                "User-Agent": random.choice(user_agents)
            }
            proxy = {
                "http": random.choice(proxies),
                "https": random.choice(proxies)
            }
            response = requests.get(url, headers=headers, proxies=proxy)
            response.raise_for_status()
            
            # If the response is successful, scrape the data
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table')

            if not table:
                print(f'No table found at {url}')
                return
            
            headers = [th.get_text() for th in table.find_all('th')]
            rows = []
            for row in table.find_all('tr'):
                rows.append([td.get_text() for td in row.find_all('td')])
            
            df = pd.DataFrame(rows, columns=headers)
            df.to_csv(f'{filename}.csv', index=False)
            print(f'{filename}.csv saved.')
            break  # Break out of the loop if successful
        
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 429:
                print(f'HTTP 429 error: Too many requests. Retrying in {delay} seconds...')
                time.sleep(delay)
                delay *= backoff_factor  # Exponential backoff
                attempt += 1
            else:
                print(f'HTTP error occurred: {http_err}')
                break
        except Exception as err:
            print(f'An error occurred: {err}')
            break

def scrape_data_for_years(start_year, end_year):
    for year in range(start_year, end_year + 1):
        if year == 2020:
            print(f'Skipping the 2020 season...')
            continue
        
        print(f'Scraping data for {year}...')
        
        scrape_data(base_urls['batting_leaders'].format(year), f'college_batting_leaders_{year}')
        scrape_data(base_urls['pitching_leaders'].format(year), f'college_pitching_leaders_{year}')
        scrape_data(base_urls['fielding_leaders'].format(year), f'college_fielding_leaders_{year}')
        scrape_data(base_urls['team_batting'].format(year), f'college_team_batting_{year}')
        scrape_data(base_urls['team_pitching'].format(year), f'college_team_pitching_{year}')
        scrape_data(base_urls['drafted_players'].format(year), f'college_drafted_players_{year}')
        
        # Random delay between requests to avoid rate limiting
        time.sleep(random.uniform(10, 30))

# Run the scraper for 2015-2019 and 2021-2024
scrape_data_for_years(2015, 2024)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Base URLs
base_urls = {
    'batting_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=bat&year={}&group=College',
    'pitching_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=pitch&year={}&group=College',
    'fielding_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year={}&group=College',
    'team_batting': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year={}&group=College',
    'team_pitching': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year={}&group=College',
    'drafted_players': 'https://www.baseball-reference.com/draft/?query_type=year_round&year_ID={}&draft_round=1&draft_type=college'
}

# List of user agents to rotate
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Function to scrape data with retry logic and exponential backoff
def scrape_data(url, filename, retries=5, backoff_factor=2):
    attempt = 0
    delay = 5
    
    while attempt < retries:
        try:
            headers = {
                "User-Agent": random.choice(user_agents)
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            
            # If the response is successful, scrape the data
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table')

            if not table:
                print(f'No table found at {url}')
                return
            
            headers = [th.get_text() for th in table.find_all('th')]
            rows = []
            for row in table.find_all('tr'):
                rows.append([td.get_text() for td in row.find_all('td')])
            
            df = pd.DataFrame(rows, columns=headers)
            df.to_csv(f'{filename}.csv', index=False)
            print(f'{filename}.csv saved.')
            break  # Break out of the loop if successful
        
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 429:
                print(f'HTTP 429 error: Too many requests. Retrying in {delay} seconds...')
                time.sleep(delay)
                delay *= backoff_factor  # Exponential backoff
                attempt += 1
            else:
                print(f'HTTP error occurred: {http_err}')
                break
        except Exception as err:
            print(f'An error occurred: {err}')
            break

def scrape_data_for_years(start_year, end_year):
    for year in range(start_year, end_year + 1):
        if year == 2020:
            print(f'Skipping the 2020 season...')
            continue
        
        print(f'Scraping data for {year}...')
        
        scrape_data(base_urls['batting_leaders'].format(year), f'college_batting_leaders_{year}')
        scrape_data(base_urls['pitching_leaders'].format(year), f'college_pitching_leaders_{year}')
        scrape_data(base_urls['fielding_leaders'].format(year), f'college_fielding_leaders_{year}')
        scrape_data(base_urls['team_batting'].format(year), f'college_team_batting_{year}')
        scrape_data(base_urls['team_pitching'].format(year), f'college_team_pitching_{year}')
        scrape_data(base_urls['drafted_players'].format(year), f'college_drafted_players_{year}')
        
        # Random delay between requests to avoid rate limiting
        time.sleep(random.uniform(10, 30))

# Run the scraper for 2015-2019 and 2021-2024
scrape_data_for_years(2015, 2024)

Scraping data for 2015...
An error occurred: 130 columns passed, passed data had 29 columns
An error occurred: 136 columns passed, passed data had 35 columns
HTTP error occurred: 500 Server Error: Internal Server Error for url: https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year=2015&group=College
No table found at https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year=2015&group=College
No table found at https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year=2015&group=College
No table found at https://www.baseball-reference.com/draft/?query_type=year_round&year_ID=2015&draft_round=1&draft_type=college


KeyboardInterrupt: 

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Base URLs
base_urls = {
    'batting_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=bat&year={}&group=College',
    'pitching_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=pitch&year={}&group=College',
    'fielding_leaders': 'https://www.baseball-reference.com/register/leader.cgi?request=1&type=field&year={}&group=College',
    'team_batting': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=bat&year={}&group=College',
    'team_pitching': 'https://www.baseball-reference.com/register/team.cgi?request=1&type=pitch&year={}&group=College',
    'drafted_players': 'https://www.baseball-reference.com/draft/?query_type=year_round&year_ID={}&draft_round=1&draft_type=college'
}

# List of user agents to rotate
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Function to scrape data with retry logic and exponential backoff
def scrape_data(url, filename, retries=5, backoff_factor=3):
    attempt = 0
    delay = 10  # Start with a longer initial delay
    
    while attempt < retries:
        try:
            headers = {
                "User-Agent": random.choice(user_agents)
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            
            # If the response is successful, scrape the data
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table')

            if not table:
                print(f'No table found at {url}')
                return
            
            headers = [th.get_text() for th in table.find_all('th')]
            rows = []
            for row in table.find_all('tr'):
                rows.append([td.get_text() for td in row.find_all('td')])
            
            # Handle cases where the number of columns might not match
            max_columns = max(len(row) for row in rows)
            headers = headers[:max_columns]
            rows = [row[:max_columns] for row in rows]
            
            df = pd.DataFrame(rows, columns=headers)
            df.to_csv(f'{filename}.csv', index=False)
            print(f'{filename}.csv saved.')
            break  # Break out of the loop if successful
        
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 429:
                print(f'HTTP 429 error: Too many requests. Retrying in {delay} seconds...')
                time.sleep(delay)
                delay *= backoff_factor  # Exponential backoff
                attempt += 1
            elif response.status_code == 500:
                print(f'HTTP 500 error: Internal Server Error. Retrying in {delay} seconds...')
                time.sleep(delay)
                delay *= backoff_factor  # Exponential backoff
                attempt += 1
            else:
                print(f'HTTP error occurred: {http_err}')
                break
        except Exception as err:
            print(f'An error occurred: {err}')
            break

def scrape_data_for_years(start_year, end_year):
    for year in range(start_year, end_year + 1):
        if year == 2020:
            print(f'Skipping the 2020 season...')
            continue
        
        print(f'Scraping data for {year}...')
        
        scrape_data(base_urls['batting_leaders'].format(year), f'college_batting_leaders_{year}')
        scrape_data(base_urls['pitching_leaders'].format(year), f'college_pitching_leaders_{year}')
        scrape_data(base_urls['fielding_leaders'].format(year), f'college_fielding_leaders_{year}')
        scrape_data(base_urls['team_batting'].format(year), f'college_team_batting_{year}')
        scrape_data(base_urls['team_pitching'].format(year), f'college_team_pitching_{year}')
        scrape_data(base_urls['drafted_players'].format(year), f'college_drafted_players_{year}')
        
        # Random delay between requests to avoid rate limiting
        time.sleep(random.uniform(30, 60))

# Run the scraper for 2015-2019 and 2021-2024
scrape_data_for_years(2015, 2024)

Scraping data for 2015...
college_batting_leaders_2015.csv saved.
college_pitching_leaders_2015.csv saved.
HTTP 500 error: Internal Server Error. Retrying in 10 seconds...
HTTP 500 error: Internal Server Error. Retrying in 30 seconds...
HTTP 500 error: Internal Server Error. Retrying in 90 seconds...
HTTP 500 error: Internal Server Error. Retrying in 270 seconds...
