# Best Baseball Team Project
Goal: Scrape player statistics from MLB's website and use them to build the best playoff team in baseball.
Currently, as of June 27, 2024, the best team batting average is .263 and the best team ERA is 3.07. This means that the team with the best batting average gets a hit in 26.3% of its at-bats, and the team with the best ERA (earned run average) allows 3.07 earned runs per nine innings. Of course these numbers can change, but if the current best teams went to the playoffs today, a team that meets the targets below would statistically be better than them. The goal for our team is to win the World Series. With this in mind, we set the following parameters:

Number of Players on Roster: 26
Team Batting Average: .300
Team ERA: 2.50


## Data Scraping Position Players' Data

In [16]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Hitting leaderboard pages to scrape: the landing page comes first, followed
# by pages 2 through 6, which are addressed with a ?page=N query string.
urls = ['https://www.mlb.com/stats/'] + [
    f'https://www.mlb.com/stats/?page={page}' for page in range(2, 7)
]

# ChromeDriverManager().install() supplies the chromedriver path for the
# Service; the resulting Chrome session is the one the scraper drives.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

def scrape_player_page(url):
    """Scrape one page of the hitting leaderboard.

    Loads ``url`` in the module-level Selenium ``driver``, waits up to 10
    seconds for the AVG column cells to be present, then parses the rendered
    page source with BeautifulSoup.

    Names, AVG cells and positions are collected as three separate lists and
    paired up purely by their order on the page.

    Args:
        url: Address of the leaderboard page to load.

    Returns:
        A list of ``(name, avg, position)`` string tuples. If loading, waiting
        or parsing raises, the error is printed and an empty list is returned.
    """
    try:
        # Navigation lives inside the try so that a failed page load is
        # reported and skipped like any other scraping error instead of
        # aborting the whole run.
        driver.get(url)

        # Wait until the AVG column is loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'td[headers="tb-0-header-col14"]'))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        avg_cells = soup.find_all('td', {'headers': 'tb-0-header-col14'})
        name_containers = soup.find_all('a', class_='bui-link')
        position_containers = soup.find_all('div', class_='position-SAxuJGcx')

        # A name is the space-joined text of the link's non-empty "short"
        # spans; links without any such text are not player links and are
        # skipped.
        player_names = []
        for container in name_containers:
            parts = [
                span.text.strip()
                for span in container.find_all('span', class_='short-IiSPVSQp')
            ]
            full_name = ' '.join(part for part in parts if part)
            if full_name:
                player_names.append(full_name)

        player_positions = [pos.text.strip() for pos in position_containers]

        # zip() stops at the shortest list, so rows would otherwise vanish
        # without a trace when a column comes up short; make that visible.
        row_count = len(avg_cells)
        if len(player_names) < row_count or len(player_positions) < row_count:
            print(
                f"Warning: {url} has {row_count} AVG cells but only "
                f"{len(player_names)} names and {len(player_positions)} positions; "
                f"unmatched rows are dropped"
            )

        return [
            (name, avg.text.strip(), pos)
            for name, avg, pos in zip(player_names, avg_cells, player_positions)
        ]
    except Exception as e:
        # Best-effort scraping: report the failing page and keep going.
        print(f"Error scraping {url}: {e}")
        return []

# Scrape every hitting leaderboard page into one flat list of
# (name, avg, position) tuples.
all_players_avg_position = []
try:
    for url in urls:
        players_avg_position = scrape_player_page(url)
        all_players_avg_position.extend(players_avg_position)
finally:
    # Always close the browser, even if a page raises, so no Chrome session
    # is left running after a failed scrape.
    driver.quit()

total_players = len(all_players_avg_position)

# Persist the scraped rows for the analysis step.
with open('players_stats.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Player', 'AVG', 'Position'])  # Write header
    writer.writerows(all_players_avg_position)

print(f"Total number of players: {total_players}")


Total number of players: 146


## Data Scraping Pitchers' Data

In [17]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Pitching leaderboard pages to scrape, each requested with sortState=asc;
# the first page carries no page parameter.
purls = [
    'https://www.mlb.com/stats/pitching?' + query
    for query in (
        'sortState=asc',
        'page=2&sortState=asc',
        'page=3&sortState=asc',
    )
]

# ChromeDriverManager().install() supplies the chromedriver path for the
# Service; the resulting Chrome session is the one the scraper drives.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

def scrape_pitcher_page(url):
    """Scrape one page of the pitching leaderboard.

    Loads ``url`` in the module-level Selenium ``driver``, waits up to 10
    seconds for the ``data-col="4"`` cells (read here as ERA) to be present,
    then parses the rendered page source with BeautifulSoup.

    Names and ERA cells are collected as two separate lists and paired up
    purely by their order on the page.

    Args:
        url: Address of the leaderboard page to load.

    Returns:
        A list of ``(name, era)`` string tuples. If loading, waiting or
        parsing raises, the error is printed and an empty list is returned.
    """
    try:
        # Navigation lives inside the try so that a failed page load is
        # reported and skipped like any other scraping error instead of
        # aborting the whole run.
        driver.get(url)

        # Wait until the ERA column is loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'td[data-col="4"]'))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        era_cells = soup.find_all('td', {'data-col': '4'})
        name_containers = soup.find_all('a', class_='bui-link')

        # A name is the space-joined text of the link's non-empty "short"
        # spans; links without any such text are not player links and are
        # skipped.
        pitcher_names = []
        for container in name_containers:
            parts = [
                span.text.strip()
                for span in container.find_all('span', class_='short-IiSPVSQp')
            ]
            full_name = ' '.join(part for part in parts if part)
            if full_name:
                pitcher_names.append(full_name)

        # zip() stops at the shortest list, so rows would otherwise vanish
        # without a trace when fewer names than ERA cells are found.
        if len(pitcher_names) < len(era_cells):
            print(
                f"Warning: {url} has {len(era_cells)} ERA cells but only "
                f"{len(pitcher_names)} names; unmatched rows are dropped"
            )

        return [
            (name, era.text.strip())
            for name, era in zip(pitcher_names, era_cells)
        ]
    except Exception as e:
        # Best-effort scraping: report the failing page and keep going.
        print(f"Error scraping {url}: {e}")
        return []

# Scrape every pitching leaderboard page into one flat list of
# (name, era) tuples.
all_pitchers_era = []
try:
    for url in purls:
        pitchers_era = scrape_pitcher_page(url)
        all_pitchers_era.extend(pitchers_era)
finally:
    # Always close the browser, even if a page raises, so no Chrome session
    # is left running after a failed scrape.
    driver.quit()

total_pitchers = len(all_pitchers_era)

# Persist the scraped rows for the analysis step.
with open('pitchers_stats.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Pitcher', 'ERA'])  # Write header
    writer.writerows(all_pitchers_era)

print(f"Total number of pitchers: {total_pitchers}")


Total number of pitchers: 75


## Data Analysis to Choose Best Players and Pitchers

In [28]:
import pandas as pd

# Roster split: 13 pitchers + 13 position players fill the 26-man roster.
PITCHER_SLOTS = 13
HITTER_SLOTS = 13

players_df = pd.read_csv('players_stats.csv')
pitchers_df = pd.read_csv('pitchers_stats.csv')

# --- Pitchers: keep the lowest ERAs ---------------------------------------
# Unparseable ERA strings become NaN instead of raising.
pitchers_df['ERA'] = pd.to_numeric(pitchers_df['ERA'], errors='coerce')
lowest_eras = pitchers_df.nsmallest(PITCHER_SLOTS, 'ERA')
lowest_eras_array = lowest_eras[['Pitcher', 'ERA']].values
print(lowest_eras_array)

# --- Hitters: best AVG at each position, then best of the rest ------------
players_df['AVG'] = pd.to_numeric(players_df['AVG'], errors='coerce')
positions = ['C', '1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF', 'DH']
top_players_by_position = []

for pos in positions:
    # Drop rows whose AVG failed to parse: idxmax() has no valid label to
    # return when every value is NaN, which would break the .loc lookup.
    pos_df = players_df[players_df['Position'] == pos].dropna(subset=['AVG'])
    if not pos_df.empty:
        highest_avg_player = pos_df.loc[pos_df['AVG'].idxmax()]
        top_players_by_position.append(highest_avg_player)

top_players_by_position_df = pd.DataFrame(top_players_by_position)

# Spots still open once every position has its leader.
remaining_spots = HITTER_SLOTS - len(top_players_by_position_df)

# Position leaders go first; the overall top hitters fill the rest. Taking
# HITTER_SLOTS overall candidates is always enough, because after duplicates
# of the position leaders are dropped only the first HITTER_SLOTS rows are kept.
top_overall_players = players_df.sort_values(by='AVG', ascending=False).head(HITTER_SLOTS)
top_players_combined = (
    pd.concat([top_players_by_position_df, top_overall_players])
    .drop_duplicates()
    .head(HITTER_SLOTS)
)
top_players_array = top_players_combined[['Player', 'AVG', 'Position']].values
print(top_players_array)

# NOTE: both figures below are simple means of the selected players'
# individual numbers, not at-bat- or innings-weighted team statistics.
avg_batting_average = round(top_players_combined['AVG'].mean(), 3)
print(f"Average Batting Average: {avg_batting_average}")

avg_era = round(lowest_eras['ERA'].mean(), 2)
print(f"Average ERA: {avg_era}")


[['R López' 1.7]
 ['R Suárez' 2.01]
 ['T Houck' 2.18]
 ['C Burnes' 2.28]
 ['S Lugo' 2.29]
 ['T Skubal' 2.32]
 ['R Blanco' 2.34]
 ['T Anderson' 2.63]
 ['C Sánchez' 2.67]
 ['L Gilbert' 2.71]
 ['Z Wheeler' 2.73]
 ['G Stone' 2.73]
 ['C Sale' 2.79]]
[['A Rutschman' 0.3 'C']
 ['L Arraez' 0.31 '1B']
 ['J Altuve' 0.303 '2B']
 ['L Rengifo' 0.312 '3B']
 ['B Witt Jr.' 0.31 'SS']
 ['J Profar' 0.316 'LF']
 ['A Judge' 0.304 'CF']
 ['J Soto' 0.3 'RF']
 ['S Ohtani' 0.322 'DH']
 ['C Correa' 0.309 'SS']
 ['M Betts' 0.304 'SS']
 ['M Ozuna' 0.303 'DH']
 ['B Harper' 0.303 '1B']]
Average Batting Average: 0.307
Average ERA: 2.41


## Conclusion
To conclude this analysis, we observe that we beat both targets: our average batting average (.307) is above our goal of .300, and our average ERA (2.41) is below our goal of 2.50. If we went to the World Series today, we would have a great shot at winning!