In [41]:
import numpy as np
import pandas as pd
import hvplot.pandas
import datetime as dt
import requests
import re
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm import tqdm

In [42]:
# Set the random seed for reproducibility
from numpy.random import seed

seed(1)
from tensorflow import random

random.set_seed(2)

In [43]:
# Pulling website's source code using BeautifulSoup

url = "http://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2021/start/1"
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")

In [44]:
# creating dataframe structure

header = soup.find("tr", attrs = {"class": "colhead"})

columns = [col.get_text() for col in header.find_all("td")]
final_df = pd.DataFrame(columns=columns)
final_df

Unnamed: 0,Unnamed: 1,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA


In [45]:
# created a loop to capture all the players in the league,


for i in range(1,350,50):


    url = "http://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2021/start/{}".format(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    
    
    players_2021 = soup.find_all("tr", attrs = {"class": re.compile("row player-10-")})
    for player in players_2021:
        stats = [stat.get_text() for stat in player.find_all("td")]
        
        temp_df = pd.DataFrame(stats).transpose()
        temp_df.columns = columns
        
        mlb_all_stats_2021 = pd.concat([mlb_all_stats_2021, temp_df], ignore_index = True)
mlb_all_stats_2021

Unnamed: 0,Unnamed: 1,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
0,1,Luis Robert,2,68,275,42,93,22,1,13,43,14,61,6,1,.338
1,2,Trea Turner,7,148,595,107,195,34,3,28,77,41,110,32,5,.328
2,3,Frank Schwindel,2,64,242,44,79,20,1,14,43,16,41,2,1,.326
3,4,Yuli Gurriel,6,143,530,83,169,31,0,15,81,59,68,1,1,.319
4,5,Ketel Marte,7,90,340,52,108,29,1,14,50,31,60,2,0,.318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673,335,Martin Maldonado,11,125,373,40,64,10,1,12,36,47,127,0,0,.172
674,336,Matt Carpenter,11,130,207,18,35,11,1,3,21,35,77,2,0,.169
675,337,Cody Bellinger,5,95,315,39,52,9,2,10,36,31,94,3,1,.165
676,338,Jackie Bradley Jr.,9,134,387,39,63,14,3,6,29,28,132,7,1,.163


In [46]:
mlb_all_stats_2021.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
673     True
674     True
675     True
676     True
677     True
Length: 678, dtype: bool

In [47]:
mlb_all_stats_2021.PLAYER.nunique()

339

In [65]:
# Loop through each year 
year = 2006
while year < 2022:
    print(year)
    for i in tqdm(range(1,331,50)):

        # Pull website source code
        url = "http://www.espn.com/mlb/history/leaders/_/breakdown/season/year/{str(year)}/start/{i}"
        page = requests.get(url)
        soup = BeautifulSoup(page.text, "html.parser")

         # Pull in player rows
        players = soup.find_all("tr", attrs={"class": re.compile("row player-10-")})
        for player in players:

         # Get stats for each player
         stats = [stat.get_text() for stat in player.find_all("td")]   

         # Create a dataframe for the single player's stats
         temp_df = pd.DataFrame(stats).transpose()   
         temp_df.columns = columns

         # Join the single player's stat with dataset    
         final_df = pd.concat([final_df,temp_df], ignore_index=True)
    year +=1 

final_df       

2006


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.42s/it]


2007


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.51s/it]


2008


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.40s/it]


2009


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.40s/it]


2010


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.42s/it]


2011


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.56s/it]


2012


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.52s/it]


2013


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.54s/it]


2014


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.32s/it]


2015


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.46s/it]


2016


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.60s/it]


2017


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.37s/it]


2018


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.54s/it]


2019


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.43s/it]


2020


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.58s/it]


2021


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.37s/it]


Unnamed: 0,Unnamed: 1,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
0,1,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,.366
1,2,Rogers Hornsby,23,2259,8173,1579,2930,541,169,301,1318,1038,679,135,64,.358
2,3,Joe Jackson,13,1332,4981,873,1772,307,168,54,121,519,158,202,61,.356
3,4,Ed Delahanty,16,1835,7505,1599,2596,522,185,101,0,741,244,455,0,.346
4,5,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,.345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,,Chuck Klein,17,1753,6486,1168,2076,398,74,300,1201,601,521,79,0,.320
13996,47,Ken Williams,14,1397,4862,860,1552,285,77,196,860,566,287,154,106,.319
13997,48,Kirby Puckett,12,1783,7244,1071,2304,414,57,207,1085,450,965,134,76,.318
13998,,Vladimir Guerrero,16,2147,8155,1328,2590,477,46,449,1496,737,985,181,94,.318


In [60]:
# final_df.isnull().sum()

In [None]:
# final_df.dropna(inplace=True)

In [None]:
#final_df.to_csv(r"Resources/MLB_stats_2021.csv", index = False, sep=",", encoding ="utf-8")

In [67]:
# Export to csv
final_df.to_csv("mlb_stat.csv", index=False, sep=",", encoding="utf-8")