# Introduction
This note book creates and saves a dataframe for the MLB teams. 

# Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Notebook Setup

## Work around 403 error

In [2]:
# a small little ruse
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

## URL's

In [3]:
american_league_url = \
'https://www.mlb.com/stats/american-league/regular-season?playerPool=ALL'  

national_league_url = \
'https://www.mlb.com/stats/national-league/regular-season?playerPool=ALL'

# Functions
I have put most of the work into function so the actual workflow is streamlined below. 

## 1. `create_table`: scrap url pages to create table

### 1a. Helper Function: `make_url`
Convert a base-url to a url-with-a-page-number.

In [4]:
def make_url(url, i):
    """
    converts url to page number url
    
    input:
    'https://www.mlb.com/stats/american-league/regular-season?playerPool=ALL'
    
    output:
    'https://www.mlb.com/stats/american-league/regular-season?page={i}&playerPool=ALL'
    
    """
    split = url.split('?')
    base_url = split[0]+'?page='+str(i)+'&'+split[1]
    #base_url = "'"+base_url+"'"
    return base_url

### 1b. Primary Function: `create_table`
Uses `make_url` to convert base-url to a url with page number then loops through page numbers, to a limit set by second function parameter. Automatically breaks out of loop when no longer collecting data.

In [5]:
def create_table(url, num_of_pages_to_scrap):

    # page number for initializing the table
    i = 1
    
    # request for beautiful soup
    response = requests.get(make_url(url, i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the table from website
    table = soup.find('div', class_ = 'stats-body-table player')

    # convert to pandas df
    stats_table = pd.read_html(str(table))[0]
    
    # scrap multiple pages
    # start on page 2, first page used to initialize table
    for i in range(2,num_of_pages_to_scrap):

        # request for beautiful soup
        response = requests.get(make_url(url, i), headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # find the table
        table = soup.find('div', class_ = 'stats-body-table player')

        # convert to pandas df
        pandas_table = pd.read_html(str(table))[0]

        # check if there is data in the table
        if len(pandas_table) > 0:

            # if there is info in table then concat it to initialized table
            stats_table = pd.concat([stats_table, pandas_table])

        # if no data in the table then we have reached the end, break out of loop
        else:
            break
     
    # change column names from NAMENAME to NAME
    fix_col_names = []
    for col in stats_table.columns:
        if 'caret-upcaret-down' in col:
            name = col.split('caret-upcaret-down')[-1]
        else:
            name = col[:int(len(col)/2)]
        fix_col_names.append(name)

    stats_table.columns = fix_col_names
    
    # reset the index, drop old one
    stats_table.reset_index(drop=True, inplace=True)      

    return stats_table

## 2. `replace_with_accent_removed`: recreate table with accents removed from names

### 2a. Helper Function:  `remove_accent_chars_regex`

Code for this function is from [stackoverflow](https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string).

In [6]:
import unicodedata
import regex

def remove_accent_chars_regex(x: str):
    return regex.sub(r'\p{Mn}', '', unicodedata.normalize('NFKD', x))

### 2b. Primary Function: `replace_with_accent_removed`

Uses `remove_accent_chars_regex` to remove diacritical marks from a specifed column. This function will replace the values of the column with the cleaned values. 

In [7]:
def replace_with_accent_removed(table, col):
    cleaned_names = []
    for name in table[col]:
        cleaned = remove_accent_chars_regex(name)
        cleaned_names.append(cleaned)
        
    # replace with cleaned values
    table[col] = cleaned_names
        
    return table

## 3. `print_last_name_exceptions`: check that last name will split in a predictable way
I cannot check every name individually so this function will extract out the unusual names. I will use the features of these unusual names to ensure my next function correctly extracts first and last names of players. 

In [8]:
def print_last_name_exceptions(table, column_extracting_info_from):

    for row in table[column_extracting_info_from]:

        # raw string with "Jr." suffix removed
        player_gibberish = row.replace(" Jr.", "")
        
        # string split at first space only
        player_gibberish = player_gibberish.split(' ',1)

        # extract out row number
        row_num = ''.join(filter(str.isdigit, player_gibberish[0]))

        # remove row number from end of last element
        lastname_lastname_position = player_gibberish[-1].split(row_num)[0]

        # list out all positions
        positions = ['1B', '2B', '3B', 'RF', 'LF', 'CF', 'P', 'C', 'DH', 'SS', 'X']

        # loop through positions
        for position in positions:

            if position in lastname_lastname_position[-2:]:

                # duplicated last name
                last_last = lastname_lastname_position[:-len(position)]

                # remove second half
                half = len(last_last)/2

                # check - print names with exceptions
                last = last_last[int(half):]
                
                if lastname_lastname_position != last+last+position:                    
                    print(row)
                    print(lastname_lastname_position)
                    print()

                break

## 4. `first_last_position`: extract out first name, last name, and position
This function will extract the first name, last name, and position from a very gibberish looking string then add new columns for each. 

In [9]:
def first_last_position(table, column_extracting_info_from):
    
    ############################################################
    # Setup                                                    # 
    ############################################################
    
    # initialize empty list to hold values
    player_position = []
    player_last = []
    player_first = []
    
    # list out all positions
    positions = ['1B', '2B', '3B', 'RF', 'LF', 'CF', 'P', 'C', 'DH', 'SS', 'X']
    
    
    ############################################################
    # Loop through rows in table                               # 
    ############################################################
    
    for row in table[column_extracting_info_from]:
        
        # raw string with "Jr." suffix removed
        player_gibberish = row.replace(" Jr.", "")
    
        # determine if there is still a '.' (this would be from a middle initial)
        if '.' in player_gibberish:
            # split at all spaces
            player_gibberish = player_gibberish.split()

        else:
            # only split at first space
            player_gibberish = player_gibberish.split(' ',1)

        # extract out row number
        row_num = ''.join(filter(str.isdigit, player_gibberish[0]))

        ########################
        # First Name           #
        ########################

        # determine if number of elements more than 2
        if len(player_gibberish) > 2:
            # concat first two elements with a space between
            player_gibberish = [player_gibberish[0]+' '+player_gibberish[1],\
                                player_gibberish[2]]    

        # remove row number from front of first element
        firstname_firstinitial = player_gibberish[0].split(row_num)[1]

        # remove first initial from end of first element
        first = firstname_firstinitial[:-1]  

        # append first name to list
        player_first.append(first)


        ########################
        # Last Name & Position #
        ########################

        # remove row number from end last element
        lastname_lastname_position = player_gibberish[-1].split(row_num)[0]


        # loop through positions
        for position in positions:

            if position in lastname_lastname_position[-2:]:
                # append position to list
                player_position.append(position)

                # remove position to get duplicated last name
                last_last = lastname_lastname_position[:-len(position)]

                # remove second half
                half = len(last_last)/2

                # select half of `last_last`
                last = last_last[int(half):]

                # append last name to list
                player_last.append(last)

                # once you have the position and last name assignment break out of 
                # positions loop
                break


    
    
    ############################################################
    # add new columns to table fed into function               #
    ############################################################
    
    table['First Name'] = player_first
    table['Last Name'] = player_last
    table['Position'] = player_position

    return table


# Create Tables

## American League

### Create Table

In [10]:
american_league_table = create_table(american_league_url, 50)

# check
american_league_table

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS
0,1OttoO LopezLopezSS1‌‌‌,TOR,8,9,0,6,0,0,0,3,1,1,0,1,0.667,0.700,0.667,1.367
1,2MattM CarpenterCarpenterDH2‌‌‌,NYY,47,128,28,39,9,0,15,37,19,35,0,0,0.305,0.412,0.727,1.139
2,3AaronA JudgeJudgeCF3‌‌‌,NYY,157,570,133,177,28,0,62,131,111,175,16,3,0.311,0.425,0.686,1.111
3,4DavidD HensleyHensleyDH4‌‌‌,HOU,16,29,7,10,2,1,1,5,5,6,0,0,0.345,0.441,0.586,1.027
4,5ConnerC CapelCapelRF5‌‌‌,OAK,13,35,6,13,0,1,2,9,4,8,1,1,0.371,0.425,0.600,1.025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406,349DJD StewartStewartX349‌‌‌,BAL,3,3,0,0,0,0,0,0,0,2,0,0,0.000,0.000,0.000,0.000
407,349DillonD TateTateP349‌‌‌,BAL,67,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000
408,349CalebC ThielbarThielbarP349‌‌‌,MIN,67,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000
409,349DonovanD WaltonWalton3B349‌‌‌,SEA,1,0,1,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000


### Check for unusually splitting names

In [11]:
print_last_name_exceptions(american_league_table, 'PLAYER')

17JoseJ RamírezRamirez3B17‌‌‌
RamírezRamirez3B

19EloyE JiménezJimenezDH19‌‌‌
JiménezJimenezDH

21JulioJ RodríguezRodriguezCF21‌‌‌
RodríguezRodriguezCF

31YandyY DíazDiaz3B31‌‌‌
DíazDiaz3B

39TeoscarT HernándezHernandezRF39‌‌‌
HernándezHernandezRF

45EugenioE SuárezSuarez3B45‌‌‌
SuárezSuarez3B

70AdolisA GarcíaGarciaRF70‌‌‌
GarcíaGarciaRF

77HaroldH RamírezRamirezDH77‌‌‌
RamírezRamirezDH

103RamonR UríasUrias3B103‌‌‌
UríasUrias3B

107JeremyJ PeñaPenaSS107‌‌‌
PeñaPenaSS

109ChristianC VázquezVazquezC109‌‌‌
VázquezVazquezC

128AledmysA DíazDiazLF128‌‌‌
DíazDiazLF

151JavierJ BáezBaezSS151‌‌‌
BáezBaezSS

153Michael A.M TaylorTaylorCF153‌‌‌
A.M TaylorTaylorCF

178FranciscoF MejíaMejiaC178‌‌‌
MejíaMejiaC

198EnriqueK HernándezHernandezCF198‌‌‌
HernándezHernandezCF

228VimaelV MachínMachin3B228‌‌‌
MachínMachin3B

259Josh H.J SmithSmith3B259‌‌‌
H.J SmithSmith3B

261CarlosC PérezPerezC261‌‌‌
PérezPerezC

266AndyA IbáñezIbanez3B266‌‌‌
IbáñezIbanez3B

268MauricioM DubónDubonCF268‌‌‌
DubónDubonCF

<div class="alert alert-block alert-info">
<b>NOTES:</b> Most names have diacritical marks that need to be removed; the `replace_with_accent_removed` function will remove these and change the values of table. I will update and reasses. 
</div>

### Replace the names with cleaned names

In [12]:
american_league_table = replace_with_accent_removed(american_league_table, 'PLAYER')

# check again
print_last_name_exceptions(american_league_table, 'PLAYER')

153Michael A.M TaylorTaylorCF153‌‌‌
A.M TaylorTaylorCF

259Josh H.J SmithSmith3B259‌‌‌
H.J SmithSmith3B



<div class="alert alert-block alert-info">
<b>NOTES:</b> The remaining names have a middle initial, this is already accounted for in the `first_last_position` function. So far I do not predict any issues. 
</div>

### Extract out first, last, and position

In [13]:
american_league_table = first_last_position(american_league_table, 'PLAYER')

# check
american_league_table

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,OPS,First Name,Last Name,Position
0,1OttoO LopezLopezSS1‌‌‌,TOR,8,9,0,6,0,0,0,3,...,1,0,1,0.667,0.700,0.667,1.367,Otto,Lopez,SS
1,2MattM CarpenterCarpenterDH2‌‌‌,NYY,47,128,28,39,9,0,15,37,...,35,0,0,0.305,0.412,0.727,1.139,Matt,Carpenter,DH
2,3AaronA JudgeJudgeCF3‌‌‌,NYY,157,570,133,177,28,0,62,131,...,175,16,3,0.311,0.425,0.686,1.111,Aaron,Judge,CF
3,4DavidD HensleyHensleyDH4‌‌‌,HOU,16,29,7,10,2,1,1,5,...,6,0,0,0.345,0.441,0.586,1.027,David,Hensley,DH
4,5ConnerC CapelCapelRF5‌‌‌,OAK,13,35,6,13,0,1,2,9,...,8,1,1,0.371,0.425,0.600,1.025,Conner,Capel,RF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406,349DJD StewartStewartX349‌‌‌,BAL,3,3,0,0,0,0,0,0,...,2,0,0,0.000,0.000,0.000,0.000,DJ,Stewart,X
407,349DillonD TateTateP349‌‌‌,BAL,67,0,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Dillon,Tate,P
408,349CalebC ThielbarThielbarP349‌‌‌,MIN,67,0,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Caleb,Thielbar,P
409,349DonovanD WaltonWalton3B349‌‌‌,SEA,1,0,1,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Donovan,Walton,3B


<div class="alert alert-block alert-success">
<b>Success:</b> Three columns have been added. Ready to move on to National League. 
</div>

## National League

### Create Table

In [14]:
national_league_table = create_table(national_league_url, 50)

# check
national_league_table

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS
0,1KhalilK LeeLeeCF1‌‌‌,NYM,2,2,1,1,0,0,1,3,0,0,0,0,0.500,0.500,2.000,2.500
1,2ChadwickC TrompTrompC2‌‌‌,ATL,1,4,0,3,2,0,0,3,0,0,0,0,0.750,0.750,1.250,2.000
2,3JamesJ OutmanOutmanLF3‌‌‌,LAD,4,13,6,6,2,0,1,3,2,7,0,0,0.462,0.563,0.846,1.409
3,4JoeJ DunandDunand2B4‌‌‌,MIA,3,10,2,3,1,0,1,1,0,3,0,0,0.300,0.364,0.700,1.064
4,5EvanE PhillipsPhillipsP5‌‌‌,LAD,64,0,0,0,0,0,0,0,1,0,0,0,0.000,1.000,0.000,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,355Ka'aiK TomTomRF355‌‌‌,SF,1,1,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000
423,355LukeL WeaverWeaverP355‌‌‌,AZ,13,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000
424,355J.B.J WendelkenWendelkenP355‌‌‌,AZ,29,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000
425,355TrevorT WilliamsWilliamsP355‌‌‌,NYM,30,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000


### Check for unusually splitting names

In [15]:
print_last_name_exceptions(national_league_table, 'PLAYER')

24MichaelM Harris IIHarrisCF24‌‌‌
Harris IIHarrisCF

32Ji HwanJ BaeBaeCF32‌‌‌
HwanJ BaeBaeCF

60FranciscoF ÁlvarezAlvarezDH60‌‌‌
ÁlvarezAlvarezDH

79RonaldR AcuñaAcunaRF79‌‌‌
AcuñaAcunaRF

93LuisL UríasUrias3B93‌‌‌
UríasUrias3B

133LuisL GarcíaGarciaSS133‌‌‌
GarcíaGarciaSS

161JesusJ SánchezSanchezCF161‌‌‌
SánchezSanchezCF

179MiguelM AndújarAndujarDH179‌‌‌
AndújarAndujarDH

202EliasE DíazDiazC202‌‌‌
DíazDiazC

211MauricioM DubónDubonCF211‌‌‌
DubónDubonCF

221CesarC HernándezHernandez2B221‌‌‌
HernándezHernandez2B

229SergioS AlcántaraAlcantara3B229‌‌‌
AlcántaraAlcantara3B

241OmarO NarváezNarvaezC241‌‌‌
NarváezNarvaezC

256AvisailA GarcíaGarciaRF256‌‌‌
GarcíaGarciaRF

288LewinL DíazDiaz1B288‌‌‌
DíazDiaz1B

301ErikE GonzálezGonzalezSS301‌‌‌
GonzálezGonzalezSS

323Jose J BarreroBarreroSS323‌‌‌
J BarreroBarreroSS

327DomD NuñezNunezC327‌‌‌
NuñezNunezC

330MannyM PiñaPinaC330‌‌‌
PiñaPinaC

331RobinsonR CanóCano2B331‌‌‌
CanóCano2B

355AlexA ColoméColomeP355‌‌‌
ColoméColomeP

355EdwinE DíazD

<div class="alert alert-block alert-info">
<b>NOTES:</b> Same observation as above with american league, most names have diacritical marks that need to be removed. I will clean the names then reasses.
</div>

### Replace the names with cleaned names

In [16]:
national_league_table = replace_with_accent_removed(national_league_table, 'PLAYER')

# check again
print_last_name_exceptions(national_league_table, 'PLAYER')

24MichaelM Harris IIHarrisCF24‌‌‌
Harris IIHarrisCF

32Ji HwanJ BaeBaeCF32‌‌‌
HwanJ BaeBaeCF

323Jose J BarreroBarreroSS323‌‌‌
J BarreroBarreroSS



<div class="alert alert-block alert-warning">
<b>NOTES:</b> Unfortunatly this is not the same as above. These are not just middle initials so the current function cannot handle these values appropiately and I will have to replace these values individually before moving forward.
</div>

### Change unusual values individually

In [17]:
t = national_league_table
c = 'PLAYER'
harris_string = 'Harris II'
bae_string = 'Ji Hwan'
barrero_string = 'Barrero'

updated_values = []

for row in t[c]:
    if harris_string in row:
        updated_values.append('24MichaelM HarrisHarrisCF24')
    elif bae_string in row:
        updated_values.append('32Ji_HwanJ BaeBaeCF32')
    elif barrero_string in row:
        updated_values.append('323JoseJ BarreroBarreroSS323')
    else:
        updated_values.append(row)
        
national_league_table[c] = updated_values

# check again
print_last_name_exceptions(national_league_table, 'PLAYER')

<div class="alert alert-block alert-success">
<b>Success:</b> There are no names that printed, meaning all names will split appropiatly. I can continue to extract out names and position of players.  
</div>

### Extract out first, last, and position

In [18]:
national_league_table = first_last_position(national_league_table, 'PLAYER')

# check
national_league_table

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,OPS,First Name,Last Name,Position
0,1KhalilK LeeLeeCF1‌‌‌,NYM,2,2,1,1,0,0,1,3,...,0,0,0,0.500,0.500,2.000,2.500,Khalil,Lee,CF
1,2ChadwickC TrompTrompC2‌‌‌,ATL,1,4,0,3,2,0,0,3,...,0,0,0,0.750,0.750,1.250,2.000,Chadwick,Tromp,C
2,3JamesJ OutmanOutmanLF3‌‌‌,LAD,4,13,6,6,2,0,1,3,...,7,0,0,0.462,0.563,0.846,1.409,James,Outman,LF
3,4JoeJ DunandDunand2B4‌‌‌,MIA,3,10,2,3,1,0,1,1,...,3,0,0,0.300,0.364,0.700,1.064,Joe,Dunand,2B
4,5EvanE PhillipsPhillipsP5‌‌‌,LAD,64,0,0,0,0,0,0,0,...,0,0,0,0.000,1.000,0.000,1.000,Evan,Phillips,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,355Ka'aiK TomTomRF355‌‌‌,SF,1,1,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Ka'ai,Tom,RF
423,355LukeL WeaverWeaverP355‌‌‌,AZ,13,0,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Luke,Weaver,P
424,355J.B.J WendelkenWendelkenP355‌‌‌,AZ,29,0,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,J.B.,Wendelken,P
425,355TrevorT WilliamsWilliamsP355‌‌‌,NYM,30,0,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Trevor,Williams,P


<div class="alert alert-block alert-success">
<b>Success:</b> The American and National league tables are ready to be concatenated.   
</div>

# Concat American and National League tables

In [19]:
MLB = pd.concat([american_league_table, national_league_table], ignore_index=True)
MLB

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,OPS,First Name,Last Name,Position
0,1OttoO LopezLopezSS1‌‌‌,TOR,8,9,0,6,0,0,0,3,...,1,0,1,0.667,0.700,0.667,1.367,Otto,Lopez,SS
1,2MattM CarpenterCarpenterDH2‌‌‌,NYY,47,128,28,39,9,0,15,37,...,35,0,0,0.305,0.412,0.727,1.139,Matt,Carpenter,DH
2,3AaronA JudgeJudgeCF3‌‌‌,NYY,157,570,133,177,28,0,62,131,...,175,16,3,0.311,0.425,0.686,1.111,Aaron,Judge,CF
3,4DavidD HensleyHensleyDH4‌‌‌,HOU,16,29,7,10,2,1,1,5,...,6,0,0,0.345,0.441,0.586,1.027,David,Hensley,DH
4,5ConnerC CapelCapelRF5‌‌‌,OAK,13,35,6,13,0,1,2,9,...,8,1,1,0.371,0.425,0.600,1.025,Conner,Capel,RF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,355Ka'aiK TomTomRF355‌‌‌,SF,1,1,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Ka'ai,Tom,RF
834,355LukeL WeaverWeaverP355‌‌‌,AZ,13,0,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Luke,Weaver,P
835,355J.B.J WendelkenWendelkenP355‌‌‌,AZ,29,0,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,J.B.,Wendelken,P
836,355TrevorT WilliamsWilliamsP355‌‌‌,NYM,30,0,0,0,0,0,0,0,...,0,0,0,0.000,0.000,0.000,0.000,Trevor,Williams,P


<div class="alert alert-block alert-info">
<b>NOTES:</b> Column names are not reader-friendly. I will scrap webpage to create a legend-dictionary to convert the columns to something more understandable. 
</div>

# Fix Column Names

## Create legend dictionary to rename columns

In [20]:
# make request
response = requests.get(american_league_url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# find the div table from website
table = soup.find('div', class_ = 'stats-body-table player')

# epmpty dictionary to store key value pairs
legend_dict = {}

# loop through all the buttons in `table`
for button in table.find_all('button'):
    # get the text of the button
    key = button.get_text().replace('caret-upcaret-down', '')
    # get the description, cut off 'Column Sort'
    value = button['aria-label'].replace(' Column Sort', '') 
    
    legend_dict[key] = value # create and store key value pair

# view dictionary
legend_dict

{'PLAYER': 'Player',
 'TEAM': 'Team',
 'G': 'Games Played',
 'AB': 'At Bats',
 'R': 'Runs',
 'H': 'Hits',
 '2B': 'Doubles',
 '3B': 'Triples',
 'HR': 'Home Runs',
 'RBI': 'Runs Batted In',
 'BB': 'Walks',
 'SO': 'Strikeouts',
 'SB': 'Stolen Bases',
 'CS': 'Caught Stealing',
 'AVG': 'Batting Average',
 'OBP': 'On-Base Percentage',
 'SLG': 'Slugging Percentage',
 'OPS': 'On-Base Plus Slugging',
 '1': 'page 1 button',
 '2': 'page 2 button',
 '3': 'page 3 button',
 '4': 'page 4 button',
 '5': 'page 5 button',
 '...': 'page ... button',
 '17': 'page 17 button',
 'Next': 'next page button'}

<div class="alert alert-block alert-info">
<b>NOTES:</b> Use this dictionary to give more reader-friendly column names. 
</div>

## Rename columns and drop `Player` column

In [21]:
MLB.rename(columns = legend_dict, inplace = True)
MLB.drop('Player', axis=1, inplace = True)
MLB.head()

Unnamed: 0,Team,Games Played,At Bats,Runs,Hits,Doubles,Triples,Home Runs,Runs Batted In,Walks,Strikeouts,Stolen Bases,Caught Stealing,Batting Average,On-Base Percentage,Slugging Percentage,On-Base Plus Slugging,First Name,Last Name,Position
0,TOR,8,9,0,6,0,0,0,3,1,1,0,1,0.667,0.7,0.667,1.367,Otto,Lopez,SS
1,NYY,47,128,28,39,9,0,15,37,19,35,0,0,0.305,0.412,0.727,1.139,Matt,Carpenter,DH
2,NYY,157,570,133,177,28,0,62,131,111,175,16,3,0.311,0.425,0.686,1.111,Aaron,Judge,CF
3,HOU,16,29,7,10,2,1,1,5,5,6,0,0,0.345,0.441,0.586,1.027,David,Hensley,DH
4,OAK,13,35,6,13,0,1,2,9,4,8,1,1,0.371,0.425,0.6,1.025,Conner,Capel,RF


<div class="alert alert-block alert-success">
<b>Success:</b> This df is ready to be pickled. 
</div>

# Pickle MLB df

In [22]:
pd.to_pickle(MLB, "./MLB_df.pkl")