In [1]:
import pandas as pd
import requests

# all postseason data 1885-2021
tables = pd.read_html(f'https://www.baseball-reference.com/postseason/')
df_postseason = tables[0]

In [2]:
# drop any columns that have missing data
df_postseason = df_postseason.dropna(axis=0, how='any')
df_postseason = df_postseason.dropna(axis=1, how='any')

In [3]:
# split the series column into year and series name

# split the data
new = df_postseason['Series'].str.split(" ", n = 1, expand = True)

# drop the old series column
df_postseason2 = df_postseason.drop(columns = ['Series'])

# add the split columns to the dataframe
df_postseason2 = pd.concat([new, df_postseason2], axis=1)

# rename the columns
dict_rename = {0: 'Season',
               1: 'Series',
              'Unnamed: 1': 'Final_Score',
              'Unnamed: 2': 'Teams_Played'}
df_postseason2 = df_postseason2.rename(dict_rename, axis = 1)

# update the seasons to integers
df_postseason2['Season'] = df_postseason2['Season'].astype(int)

In [4]:
# split the Teams_Played column into the two teams that played

new2 = df_postseason2['Teams_Played'].str.split(".", n=1, expand=True)

# drop the teams_played column
df_postseason3 = df_postseason2.drop(columns = ['Teams_Played'])

# add the split columns to the dataframe
df_postseason3 = pd.concat([df_postseason3, new2], axis=1)

# rename the columns
dict_rename2 = {0: 'W_Team',
               1: 'L_Team'}
df_postseason3 = df_postseason3.rename(dict_rename2, axis=1)

In [5]:
# for the Winning_Team and Losing_Team columns, update the teams

def clean_team(team_str):
    """ clean the team string
        
    Args:
        team_str (str): the initial string representing the team
        
    Returns:
        team_name (str): the cleaned string
    """
    
    team_list = str(team_str).split('(')
    
    clean_team = team_list[0]
    
    clean_team = clean_team.strip()
    
    if clean_team.endswith('*'):
        clean_team = clean_team[:-1]
    
    return clean_team

In [6]:
df_postseason3['Winning_Team'] = df_postseason3['W_Team'].map(clean_team)
df_postseason3['Losing_Team'] = df_postseason3['L_Team'].map(clean_team)

In [7]:
# drop the old winning and losing team columns
df_postseason4 = df_postseason3.drop(columns = ['W_Team', 'L_Team'])

In [8]:
df_postseason4 = df_postseason4.sort_values('Season', ascending=False)

In [9]:
# drop all of the rows before 1988 and after 2018
df_filtered1 = df_postseason4[df_postseason4['Season'] <= 2018] 
df_filtered2 = df_postseason4[df_postseason4['Season'] >= 1988]

df_postseason_final = pd.merge(df_filtered1, df_filtered2, how="inner")

df_postseason_final.head()
#df_postseason_final.to_csv('postseason_data.csv')

Unnamed: 0,Season,Series,Final_Score,Winning_Team,Losing_Team
0,2018,ALCS,4-1,Boston Red Sox,Houston Astros
1,2018,NLCS,4-3,Los Angeles Dodgers,Milwaukee Brewers
2,2018,ALDS1,3-1,Boston Red Sox,New York Yankees
3,2018,ALDS2,3-0,Houston Astros,Cleveland Indians
4,2018,NLDS2,3-1,Los Angeles Dodgers,Atlanta Braves


In [24]:
def get_league(series_str):
    """ determine the league from the series string
        
    Args:
        series_str (str): the series string
        
    Returns:
        league (str): the league of the series
    """
    if str(series_str).startswith('AL'):
        league = 'AL'
    elif str(series_str).startswith('NL'):
        league = 'NL'
    else:
        league = 'None'

    return league

In [25]:
df_postseason_final['League'] = df_postseason_final['Series'].map(get_league)

# reorder columns
df_postseason_final = df_postseason_final[["Season", "League", "Series", 'Final']]
df_postseason_final.head()

Unnamed: 0,Season,Series,Final_Score,Winning_Team,Losing_Team,League
0,2018,ALCS,4-1,Boston Red Sox,Houston Astros,AL
1,2018,NLCS,4-3,Los Angeles Dodgers,Milwaukee Brewers,NL
2,2018,ALDS1,3-1,Boston Red Sox,New York Yankees,AL
3,2018,ALDS2,3-0,Houston Astros,Cleveland Indians,AL
4,2018,NLDS2,3-1,Los Angeles Dodgers,Atlanta Braves,NL
