In [59]:
import pandas as pd
import requests

# all postseason data 1885-2021
# Plain string literal: the original f-string had no placeholders to fill in.
POSTSEASON_URL = 'https://www.baseball-reference.com/postseason/'

# read_html returns one DataFrame per HTML table found on the page; the first
# one is used as the postseason series listing.
tables = pd.read_html(POSTSEASON_URL)
df_postseason = tables[0]

In [60]:
# drop every ROW that has a missing value in any column (axis=0 is rows).
# Once those rows are gone the frame holds no NaN at all, so the follow-up
# dropna(axis=1, how='any') that used to sit here could never remove a column
# and has been dropped.
df_postseason = df_postseason.dropna(axis=0, how='any')

In [61]:
#  get rid of future seasons
# NOTE(review): this removes the last 4 rows purely by position; confirm the
# count still matches the scraped table before relying on it.
df_postseason = df_postseason.iloc[:-4, :]

# get rid of years where postseason was not played
# regex=False matches the sentence literally (as a regex, the trailing "."
# would match any character). na=True flags non-string entries so that they
# are removed as well, exactly as the previous `== False` comparison did.
no_postseason = df_postseason["Unnamed: 2"].str.contains(
    "No Postseason played this year.", regex=False, na=True)
df_postseason = df_postseason[~no_postseason]

In [62]:
# split the series column into year and series name

# Cut "<year> <series name>" at the first space only, so multi-word names
# such as "World Series" stay in one piece.
season_and_series = df_postseason['Series'].str.split(" ", n=1, expand=True)

# Descriptive names for the two new columns and for the unnamed scraped ones.
column_names = {0: 'Season',
                1: 'Series',
                'Unnamed: 1': 'Final_Score',
                'Unnamed: 2': 'Teams_Played'}

# Replace the combined column with the two split ones (placed first).
remaining_columns = df_postseason.drop(columns=['Series'])
df_postseason2 = (
    pd.concat([season_and_series, remaining_columns], axis=1)
    .rename(columns=column_names)
)

# the split produces text; store the seasons as integers
df_postseason2['Season'] = df_postseason2['Season'].astype(int)
df_postseason2.head()

Unnamed: 0,Season,Series,Final_Score,Teams_Played
0,2021,World Series,4-2,"Atlanta Braves (88-73, NL) vs. Houston Astros ..."
1,2021,ALCS,4-2,"Houston Astros (95-67, AL) vs. Boston Red Sox*..."
2,2021,NLCS,4-2,"Atlanta Braves (88-73, NL) vs. Los Angeles Dod..."
3,2021,ALDS1,3-1,"Houston Astros (95-67, AL) vs. Chicago White S..."
4,2021,ALDS2,3-1,"Boston Red Sox* (92-70, AL) vs. Tampa Bay Rays..."


In [68]:
# Cut every matchup string at its opening parentheses: the text before the
# first "(" is the first team, and the next piece still holds
# "<record>) vs. <second team>".
new3 = df_postseason2['Teams_Played'].str.split('(')

# The second team is whatever follows the first "." of that next piece.
final = [
    [pieces[0].strip(), pieces[1].split('.', 1)[1].strip()]
    for pieces in new3
]

final

[['Atlanta Braves', 'Houston Astros'],
 ['Houston Astros', 'Boston Red Sox*'],
 ['Atlanta Braves', 'Los Angeles Dodgers*'],
 ['Houston Astros', 'Chicago White Sox'],
 ['Boston Red Sox*', 'Tampa Bay Rays'],
 ['Atlanta Braves', 'Milwaukee Brewers'],
 ['Los Angeles Dodgers*', 'San Francisco Giants'],
 ['Boston Red Sox*', 'New York Yankees*'],
 ['Los Angeles Dodgers*', 'St. Louis Cardinals*'],
 ['Los Angeles Dodgers', 'Tampa Bay Rays'],
 ['Tampa Bay Rays', 'Houston Astros*'],
 ['Los Angeles Dodgers', 'Atlanta Braves'],
 ['Tampa Bay Rays', 'New York Yankees*'],
 ['Houston Astros*', 'Oakland Athletics'],
 ['Los Angeles Dodgers', 'San Diego Padres*'],
 ['Atlanta Braves', 'Miami Marlins*'],
 ['Tampa Bay Rays', 'Toronto Blue Jays*'],
 ['Los Angeles Dodgers', 'Milwaukee Brewers*'],
 ['New York Yankees*', 'Cleveland Indians*'],
 ['San Diego Padres*', 'St. Louis Cardinals*'],
 ['Houston Astros*', 'Minnesota Twins'],
 ['Miami Marlins*', 'Chicago Cubs'],
 ['Oakland Athletics', 'Chicago White Sox*'],

In [None]:
# scratch check: a list placed inside another list is kept as one nested element
list1 = ['a', 'b']
final_list = [list1]
final_list

In [None]:
# Separate the two teams at the "vs." that sits between them. Splitting on the
# first bare "." instead cuts names such as "St. Louis Cardinals" in half
# whenever that team is listed first.
new2 = df_postseason2['Teams_Played'].str.split(r"\s*vs\.\s*", n=1, expand=True)

# preview only; the full frame has one row per series
new2.head()

In [None]:
# split the Teams_Played column into the two teams that played

# Split at the "vs." separator rather than at the first ".", which would break
# team names containing a period (e.g. "St. Louis Cardinals") when they are
# listed first. n=1 guarantees at most two resulting columns (0 and 1).
new2 = df_postseason2['Teams_Played'].str.split(r"\s*vs\.\s*", n=1, expand=True)

# drop the teams_played column
df_postseason3 = df_postseason2.drop(columns=['Teams_Played'])

# add the split columns to the dataframe
df_postseason3 = pd.concat([df_postseason3, new2], axis=1)

# rename the columns: the team listed first goes to W_Team, the second to L_Team
dict_rename2 = {0: 'W_Team',
                1: 'L_Team'}
df_postseason3 = df_postseason3.rename(dict_rename2, axis=1)

In [None]:
# split the Teams_Played column into the two teams that played
# NOTE(review): this cell repeats the previous one; it rebuilds df_postseason3
# from df_postseason2, so running it again is harmless.

# Use the "vs." separator as the split point. The first "." in the string is
# not reliable because some team names contain one ("St. Louis Cardinals").
new2 = df_postseason2['Teams_Played'].str.split(r"\s*vs\.\s*", n=1, expand=True)

# drop the teams_played column
df_postseason3 = df_postseason2.drop(columns=['Teams_Played'])

# add the split columns to the dataframe
df_postseason3 = pd.concat([df_postseason3, new2], axis=1)

# rename the columns: split column 0 becomes W_Team, split column 1 becomes L_Team
dict_rename2 = {0: 'W_Team',
                1: 'L_Team'}
df_postseason3 = df_postseason3.rename(dict_rename2, axis=1)

In [None]:
# for the Winning_Team and Losing_Team columns, update the teams

def clean_team(team_str):
    """ reduce a raw team entry to the bare team name

    Everything from the first "(" onwards is discarded, surrounding
    whitespace is trimmed, and a single trailing "*" is removed.

    Args:
        team_str (str): the initial string representing the team
            (any other value is converted with str() first)

    Returns:
        team_name (str): the cleaned string
    """
    # text in front of the first "(" (the whole text when there is none)
    team_name = str(team_str).partition('(')[0].strip()

    # remove one trailing asterisk marker, if present
    return team_name[:-1] if team_name.endswith('*') else team_name

In [None]:
# clean the raw team strings into new columns, one raw/clean pair at a time
for cleaned_column, raw_column in (('Winning_Team', 'W_Team'),
                                   ('Losing_Team', 'L_Team')):
    df_postseason3[cleaned_column] = df_postseason3[raw_column].map(clean_team)

In [None]:
# the raw W_Team / L_Team strings are replaced by their cleaned versions
df_postseason4 = df_postseason3.drop(['W_Team', 'L_Team'], axis='columns')

In [None]:
# most recent seasons first
df_postseason4 = df_postseason4.sort_values(by=['Season'], ascending=[False])

In [None]:
# drop all of the rows before 1988 and after 2018 (both bounds are kept)
FIRST_SEASON, LAST_SEASON = 1988, 2018

# A single boolean mask selects the range directly. The earlier approach
# built two separately filtered frames and inner-merged them on every column,
# which would multiply any fully duplicated rows.
# reset_index keeps the fresh 0..n-1 index that the merge used to produce.
in_season_range = df_postseason4['Season'].between(FIRST_SEASON, LAST_SEASON)
df_postseason_final = df_postseason4[in_season_range].reset_index(drop=True)

df_postseason_final.head()
#df_postseason_final.to_csv('postseason_data.csv')

In [None]:
def get_league(series_str):
    """ read the league off the front of a series label

    Args:
        series_str (str): the series string (converted with str() first)

    Returns:
        league (str): 'AL' or 'NL' when the label starts with that prefix,
            otherwise the string 'None'
    """
    # the league, when present, is the first two characters of the label
    prefix = str(series_str)[:2]
    return prefix if prefix in ('AL', 'NL') else 'None'

In [None]:
# derive the league of each series from its label, then reorder columns
league_column_order = ["Season", "League", "Series",
                       'Final_Score', 'Winning_Team', 'Losing_Team']
df_postseason_final = df_postseason_final.assign(
    League=df_postseason_final['Series'].map(get_league)
)[league_column_order]
df_postseason_final.head()

In [None]:
# get the series played
def get_series(series_str):
    """ determine the series from the series string

    Labels that start with a league prefix ('AL' or 'NL') are translated
    from their round code: 'CS' -> 'Championship', 'DS' -> 'Division',
    'WC' -> 'Wildcard'. A trailing series number is kept, so 'ALDS1' gives
    'Division1' and 'ALWC1' gives 'Wildcard1'. Labels without a league
    prefix are treated as the World Series.

    Previously every league series with an unlisted suffix (for example a
    numbered wild card series such as 'ALWC1') was reported as
    'WorldSeries'. An unrecognised round code is now returned unchanged
    instead of being mislabelled.

    Args:
        series_str (str): the series string (converted with str() first)

    Returns:
        series (str): the series played
    """
    series_text = str(series_str)

    # no league prefix: this is the World Series
    if not series_text.startswith(('AL', 'NL')):
        return 'WorldSeries'

    # Championship Series, Division Series 1 and 2, Wild Card
    round_names = {'CS': 'Championship',
                   'DS': 'Division',
                   'WC': 'Wildcard'}

    # split e.g. 'DS2' into the round code 'DS' and the series number '2'
    suffix = series_text[len('AL'):]
    round_code = suffix.rstrip('0123456789')
    series_number = suffix[len(round_code):]

    if round_code in round_names:
        return round_names[round_code] + series_number

    # unknown league round: keep the raw suffix rather than guess
    return suffix

In [None]:
# Build a new frame with .assign instead of setting a column in place:
# df_postseason_final was produced by a column selection in an earlier cell,
# and writing a column onto such a selection can raise SettingWithCopyWarning.
df_postseason_final = df_postseason_final.assign(
    Series_Played=df_postseason_final['Series'].map(get_series)
)

In [None]:
# reorder columns; the raw 'Series' label is left out in favour of 'Series_Played'
final_columns = ["Season", "League", "Series_Played",
                 'Final_Score', 'Winning_Team', 'Losing_Team']
df_postseason_final = df_postseason_final.loc[:, final_columns]
df_postseason_final.head()

In [None]:
# determine if the winning team is in the AL or NL
def determine_league(winning_team, season=None):
    """ determine if the winning team is in the AL (true) or NL (false)

    The lookup also recognises earlier and later names of American League
    franchises (for example 'Anaheim Angels'), which previously fell through
    to False.

    Args:
        winning_team (str) : the winning team, as a string
        season (int, optional) : the season the series was played in. When
            given, it is used for the two franchises that changed leagues:
            the Houston Astros (AL from 2013) and the Milwaukee Brewers
            (AL through 1997). When omitted, the present-day league is used.

    Returns:
        in_al (Boolean) : True if the team is in the AL, or False if the team is in the NL
    """

    AL_teams = ('Baltimore Orioles', 'Boston Red Sox', 'New York Yankees', 'Tampa Bay Rays',
                'Toronto Blue Jays', 'Chicago White Sox', 'Cleveland Indians', 'Detroit Tigers',
                'Kansas City Royals', 'Minnesota Twins', 'Houston Astros', 'Los Angeles Angels',
                'Oakland Athletics', 'Seattle Mariners', 'Texas Rangers',
                # other names used by the same AL franchises
                'California Angels', 'Anaheim Angels', 'Los Angeles Angels of Anaheim',
                'Tampa Bay Devil Rays', 'Cleveland Guardians')

    # franchises whose league depends on the season
    if season is not None:
        if winning_team == 'Houston Astros':
            return season >= 2013
        if winning_team == 'Milwaukee Brewers':
            return season <= 1997

    return winning_team in AL_teams

In [None]:
#df_postseason_final.to_csv('postseason_data.csv')