# MLB All-Star Game Television Viewership Analysis

In [1]:
# imports
import numpy as np
import os.path as osPath
import pandas as pd

from icecream import ic

import helpers.data_utils as DataHelper

Invoking __init__.py for helpers


In [2]:
# get data from csv file
# NOTE(review): DataHelper.get_csv is a project helper that is not shown here.
# Judging by this cell's output it reads <path>/<filename> into a dataframe and
# prints the file path, shape and dtypes -- confirm against helpers/data_utils.
filename = 'all_star_game_tv_stats.csv'
path = osPath.join('data', 'baseball_almanac')  # relative to the notebook's working directory
df = DataHelper.get_csv(filename, path)



------------------------------------------------
Filepath: data/baseball_almanac/all_star_game_tv_stats.csv
Dataframe shape: (55, 6)
Data types...
Year | ASG      int64
Network        object
Rating        float64
Share           int64
Households     object
Viewers        object
dtype: object


In [3]:
# update column names
# maps the raw CSV header (left) to the name used in the rest of the notebook (right)
update_col_names = {
    'Year | ASG': 'Year',
    'Households': 'HouseholdViewers',
}

# rename() returns a new frame; labels missing from the frame are skipped silently
df = df.rename(columns=update_col_names)
print(df.columns)

Index(['Year', 'Network', 'Rating', 'Share', 'HouseholdViewers', 'Viewers'], dtype='object')


In [4]:
# remove unneeded columns
remove_cols = [
    'Network',
    'Viewers'
]

# Rebind instead of using inplace=True, and tolerate already-missing labels:
# with the in-place form a second execution of this cell raised KeyError
# because the columns had been removed by the first run.
df = df.drop(columns=remove_cols, errors='ignore')
print(df.columns)

Index(['Year', 'Rating', 'Share', 'HouseholdViewers'], dtype='object')


In [5]:
# update data types
# HouseholdViewers is loaded as text with thousands separators (e.g. "14,855,000").
# Only parse it while it is still non-numeric: once converted, the `.str`
# accessor raises AttributeError, which made this cell fail when re-run.
if not pd.api.types.is_numeric_dtype(df['HouseholdViewers']):
    df['HouseholdViewers'] = pd.to_numeric(
        # regex=False: the comma is a literal separator, not a pattern
        df['HouseholdViewers'].str.replace(',', '', regex=False)
    )
print(df.dtypes)

Year                  int64
Rating              float64
Share                 int64
HouseholdViewers      int64
dtype: object


In [6]:
def get_moving_range(dataframe, column: str, window_size: int, calculation: str) -> list:
    """ Function to get moving/rolling range for given dataframe series.

    A window of ``window_size`` consecutive rows is slid over the series one
    row at a time and ``calculation`` is applied to every full window, so a
    series of length ``n`` without missing values yields
    ``n - window_size + 1`` results.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Pandas dataframe containing target series.
    column : str
        Column name of target series.
    window_size : int
        Size of window for rolling range calculation. Must be at least 2 and
        at most the length of the series.
    calculation : str
        Calculation to be used: min, max, or mean.

    Returns
    -------
    _clean_windows : list
        List of rolling ranges with given calculation applied, in series
        order. Means are rounded to two decimals. Min/max results are
        converted to ``int`` when the source column has an integer dtype and
        are returned unchanged otherwise, so float columns are not truncated.

    Raises
    ------
    ValueError
        If window size is less than two or more than given series length.
    ValueError
        If range calculation is something other than min, max, or mean.

    Notes
    -----
    NaN results are dropped. That removes the incomplete leading windows, but
    it also removes any window that contains a missing value, in which case
    the output no longer lines up one-to-one with the row positions.
    """
    _valid_calcs = ['mean', 'min', 'max']
    _series = pd.Series(dataframe[column])
    _len_series = len(_series)

    # A window spanning the whole series is valid (it yields a single value),
    # so only sizes strictly greater than the series length are rejected.
    if window_size < 2 or window_size > _len_series:
        raise ValueError(f'Window size must be between 2 and {_len_series}...')

    if calculation not in _valid_calcs:
        raise ValueError(f'Invalid range calculation: {calculation}. Value must be in: {_valid_calcs}...')

    # The validated name doubles as the Rolling method name (mean/min/max).
    _dirty_windows = getattr(_series.rolling(window_size), calculation)()

    # rolling() always returns floats; restore ints only where the source
    # column really was integral (e.g. years), never for float columns.
    _as_int = calculation != 'mean' and pd.api.types.is_integer_dtype(_series)

    _clean_windows = []
    for _win in _dirty_windows.tolist():
        if np.isnan(_win):
            continue
        if calculation == 'mean':
            _win = round(_win, 2)
        elif _as_int:
            _win = int(_win)
        _clean_windows.append(_win)

    return _clean_windows

In [35]:
# get date range for moving averages
window_size = 10

# first and last year covered by each rolling window
min_years = get_moving_range(df, 'Year', window_size, calculation='min')
max_years = get_moving_range(df, 'Year', window_size, calculation='max')

# Label every window as "<first year>-<last year>". The loop variables are
# deliberately not called `min`/`max`: at notebook scope those names would
# shadow the builtins for every cell executed afterwards.
min_max_years = [
    f'{start_year}-{end_year}'
    for start_year, end_year in zip(min_years, max_years)
]
min_max_years

['1967-1976', '1968-1977', '1969-1978', '1970-1979', '1971-1980', '1972-1981', '1973-1982', '1974-1983', '1975-1984', '1976-1985', '1977-1986', '1978-1987', '1979-1988', '1980-1989', '1981-1990', '1982-1991', '1983-1992', '1984-1993', '1985-1994', '1986-1995', '1987-1996', '1988-1997', '1989-1998', '1990-1999', '1991-2000', '1992-2001', '1993-2002', '1994-2003', '1995-2004', '1996-2005', '1997-2006', '1998-2007', '1999-2008', '2000-2009', '2001-2010', '2002-2011', '2003-2012', '2004-2013', '2005-2014', '2006-2015', '2007-2016', '2008-2017', '2009-2018', '2010-2019', '2011-2021', '2012-2022']


In [38]:
# get moving average for df metrics
window_size = 10

# one rolling-mean list per metric, unpacked in the same order as the column names
moving_avg_rating, moving_avg_share, moving_avg_viewers = (
    get_moving_range(df, metric, window_size, calculation='mean')
    for metric in ('Rating', 'Share', 'HouseholdViewers')
)

In [36]:
def get_first_last(input_list: list) -> list:
    """ Picks the two end elements of input_list and returns them as a new list.

    Parameters
    ----------
    input_list : list
        Non-empty list to take the elements from. It is not modified.

    Returns
    -------
    list
        Two-element list ``[first, last]``. For a single-element input both
        entries are that same element.
    """
    # index -1 addresses the final element regardless of the list length
    return [input_list[0], input_list[-1]]

In [None]:
def get_percent_change(input_list: list) -> list:
    """ Returns the relative change from the first to the last element of input_list.

    The change is computed as ``(first - last) / first`` and rounded to two
    decimals. It is expressed as a fraction, not multiplied by 100 (0.76 means
    76%), and it is positive when the last element is smaller than the first,
    i.e. a positive value is a decrease.

    The result is padded with a leading NaN so that it has the same length as
    a [first, last] pair and can sit next to one as a dataframe column.

    Parameters
    ----------
    input_list : list
        List for which the change should be calculated. Only its first and
        last elements are read.

    Returns
    -------
    list
        ``[NaN, change]`` where ``change`` is the rounded fraction described
        above.
    """
    first_value, last_value = input_list[0], input_list[-1]
    relative_drop = (first_value - last_value) / first_value
    return [np.nan, round(relative_drop, 2)]

In [37]:
# create dataframe with first and last records
# (average column, delta column, rolling means) for every metric, in display order
summary_columns = [
    ('AvgRating', 'RatingDelta', moving_avg_rating),
    ('AvgShare', 'ShareDelta', moving_avg_share),
    ('AvgHouseholdViewers', 'ViewersDelta', moving_avg_viewers),
]

# row 0 describes the first rolling window, row 1 the last one
data = {'Years': get_first_last(min_max_years)}
for avg_col, delta_col, windows in summary_columns:
    data[avg_col] = get_first_last(windows)
    data[delta_col] = get_percent_change(windows)

first_last_df = pd.DataFrame(data)
first_last_df

Unnamed: 0,Years,AvgRating,RatingDelta,AvgShare,ShareDelta,AvgHouseholdViewers,ViewersDelta
0,1967-1976,24.07,,47.1,,14855000.0,
1,2012-2022,5.71,0.76,11.0,0.77,7172572.4,0.52


After reviewing television viewership data for MLB All-Star Games, we observe a massive decrease in viewership. Comparing the first ten-game window in this dataset (1967-1976) with the most recent one available (2012-2022, which spans eleven calendar years because the dataset has no entry for 2020), the average rating for All-Star Games dropped by 76%, the average share of the viewership market decreased by 77%, and the average number of household viewers fell by 52%.