In [11]:
# imports
import pandas as pd
import numpy as np
import os

In [2]:
# list of features (total)
# every entry reads '<column name>: <description>'

# per-team statistics taken from the previous season: (column suffix, description)
# the same table is used for the home (H...) and the visiting (V...) team
_PREV_SEASON_STATS = [
    ('Rk', 'rank'),
    ('AvAge', 'average age'),
    ('GP', 'games played'),
    ('W', 'wins'),
    ('L', 'losses'),
    ('OL', 'overtime losses'),
    ('PTS', 'points'),
    ('PTS%', 'points percentage'),
    ('GF', 'goals for'),
    ('GA', 'goals against'),
    ('SOW', 'shootout wins'),
    ('SOL', 'shootout losses'),
    ('SRS', 'simple rating system'),
    ('SOS', 'strength of schedule'),
    ('GF/G', 'goals for per game'),
    ('GA/G', 'goals against per game'),
    ('PP', 'power play goals'),
    ('PPO', 'power play opportunities'),
    ('PP%', 'power play percentage'),
    ('PPA', 'power play goals against'),
    ('PPOA', 'power play opportunities against'),
    ('PK%', 'penalty kill percentage'),
    ('SH', 'short-handed goals'),
    ('SHA', 'short-handed goals against'),
    ('PIM/G', 'penalty minutes per game'),
    ('oPIM/G', 'opponent penalty minutes per game'),
    ('S', 'shots'),
    ('S%', 'shooting percentage'),
    ('SA', 'shots against'),
    ('SV%', 'save percentage'),
    ('SO', 'shutouts'),
]


def _team_features(prefix, side):
    '''
    feature descriptions for one side of the matchup
    prefix: 'H' or 'V', side: 'home' or 'visitor'
    '''
    entries = [
        f'{prefix}{col}: previous season {side} team {desc}'
        for col, desc in _PREV_SEASON_STATS
    ]
    # the team name sits right after the rank
    entries.insert(1, f'{prefix}Team: {side} team name')
    return entries


feat = [
    # home team features
    *_team_features('H', 'home'),
    'HCAN: dummy variable for canadian home team',

    # visiting team features
    *_team_features('V', 'visitor'),
    'VCAN: dummy variable for canadian visiting team',

    # general features
    # NOTE(review): 'DAY' is described here, but season_data in this notebook never
    # creates a DAY column (only the six weekday dummies) -- confirm whether it is still wanted
    'DAY: dummy variable for weekend or weekday game',
    'TDAY: dummy variable for Tuesday game',
    'WDAY: dummy variable for Wednesday game',
    'ThDAY: dummy variable for Thursday game',
    'FDAY: dummy variable for Friday game',
    'SDAY: dummy variable for Saturday game',
    'SuDAY: dummy variable for Sunday game',
    'DIV: dummy variable for in-division game',
    'CAP: home team capacity',
    'LCAP: natural logarithm of home team capacity',
]

In [3]:
# teams
# clubs based in canada (used for the HCAN / VCAN dummies)
can_teams = [
    'Montreal Canadiens',
    'Toronto Maple Leafs',
    'Vancouver Canucks',
    'Calgary Flames',
    'Edmonton Oilers',
    'Ottawa Senators',
    'Winnipeg Jets',
]

# division rosters, keyed by the short division code stored in team_to_div
# NOTE(review): one fixed alignment is applied to every season that gets loaded --
# confirm it holds for the older seasons (teams such as Arizona may have switched divisions)
_DIVISION_ROSTERS = {
    'atl': [
        'Toronto Maple Leafs', 'Florida Panthers', 'Tampa Bay Lightning',
        'Detroit Red Wings', 'Ottawa Senators', 'Boston Bruins',
        'Montreal Canadiens', 'Buffalo Sabres',
    ],
    'met': [
        'Carolina Hurricanes', 'New Jersey Devils', 'New York Rangers',
        'New York Islanders', 'Pittsburgh Penguins', 'Washington Capitals',
        'Philadelphia Flyers', 'Columbus Blue Jackets',
    ],
    'cen': [
        'Colorado Avalanche', 'Dallas Stars', 'Winnipeg Jets',
        'Minnesota Wild', 'Nashville Predators', 'St. Louis Blues',
        'Arizona Coyotes', 'Chicago Blackhawks',
    ],
    'pac': [
        'Vegas Golden Knights', 'Edmonton Oilers', 'Los Angeles Kings',
        'Seattle Kraken', 'Calgary Flames', 'Vancouver Canucks',
        'San Jose Sharks', 'Anaheim Ducks',
    ],
}

# flat lookup: team name --> division code
team_to_div = {
    team: div_code
    for div_code, roster in _DIVISION_ROSTERS.items()
    for team in roster
}

In [4]:
# BASE DATAFRAMES (PUT ALL SEASONS TOGETHER IN ONE DF)
# first build a df per season, then combine all seasons

# clean stats data
def clean_stat(df_stat_prev):
    '''
    clean statistics data so that it can be combined with the attendance data

    df_stat_prev: previous season team statistics, must contain every column listed in numeric_cols
    returns a new df in which those columns are numeric (values that cannot be parsed become NaN)
        all other columns are passed through unchanged, and df_stat_prev itself is not modified
    raises KeyError if one of the statistic columns is missing
    '''

    # statistic columns that must be numeric before they can be used as features
    numeric_cols = [
        'Rk', 'AvAge', 'GP', 'W', 'L', 'OL', 'PTS', 'PTS%', 'GF', 'GA',
        'SOW', 'SOL', 'SRS', 'SOS', 'GF/G', 'GA/G', 'PP', 'PPO', 'PP%', 'PPA',
        'PPOA', 'PK%', 'SH', 'SHA', 'PIM/G', 'oPIM/G', 'S', 'S%', 'SA', 'SV%',
        'SO'
    ]

    # work on a copy so the frame handed in by the caller keeps its raw values
    df_clean = df_stat_prev.copy()

    # --> convert to numeric
    df_clean[numeric_cols] = df_clean[numeric_cols].apply(pd.to_numeric, errors = 'coerce')

    return df_clean

# clean attendance data
def clean_att(df_res, min_att = 18000, max_att = 25000):
    '''
    clean attendance data
    remove games with extreme outliers in attendance
    ensure attendance is numeric

    df_res: season results, needs the columns Attendance and Notes
    min_att: games with attendance at or below this value are removed
    max_att: games with attendance at or above this value are removed
    returns a new, re-indexed df that keeps only games with
        - no entry in Notes (NaN or empty string)
        - min_att < Attendance < max_att (missing / unparseable attendance is removed too)
    df_res itself is not modified
    '''

    # work on a copy so the frame handed in by the caller is left untouched
    df_clean = df_res.copy()

    # make sure attendance is numeric (anything unparseable becomes NaN)
    df_clean['Attendance'] = pd.to_numeric(df_clean['Attendance'], errors = 'coerce')

    # games with notes (specialty games) --> remove these entries cause they'll just mess up the data
    has_note = df_clean['Notes'].notna() & (df_clean['Notes'] != '')

    # games with wack attendance
    # 2022 season still partly affected by COVID-19 --> remove these entries cause they'll just mess up the data
    # NaN fails both comparisons, so games without attendance are removed here as well
    plausible_att = df_clean['Attendance'].gt(min_att) & df_clean['Attendance'].lt(max_att)

    # keep the remaining games and re-index
    return df_clean[~has_note & plausible_att].reset_index(drop = True)

In [28]:
# combine stats and attendance data
def season_data(df_res, df_stat_prev, df_arena):
    '''
    combine attendance data with the cleaned statistics data from the previous season
    for full list of features, see [feat]
    df_res: current season results (needs the columns Attendance, Home, Visitor, Day)
        trying to predict attendance for current season
    df_stat_prev: previous season statistics (one row per team, identified by the Team column)
        use previous season statistics for prediction (with additional game day variables)
        every column other than Team is attached twice: H-prefixed for the home team,
        V-prefixed for the visiting team
    df_arena: current season arena capacities (needs the columns Team, Capacity)
        add arena capacity (for current season) as variable
    returns combined df with one row per game of df_res; none of the inputs are modified
        teams without a row in df_stat_prev get NaN in their H... / V... statistics
    '''

    ### renaming of Phoenix Coyotes --> Arizona Coyotes
    # applied to a renamed copy so the caller's statistics frame keeps its original names
    if ('Arizona Coyotes' in df_res['Visitor'].values) and ('Arizona Coyotes' not in df_stat_prev['Team'].values):
        df_stat_prev = df_stat_prev.assign(
            Team = df_stat_prev['Team'].replace('Phoenix Coyotes', 'Arizona Coyotes')
        )

    # previous season statistics indexed by team name, shared by the home and the visitor join
    stats_by_team = df_stat_prev.set_index('Team')

    ###
    # make new df with desired variables
    df_vars = pd.DataFrame()

    # attendance and log attendance
    df_vars['A'] = df_res['Attendance']
    df_vars['LA'] = np.log(df_vars['A'])

    # home team / visiting team
    df_vars['H'] = df_res['Home']
    df_vars['V'] = df_res['Visitor']

    ### team variables: first all home (H) columns, then all visitor (V) columns
    for prefix in ('H', 'V'):
        df_vars = df_vars.join(stats_by_team.add_prefix(prefix), on = prefix)

        # canadian team dummy (1 - canadian, 0 - otherwise)
        df_vars[prefix + 'CAN'] = df_vars[prefix].isin(can_teams).astype(int)

    # TODO: expansion teams (Seattle Kraken, Vegas Golden Knights) have no previous season row,
    # so their statistics stay NaN in their first season -- how to fill them is still undecided

    ### game variables
    # no monday dummy --> multicollinearity
    # one dummy per remaining weekday (1 - game played on that day, 0 - otherwise)
    day_dummies = {
        'TDAY': 'Tuesday',
        'WDAY': 'Wednesday',
        'ThDAY': 'Thursday',
        'FDAY': 'Friday',
        'SDAY': 'Saturday',
        'SuDAY': 'Sunday',
    }
    for col, day in day_dummies.items():
        df_vars[col] = df_res['Day'].isin([day]).astype(int)

    # in-division game (1 - both teams are in team_to_div and share a division, 0 - otherwise)
    # a team missing from team_to_div never counts as in-division, even against another
    # missing team; done column-wise so a season without any games works too
    # NOTE(review): team_to_div holds a single alignment for all seasons -- confirm it
    # matches the divisions of the season being processed
    home_div = df_vars['H'].map(team_to_div)
    away_div = df_vars['V'].map(team_to_div)
    df_vars['DIV'] = (home_div.notna() & away_div.notna() & (home_div == away_div)).astype(int)

    # capacity of home team (NaN if the home team is not listed in df_arena)
    df_vars['CAP'] = df_vars['H'].map(df_arena.set_index('Team')['Capacity'])

    # log capacity
    df_vars['LCAP'] = np.log(df_vars['CAP'])

    # percentage attendance
    df_vars['PA'] = df_vars['A'] / df_vars['CAP']

    # log percentage attendance
    df_vars['LPA'] = np.log(df_vars['PA'])

    # log attendance / log capacity
    df_vars['LA/LC'] = df_vars['LA'] / df_vars['LCAP']

    return df_vars

In [29]:
# load each season results (attendance), prev season stats, and season arenas
def _season_label(end_year: int) -> str:
    '''
    season ending in end_year, written the way the data files are named
    e.g. 2024 --> '2023-24'
    '''
    return f'{end_year - 1}-{str(end_year)[2:]}'


def load_and_store_df(year: int):
    '''
    build the combined feature df for the season that ends in [year]
    reads three csv files (paths relative to the working directory):
        results and arenas of that season, statistics of the season before it
    cleans results and statistics (clean_att, clean_stat), then merges everything with season_data
    returns combined df
    '''

    current = _season_label(year)
    previous = _season_label(year - 1)

    results_raw = pd.read_csv(f'season results/{current} season results.csv')
    prev_stats_raw = pd.read_csv(f'season stats csv/{previous} season stats.csv')
    arenas = pd.read_csv(f'season arenas/{current} season arenas.csv')

    # preprocess, then combine
    return season_data(clean_att(results_raw), clean_stat(prev_stats_raw), arenas)

In [30]:
# combine df for each season into one giant df
# each entry is the year a season ends in (2024 --> 2023-24 season)
# NOTE(review): 2020 and 2021 are not listed -- presumably left out on purpose, confirm
years = [2024, 2023, 2022, 2019, 2018, 2017, 2016, 2015, 2014]

# build every season first and concatenate once, instead of re-copying a growing
# frame (that starts out empty) on every loop iteration
df_vars_total = pd.concat(
    [load_and_store_df(year) for year in years],
    ignore_index = True
)

In [31]:
# the only rows with NaN values are games involving the Kraken or the Golden Knights
# in their first season in the league (no previous season statistics to join)
df_vars_total.loc[df_vars_total.isna().any(axis = 1)]

Unnamed: 0,A,LA,H,V,HRk,HAvAge,HGP,HW,HL,HOL,...,ThDAY,FDAY,SDAY,SuDAY,DIV,CAP,LCAP,PA,LPA,LA/LC
1388,18431.0,9.821789,Vegas Golden Knights,Seattle Kraken,1.0,29.2,56.0,40.0,14.0,2.0,...,0,0,0,0,1,17367.0,9.762327,1.061266,0.059462,1.006091
1441,18246.0,9.811701,Vegas Golden Knights,Seattle Kraken,1.0,29.2,56.0,40.0,14.0,2.0,...,0,0,0,0,1,17367.0,9.762327,1.050613,0.049374,1.005058
1481,19092.0,9.857025,Tampa Bay Lightning,Seattle Kraken,8.0,28.4,56.0,36.0,17.0,3.0,...,0,1,0,0,0,19092.0,9.857025,1.000000,0.000000,1.000000
1488,18963.0,9.850245,Detroit Red Wings,Seattle Kraken,27.0,29.3,56.0,19.0,27.0,10.0,...,0,0,0,0,0,19515.0,9.878939,0.971714,-0.028694,0.997095
1569,18228.0,9.810714,Pittsburgh Penguins,Seattle Kraken,7.0,28.4,56.0,37.0,16.0,3.0,...,1,0,0,0,0,18387.0,9.819399,0.991353,-0.008685,0.999116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2996,18191.0,9.808682,Vegas Golden Knights,St. Louis Blues,,,,,,,...,0,1,0,0,0,17367.0,9.762327,1.047446,0.046355,1.004748
3003,18458.0,9.823253,Vegas Golden Knights,San Jose Sharks,,,,,,,...,0,0,1,0,1,17367.0,9.762327,1.062820,0.060926,1.006241
3016,18865.0,9.845064,Vancouver Canucks,Vegas Golden Knights,29.0,28.8,82.0,30.0,43.0,9.0,...,0,0,0,0,1,18910.0,9.847446,0.997620,-0.002383,0.999758
3020,18347.0,9.817221,Edmonton Oilers,Vegas Golden Knights,8.0,26.7,82.0,47.0,26.0,9.0,...,1,0,0,0,1,18347.0,9.817221,1.000000,0.000000,1.000000


In [32]:
# for now: drop every game that has a missing value in any column,
# since the regressions would probably discard those rows anyway
# (no better way of handling them has been decided yet)
df_vars_total = df_vars_total.dropna(axis = 0, how = 'any')
df_vars_total

Unnamed: 0,A,LA,H,V,HRk,HAvAge,HGP,HW,HL,HOL,...,ThDAY,FDAY,SDAY,SuDAY,DIV,CAP,LCAP,PA,LPA,LA/LC
0,18411.0,9.820704,Pittsburgh Penguins,Chicago Blackhawks,20.0,31.0,82.0,40.0,31.0,11.0,...,0,0,0,0,0,18387.0,9.819399,1.001305,0.001304,1.000133
1,19092.0,9.857025,Tampa Bay Lightning,Nashville Predators,13.0,29.9,82.0,46.0,30.0,6.0,...,0,0,0,0,0,19092.0,9.857025,1.000000,0.000000,1.000000
2,18724.0,9.837561,Vegas Golden Knights,Seattle Kraken,4.0,29.1,82.0,51.0,22.0,9.0,...,0,0,0,0,1,17367.0,9.762327,1.078137,0.075234,1.007707
3,18893.0,9.846547,Carolina Hurricanes,Ottawa Senators,2.0,29.3,82.0,52.0,21.0,9.0,...,0,0,0,0,0,18700.0,9.836279,1.010321,0.010268,1.001044
4,18145.0,9.806150,Los Angeles Kings,Colorado Avalanche,10.0,28.7,82.0,47.0,25.0,10.0,...,0,0,0,0,0,18230.0,9.810824,0.995337,-0.004674,0.999524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5662,19727.0,9.889744,Philadelphia Flyers,Carolina Hurricanes,20.0,27.9,48.0,23.0,22.0,3.0,...,0,0,0,1,1,19538.0,9.880117,1.009673,0.009627,1.000974
5663,18663.0,9.834298,Pittsburgh Penguins,Ottawa Senators,2.0,29.0,48.0,36.0,12.0,0.0,...,0,0,0,1,0,18387.0,9.819399,1.015011,0.014899,1.001517
5664,18430.0,9.821735,St. Louis Blues,Detroit Red Wings,6.0,26.3,48.0,29.0,17.0,2.0,...,0,0,0,1,0,18096.0,9.803446,1.018457,0.018289,1.001866
5665,18910.0,9.847446,Vancouver Canucks,Calgary Flames,7.0,28.2,48.0,26.0,15.0,7.0,...,0,0,0,1,1,18910.0,9.847446,1.000000,0.000000,1.000000


In [33]:
# write the combined feature table to one csv file
# only a bare file name is used, so the file is written relative to the working directory
file_name = 'attendance data with features.csv'
file_path = file_name
df_vars_total.to_csv(file_path, index = False)