In [2]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pulp

In [3]:
# teams dictionairy
teams_dict = {
    'Anaheim Ducks': 0,
    'Boston Bruins': 1,
    'Buffalo Sabres': 2,
    'Calgary Flames': 3,
    'Carolina Hurricanes': 4,
    'Chicago Blackhawks': 5,
    'Colorado Avalanche': 6,
    'Columbus Blue Jackets': 7,
    'Dallas Stars': 8,
    'Detroit Red Wings': 9,
    'Edmonton Oilers': 10,
    'Florida Panthers': 11,
    'Los Angeles Kings': 12,
    'Minnesota Wild': 13,
    'Montreal Canadiens': 14,
    'Nashville Predators': 15,
    'New Jersey Devils': 16,
    'New York Islanders': 17,
    'New York Rangers': 18,
    'Ottawa Senators': 19,
    'Philadelphia Flyers': 20,
    'Pittsburgh Penguins': 21,
    'San Jose Sharks': 22,
    'Seattle Kraken': 23,
    'St. Louis Blues': 24,
    'Tampa Bay Lightning': 25,
    'Toronto Maple Leafs': 26,
    'Utah Hockey Club': 27,  # used to be Arizona Coyotes
    'Vancouver Canucks': 28,
    'Vegas Golden Knights': 29,
    'Washington Capitals': 30,
    'Winnipeg Jets': 31
}

teams_dict_inv = {v: k for k, v in teams_dict.items()}

In [4]:
# 2024-2025
# https://www.hockey-reference.com/leagues/NHL_2025_games.html

# url of webpage
url = "https://www.hockey-reference.com/leagues/NHL_2025_games.html"

# get tables from webpage
tables = pd.read_html(url)

# first table has regular season data
games = tables[0]

# rename columns
games = games.rename(columns = {
    'G': 'G_Visitor',
    'G.1': 'G_Home',
    'Unnamed: 6': 'OT_SO',
    'Att.': 'Attendance'
})

# convert date entries to datetime
games['Date'] = pd.to_datetime(games['Date'])

# add weekday column
games['Day'] = games['Date'].dt.day_name()

# convert time enrties to datetime
games['Time'] = pd.to_datetime(games['Time'], format = '%I:%M %p').dt.time

# for some reason the data from hockey-reference.com has an empty row
# no games were supposed to be on this day so we can just remove the row and re-index
row = games.iloc[891]
if pd.isna(row['Home']):
    games = games.drop(index = 891).reset_index(drop = True)
    
games.head()

Unnamed: 0,Date,Time,Visitor,G_Visitor,Home,G_Home,OT_SO,Attendance,LOG,Notes,Day
0,2024-10-04,13:00:00,New Jersey Devils,4.0,Buffalo Sabres,1.0,,16913.0,2:27,"at O2 Arena (Prague, CZ)",Friday
1,2024-10-05,10:00:00,Buffalo Sabres,1.0,New Jersey Devils,3.0,,16722.0,2:26,"at O2 Arena (Prague, CZ)",Saturday
2,2024-10-08,19:00:00,Boston Bruins,4.0,Florida Panthers,6.0,,19813.0,2:39,,Tuesday
3,2024-10-08,16:30:00,St. Louis Blues,3.0,Seattle Kraken,2.0,,17151.0,2:37,,Tuesday
4,2024-10-08,22:00:00,Chicago Blackhawks,2.0,Utah Hockey Club,5.0,,11131.0,2:30,,Tuesday


In [5]:
# game scheduling matrix

# pivot table to count games between teams
matchups = games.pivot_table(
    index = 'Visitor', 
    columns = 'Home', 
    values = 'Date',
    aggfunc = 'count', 
    fill_value = 0)         # a team cannot play itself

# reset column and index names
matchups.index.name = 'Away Team'
matchups.columns.name = 'Home Team'

matchups

Home Team,Anaheim Ducks,Boston Bruins,Buffalo Sabres,Calgary Flames,Carolina Hurricanes,Chicago Blackhawks,Colorado Avalanche,Columbus Blue Jackets,Dallas Stars,Detroit Red Wings,...,San Jose Sharks,Seattle Kraken,St. Louis Blues,Tampa Bay Lightning,Toronto Maple Leafs,Utah Hockey Club,Vancouver Canucks,Vegas Golden Knights,Washington Capitals,Winnipeg Jets
Away Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anaheim Ducks,0,1,1,2,1,1,1,1,2,1,...,1,2,2,1,1,2,2,2,1,2
Boston Bruins,1,0,2,1,2,1,1,1,1,2,...,1,1,1,2,2,1,1,1,1,1
Buffalo Sabres,1,2,0,1,1,1,1,2,1,2,...,1,1,1,2,1,1,1,1,2,1
Calgary Flames,2,1,1,0,1,1,1,1,2,1,...,2,2,2,1,1,2,2,2,1,2
Carolina Hurricanes,1,1,2,1,0,1,1,2,1,2,...,1,1,1,1,1,1,1,1,2,1
Chicago Blackhawks,2,1,1,2,1,0,2,1,2,1,...,2,1,2,1,1,2,2,1,1,1
Colorado Avalanche,2,1,1,2,1,2,0,1,1,1,...,2,1,2,1,1,2,2,1,1,2
Columbus Blue Jackets,1,2,1,1,2,1,1,0,1,1,...,1,1,1,2,2,1,1,1,2,1
Dallas Stars,1,1,1,1,1,2,2,1,0,1,...,1,2,1,1,1,2,1,2,1,2
Detroit Red Wings,1,1,2,1,1,1,1,2,1,0,...,1,1,1,2,2,1,1,1,2,1


In [6]:
# standings 2023-2024 season
# https://www.hockey-reference.com/leagues/NHL_2024_standings.html

# url of webpage
url = "https://www.hockey-reference.com/leagues/NHL_2024_standings.html"

# get tables from webpage
tables = pd.read_html(url)

# third table has regular season final standings
standings = tables[2]

# rename columns
standings = standings.rename(columns={
    'Unnamed: 1': 'Team',
})

# rename Arizona Coyotes to Utah Hockey Club
standings.loc[standings['Team'] == 'Arizona Coyotes', 'Team'] = 'Utah Hockey Club'

# reindex based on rank
standings = standings.set_index('Rk')

standings.head()

Unnamed: 0_level_0,Team,Overall,Shootout,Overtime,Home,Road,EAS,WES,ATL,MET,...,PAC,≤1,≥3,Oct,Nov,Dec,Jan,Feb,Mar,Apr
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,New York Rangers,55-23-4,4-3,8-1,30-11-0,25-12-4,34-14-2,21-9-2,16-6-2,18-8-0,...,10-5-1,23-4-4,24-14-0,7-2-0,9-2-1,9-5-0,5-7-2,10-1-0,10-3-1,5-3-0
2,Dallas Stars,52-21-9,4-2,8-7,26-11-4,26-10-5,18-11-3,34-10-6,7-8-1,11-3-2,...,17-3-4,23-8-9,21-11-0,5-1-1,8-4-2,9-4-1,8-4-2,6-4-3,11-2-0,5-2-0
3,Carolina Hurricanes,52-23-7,2-5,6-2,27-10-4,25-13-3,35-10-5,17-13-2,19-4-1,16-6-4,...,9-7-0,17-8-7,25-11-0,6-4-0,7-4-1,7-5-3,8-2-1,8-3-1,11-3-1,5-2-0
4,Florida Panthers,52-24-6,3-2,7-4,26-13-2,26-11-4,33-12-5,19-12-1,17-5-4,16-7-1,...,10-5-1,19-7-6,22-8-0,4-3-1,10-4-1,8-5-0,9-2-2,9-2-0,7-6-1,5-2-1
5,Winnipeg Jets,52-24-6,0-1,6-5,27-11-3,25-13-3,19-9-4,33-15-2,10-3-3,9-6-1,...,13-10-1,15-5-6,21-12-0,4-3-2,8-5-0,10-1-2,8-3-1,7-4-0,7-8-1,8-0-0


In [7]:
# attempt at scheduling with a made up utility function
# i just wanna see how this goes basically
# im calling this ~proof of concept~

# use ranking from last years season to determine how much of an impact the visiting team will have
# so 1st place == most impact
def rank(team_idx):
    team = teams_dict_inv[team_idx]
    return standings[standings['Team'] == team].index.values[0]

# inverse rank since we are maximizing (so 1st place == 32)
# imma just keep this linear for now and see what happens
def inv_rank(team_idx):
    team = teams_dict_inv[team_idx]
    return 33 - standings[standings['Team'] == team].index.values[0]

In [8]:
# H = matrix of home games for teams on certain days
# Hjt = home game for team j on day t
# from current schedule

# we will just number all days that currently have a game 0-N (idk how many days there are yet)
# j = number of team (from team_dict)

# number all unique days with games
unique_days = games['Date'].drop_duplicates().reset_index(drop = True)

# dictionary for game day to number (for ease)
day_to_id = {day: idx for idx, day in enumerate(unique_days)}

# inverse dictionary for later
id_to_day = {v: k for k, v in day_to_id.items()}

# create empty 32 x N matrix
num_teams = len(teams_dict)
num_days = len(unique_days)
H = np.zeros((num_teams, num_days), dtype = int)

# populate H matrix
for _, game in games.iterrows():
    home_team_id = teams_dict[game['Home']]
    day_id = day_to_id[game['Date']]
    H[home_team_id, day_id] = 1

# convert to df for better visualization (might not do later)
H_df = pd.DataFrame(H, index = teams_dict.keys(), columns = unique_days)

H_df.head()

Date,2024-10-04,2024-10-05,2024-10-08,2024-10-09,2024-10-10,2024-10-11,2024-10-12,2024-10-13,2024-10-14,2024-10-15,...,2025-04-08,2025-04-09,2025-04-10,2025-04-11,2025-04-12,2025-04-13,2025-04-14,2025-04-15,2025-04-16,2025-04-17
Anaheim Ducks,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
Boston Bruins,0,0,0,0,1,0,1,0,1,0,...,0,0,1,0,0,0,0,1,0,0
Buffalo Sabres,1,0,0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,1
Calgary Flames,0,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,1,0,1,0,0
Carolina Hurricanes,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0


In [9]:
# G = number of games team i plays against team j at team j's home
# the matchups dataframe as a matrix

G = matchups.to_numpy()

In [10]:
# use only saturday as having a major impact on game attendance
# indices where the day is saturday
sat_indices = unique_days[unique_days.dt.weekday == 5].index.tolist()

# S = 1 if day t is a saturday, 0 otherwise
S = np.zeros(num_days, dtype = int)

for idx in sat_indices:
    S[idx] = 1

In [11]:
# Xijt = team i plays team j at team j's home on day t

# we must have
# Xijt <= Hjt  (maybe don't need this one)
# sum(i) Xijt == Hjt

# sum(t) Xijt == Gij
# sum(j) Xijt == 41 (maybe don't need this one)

# create linear program (using integer programming for the binary decisions)
prob = pulp.LpProblem('scheduling', pulp.LpMaximize)

# decision variables (binary)
X = pulp.LpVariable.dicts('X', (range(num_teams), range(num_teams), range(num_days)), cat = 'Binary')

# objective function
prob += pulp.lpSum(X[i][j][t] * S[t] * inv_rank(i) for i in range(num_teams) for j in range(num_teams) for t in range(num_days))

# constraints
# sum(i) Xijt == Hjt
for j in range(num_teams):
    for t in range(num_days):
        prob += pulp.lpSum(X[i][j][t] for i in range(num_teams)) == H[j, t]

# sum(t) Xijt == Gij
for i in range(num_teams):
    for j in range(num_teams):
        prob += pulp.lpSum(X[i][j][t] for t in range(num_days)) == G[i, j]

# solve problem
prob.solve()

objective_value = prob.objective.value()

# Print the objective value
print(f'objective function value: {objective_value}')

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /opt/anaconda3/lib/python3.12/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/z6/55zl8z_1365b2p_8lkhq6tzr0000gn/T/678ca1e11e2d4209b326bbeb15b925cc-pulp.mps -max -timeMode elapsed -branch -printingOptions all -solution /var/folders/z6/55zl8z_1365b2p_8lkhq6tzr0000gn/T/678ca1e11e2d4209b326bbeb15b925cc-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 6725 COLUMNS
At line 763462 RHS
At line 770183 BOUNDS
At line 952456 ENDATA
Problem MODEL has 6720 rows, 182272 columns and 364544 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 9178 - 0.57 seconds
Cgl0002I 141600 variables fixed
Cgl0004I processed model has 2304 rows, 40672 columns (40672 integer (40672 of which binary)) and 81344 elements
Cutoff increment increased from 1e-05 to 0.9999
Cbc0038I Initial state - 0 integers unsatisfied sum 

In [12]:
# put results in df
data = []

# loop through all decision variables
for i in range(num_teams):
    for j in range(num_teams):
        for t in range(num_days):
            data.append({
                'Away Team': teams_dict_inv[i],
                'Home Team': teams_dict_inv[j],
                'Day': id_to_day[t],
                'Game Scheduled': X[i][j][t].varValue
            })

# df with results
df_results = pd.DataFrame(data)

# show only scheduled games
df_results_scheduled = df_results[df_results['Game Scheduled'] == 1]
df_results_scheduled = df_results_scheduled.sort_values(by = 'Day').reset_index()

df_results_scheduled.head(10)

Unnamed: 0,index,Away Team,Home Team,Day,Game Scheduled
0,51620,Detroit Red Wings,Buffalo Sabres,2024-10-04,1.0
1,48417,Dallas Stars,New Jersey Devils,2024-10-05,1.0
2,147208,Tampa Bay Lightning,Utah Hockey Club,2024-10-08,1.0
3,121576,Pittsburgh Penguins,Florida Panthers,2024-10-08,1.0
4,43968,Columbus Blue Jackets,Seattle Kraken,2024-10-08,1.0
5,136173,Seattle Kraken,Vegas Golden Knights,2024-10-09,1.0
6,22075,Calgary Flames,Vancouver Canucks,2024-10-09,1.0
7,146141,Tampa Bay Lightning,Pittsburgh Penguins,2024-10-09,1.0
8,132791,Seattle Kraken,Edmonton Oilers,2024-10-09,1.0
9,122111,Pittsburgh Penguins,Montreal Canadiens,2024-10-09,1.0


In [13]:
# compare to first few games in the current schedule
# home teams should be the same
games.head(10)

Unnamed: 0,Date,Time,Visitor,G_Visitor,Home,G_Home,OT_SO,Attendance,LOG,Notes,Day
0,2024-10-04,13:00:00,New Jersey Devils,4.0,Buffalo Sabres,1.0,,16913.0,2:27,"at O2 Arena (Prague, CZ)",Friday
1,2024-10-05,10:00:00,Buffalo Sabres,1.0,New Jersey Devils,3.0,,16722.0,2:26,"at O2 Arena (Prague, CZ)",Saturday
2,2024-10-08,19:00:00,Boston Bruins,4.0,Florida Panthers,6.0,,19813.0,2:39,,Tuesday
3,2024-10-08,16:30:00,St. Louis Blues,3.0,Seattle Kraken,2.0,,17151.0,2:37,,Tuesday
4,2024-10-08,22:00:00,Chicago Blackhawks,2.0,Utah Hockey Club,5.0,,11131.0,2:30,,Tuesday
5,2024-10-09,22:00:00,Winnipeg Jets,6.0,Edmonton Oilers,0.0,,18347.0,2:18,,Wednesday
6,2024-10-09,19:00:00,Toronto Maple Leafs,0.0,Montreal Canadiens,1.0,,21105.0,2:18,,Wednesday
7,2024-10-09,19:30:00,New York Rangers,6.0,Pittsburgh Penguins,0.0,,18190.0,2:29,,Wednesday
8,2024-10-09,22:00:00,Calgary Flames,6.0,Vancouver Canucks,5.0,OT,18850.0,2:44,,Wednesday
9,2024-10-09,22:00:00,Colorado Avalanche,4.0,Vegas Golden Knights,8.0,,18388.0,2:27,,Wednesday


In [14]:
# find value of current system according to objective function used in optimization (using games)
# sum(i) sum(j) sum(t) Xijt * St * rank(i)

# current schedule value
current_val = 0

# iterate over each game in current schedule
for _, game in games.iterrows():
    # home and away teams and date for each game
    home_team = game['Home']
    away_team = game['Visitor']
    game_date = game['Date']
    
    # day index
    day_id = day_to_id[game_date]
    
    # index of home and away teams
    home_team_id = teams_dict[home_team]
    away_team_id = teams_dict[away_team]
    
    # compute objective function value
    current_val += S[day_id] * inv_rank(home_team_id)

# print
print(f'current schedule value: {current_val}')

current schedule value: 5455
