The 2020-2021 NHL season was played mostly without fans in attendance. As a viewer, I couldn't help but notice some changes in the way the game was played. The following code is an analysis of NHL records to determine if fan attendace helps the home team win.

In [1]:
import pandas as pd
import requests
import json
import numpy
import scipy.stats as stats

In [4]:
statSchedule1516 = pd.read_csv('./gameDataBySeason/NHL20152016.csv')
statSchedule1617 = pd.read_csv('./gameDataBySeason/NHL20162017.csv')
statSchedule1718 = pd.read_csv('./gameDataBySeason/NHL20172018.csv')
statSchedule1819 = pd.read_csv('./gameDataBySeason/NHL20182019.csv')
statSchedule1920 = pd.read_csv('./gameDataBySeason/NHL20192020.csv')
statSchedule2021 = pd.read_csv('./gameDataBySeason/NHL20202021.csv')

In [5]:
statSchedule1516.head()

Unnamed: 0,id,homeTeam,awayTeam,homeGoals,homePIM,homeShots,homePP%,homePPG,homePowerPlays,homeFaceoff%,...,awayShortHandedSaves,awayEvenStrengthSaves,awayShortHandedSA,awayEvenStrengthSA,awayPPSA,awaySavePercentage,awayPPSavePercentage,awayEvenStrengthSavePercentage,awayShortHandedSavePercentage,OT
0,2015020001,Toronto Maple Leafs,Montreal Canadians,1,4,37,33.3,1.0,3.0,53.1,...,0,33,0,33,4,0.972973,1.0,1.0,0.75,False
1,2015020002,Chicago Blackhawks,New York Rangers,2,2,34,0.0,0.0,1.0,52.6,...,0,29,0,31,3,0.941176,1.0,0.935484,1.0,False
2,2015020003,Calgary Flames,Vancouver Canucks,1,38,30,0.0,0.0,2.0,52.2,...,0,27,0,28,2,0.966667,1.0,0.964286,1.0,False
3,2015020004,Los Angeles Kings,San Jose Sharks,1,36,20,0.0,0.0,6.0,51.4,...,2,11,2,12,6,0.95,1.0,0.916667,1.0,False
4,2015020005,Boston Bruins,Winnipeg Jets,2,2,31,0.0,0.0,2.0,49.2,...,0,23,0,25,6,0.935484,1.0,0.92,1.0,False


In [6]:
def initializeEmptyTeamWinCount():
    teamsData = requests.get('https://statsapi.web.nhl.com/api/v1/teams')
    teamsData = json.loads(teamsData.text)
    teamsData = teamsData["teams"]

    teamsDict = dict()

    for team in teamsData:
        # french accents cause issues with the csv file so changing the name to english
        if team['id'] == 8:
            teamsDict['Montreal Canadians'] = {'homeWins': 0, 'awayWins': 0, 'homeGamesPlayed': 0, 'awayGamesPlayed': 0}
        else:
            teamsDict[team['name']] = {'homeWins': 0, 'awayWins': 0, 'homeGamesPlayed': 0, 'awayGamesPlayed': 0}
            
    return teamsDict

In [7]:
def countHomeAndAwayWins(schedule):
    
    winCountList = initializeEmptyTeamWinCount()
    
    for index, row in schedule.iterrows():
        homeTeam = row['homeTeam']
        awayTeam = row['awayTeam']
        homeGoals = row['homeGoals']
        awayGoals = row['awayGoals']
        
        if homeGoals > awayGoals:
            winCountList[homeTeam]['homeWins'] += 1
        else:
            winCountList[awayTeam]['awayWins'] += 1
            
        winCountList[homeTeam]['homeGamesPlayed'] += 1
        winCountList[awayTeam]['awayGamesPlayed'] += 1
        
    return winCountList

In [8]:
# fill in team records based on data from the csv files
records1516 = countHomeAndAwayWins(statSchedule1516)
records1617 = countHomeAndAwayWins(statSchedule1617)
records1718 = countHomeAndAwayWins(statSchedule1718)
records1819 = countHomeAndAwayWins(statSchedule1819)
records1920 = countHomeAndAwayWins(statSchedule1920)
records2021 = countHomeAndAwayWins(statSchedule2021)

diffs1516 = []
for team in records1516:
    if records1516[team]['homeGamesPlayed'] != 0:
        homeAwayWinPercentageDifferential = (records1516[team]['homeWins'] / records1516[team]['homeGamesPlayed']) - (records1516[team]['awayWins'] / records1516[team]['awayGamesPlayed'])
        diffs1516.append(homeAwayWinPercentageDifferential)
avgDiff1516 = numpy.mean(diffs1516) * 100

diffs1617 = []
for team in records1617:
    if records1617[team]['homeGamesPlayed'] != 0:
        homeAwayWinPercentageDifferential = (records1617[team]['homeWins'] / records1617[team]['homeGamesPlayed']) - (records1617[team]['awayWins'] / records1617[team]['awayGamesPlayed'])
        diffs1617.append(homeAwayWinPercentageDifferential)
avgDiff1617 = numpy.mean(diffs1617) * 100

diffs1718 = []
for team in records1718:
    if records1718[team]['homeGamesPlayed'] != 0:
        homeAwayWinPercentageDifferential = (records1718[team]['homeWins'] / records1718[team]['homeGamesPlayed']) - (records1718[team]['awayWins'] / records1718[team]['awayGamesPlayed'])
        diffs1718.append(homeAwayWinPercentageDifferential)
avgDiff1718 = numpy.mean(diffs1718) * 100

diffs1819 = []
for team in records1819:
    if records1819[team]['homeGamesPlayed'] != 0:
        homeAwayWinPercentageDifferential = (records1819[team]['homeWins'] / records1819[team]['homeGamesPlayed']) - (records1819[team]['awayWins'] / records1819[team]['awayGamesPlayed'])
        diffs1819.append(homeAwayWinPercentageDifferential)
avgDiff1819 = numpy.mean(diffs1819) * 100

diffs1920 = []
for team in records1920:
    if records1920[team]['homeGamesPlayed'] != 0:
        homeAwayWinPercentageDifferential = (records1920[team]['homeWins'] / records1920[team]['homeGamesPlayed']) - (records1920[team]['awayWins'] / records1920[team]['awayGamesPlayed'])
        diffs1920.append(homeAwayWinPercentageDifferential)
avgDiff1920 = numpy.mean(diffs1920) * 100

diffs2021 = []
for team in records2021:
    if records2021[team]['homeGamesPlayed'] != 0:
        homeAwayWinPercentageDifferential = (records2021[team]['homeWins'] / records2021[team]['homeGamesPlayed']) - (records2021[team]['awayWins'] / records2021[team]['awayGamesPlayed'])
        diffs2021.append(homeAwayWinPercentageDifferential)
avgDiff2021 = numpy.mean(diffs2021) * 100

print('The average difference between home win percentage and away win percentage in the 2015-16 season: ' + str(avgDiff1516) + '%')
print('The average difference between home win percentage and away win percentage in the 2016-17 season: ' + str(avgDiff1617) + '%')
print('The average difference between home win percentage and away win percentage in the 2017-18 season: ' + str(avgDiff1718) + '%')
print('The average difference between home win percentage and away win percentage in the 2018-19 season: ' + str(avgDiff1819) + '%')
print('The average difference between home win percentage and away win percentage in the 2019-20 season: ' + str(avgDiff1920) + '%')
print('The average difference between home win percentage and away win percentage in the 2020-21 season: ' + str(avgDiff2021) + '%')

winPercentageDifferentials = diffs1516 + diffs1617
winPercentageDifferentials += diffs1718
winPercentageDifferentials += diffs1819
winPercentageDifferentials += diffs1920
winPercentageDifferentials = pd.Series(winPercentageDifferentials)

#avgWinPercentageDifferentials = pd.Series([avgDiff1516, avgDiff1617, avgDiff1718, avgDiff1819, avgDiff1920, avgDiff2021])

winPercentageDifferentials2021 = pd.Series(diffs2021)

# test stat explained in the following markdown cell 
stats.ttest_1samp(winPercentageDifferentials2021, numpy.mean(winPercentageDifferentials))

The average difference between home win percentage and away win percentage in the 2015-16 season: -3.7398373983739845%
The average difference between home win percentage and away win percentage in the 2016-17 season: 3.4146341463414642%
The average difference between home win percentage and away win percentage in the 2017-18 season: 2.281667977970103%
The average difference between home win percentage and away win percentage in the 2018-19 season: 0.23603461841070097%
The average difference between home win percentage and away win percentage in the 2019-20 season: 0.003798618599377117%
The average difference between home win percentage and away win percentage in the 2020-21 season: -0.46082949308755666%


Ttest_1sampResult(statistic=-0.36982230599608534, pvalue=0.7141128581244283)

The test statistic tests whether the mean of the home and away win-percentage differentials for the 2020-2021 season is equal to the mean of the home and away win-percentage differentials from the 2015-2016 season to the 2019-2020 season. Since the test statistic was less than the p-value, we would reject the null hypothesis (the means are equal) and say that they are not equal. However, the p-value (0.714) is very large which means the evidence against the null hyptohesis is very weak. For this reason, we fail to reject the null hypothesis and say that the mean of the home and away win-percentage differentials for the 2020-2021 season is equal to the mean of the home and away win-percentage differentials from the 2015-2016 season to the 2019-2020 season.

In [9]:
winPercentageDifferentials.describe()

count    153.000000
mean       0.004471
std        0.102660
min       -0.292683
25%       -0.048780
50%        0.000000
75%        0.073171
max        0.268293
dtype: float64

The mean of the home and away win-percentage differentials from the 2015-2016 season to the 2019-2020 season was 0.4471% with a standard deviation of 10.266%. This number is slighty larger than zero which indicates a very slight home-rink advantage in the average NHL season.

Since some American NHL teams did have a small number of fans in attendance for some of the 2020-2021 season, the experiment will be repeated comparing the win-percentage differentials of just the Canadian teams in the 2020-2021 seaoson to the win-percentage differentials of the previous 5 seasons.

In [10]:
canadaDiffs2021 = []

for team in records2021:
    if team == 'Vancouver Canucks' or team == 'Edmonton Oilers' or team == 'Calgary Flames' or team == 'Winnipeg Jets' or team == 'Toronto Maple Leafs' or team == 'Ottawa Senators' or team == 'Montreal Canadians':
        homeAwayWinPercentageDifferential = (records2021[team]['homeWins'] / records2021[team]['homeGamesPlayed']) - (records2021[team]['awayWins'] / records2021[team]['awayGamesPlayed'])
        canadaDiffs2021.append(homeAwayWinPercentageDifferential)
canadaAvgDiff2021 = numpy.mean(canadaDiffs2021) * 100
print('The average difference between home win percentage and away win percentage of Canadian teams in the 2020-21 season: ' + str(avgDiff1920) + '%')

canadaWinPercentageDifferentials2021 = pd.Series(canadaDiffs2021)
stats.ttest_1samp(canadaWinPercentageDifferentials2021, numpy.mean(winPercentageDifferentials))

The average difference between home win percentage and away win percentage of Canadian teams in the 2020-21 season: 0.003798618599377117%


Ttest_1sampResult(statistic=-0.10304283179572762, pvalue=0.9212867195439313)

Again, the p-value is too large to reject the null hypothesis (the means are equal), therefore we accept the null hypothesis and say that the mean home and away win-percentage differential from the 2015-2016 season to the 2019-2020 season is equal to the mean home and away win-percentage differential of the Canadian teams in the 2020-2021 season. Since the mean win-percentage differential was close to 0 in every season that was considered, this is not surprising. This leads us to the conclusion that fan attendance does not have any significant effect on an NHL team's ability to win hockey games.