In [44]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
import re
import matplotlib.pyplot as plt


In [45]:
import warnings
# Suppress every warning category for the rest of the session. This is a blanket
# filter: it also hides library warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [46]:
def ahead_team(n_row,goals_all):
    """Report which side leads just before the event numbered ``n_row``.

    Only goals whose ``n_row`` is strictly smaller than the given one are
    counted. A goal is credited to the away side when the first
    space-separated token of its ``description`` equals ``away_team``;
    every other goal is credited to the home side.

    Returns 'away', 'home', or 'none' when the score is level.
    """
    earlier_goals = goals_all[goals_all['n_row'] < n_row]

    # Count the away goals; whatever is left belongs to the home side.
    away_goals = sum(
        1
        for _, goal in earlier_goals.iterrows()
        if goal['description'].split(' ')[0] == goal['away_team']
    )
    home_goals = len(earlier_goals) - away_goals

    if away_goals == home_goals:
        return 'none'
    return 'away' if away_goals > home_goals else 'home'

In [47]:
def get_goalie_pulls(df, window_sec=360):
    """Find the first late third-period goalie pull in one game and its outcome.

    An event row counts as "goalie out" when at least one team's line string
    does not contain 'G', the period is 3, and ``minute*60 + second`` is below
    ``window_sec`` (the original comment describes that value as seconds to
    the end of the game). Rows in which the team currently ahead is the one
    without a goalie are treated as delayed penalties and discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table of a single game. Columns read here: ``minute``,
        ``second``, ``n_row``, ``away_line``, ``home_line``, ``event``,
        ``period``, ``description``, ``away_team``, ``home_team``.
        The frame is NOT modified; all work happens on a private copy.
    window_sec : int, default 360
        Upper bound (exclusive) on ``total_sec`` for a row to be considered.

    Returns
    -------
    list or None
        ``[total_sec, success]`` where ``total_sec`` belongs to the first
        remaining goalie-out row and ``success`` describes the first goal
        found among those rows: ``-1`` if it was scored by the team that
        still had its goalie, ``1`` for any other goal, ``0`` if there was
        no goal. ``None`` when no qualifying goalie-out row exists.
    """
    # Work on a copy so the caller's frame keeps its original columns/values.
    events = df.copy()

    events['total_sec'] = events['minute'].astype(int) * 60 + events['second'].astype(int)
    events['n_row'] = events['n_row'].apply(int)

    # Missing line cells are filled with 'G' so they are not mistaken for empty nets.
    for line_col in ('away_line', 'home_line'):
        events[line_col] = events[line_col].fillna('G')

    # All goals of the game; ahead_team() uses them to find the leader at a row.
    goals_all = events[events['event'] == 'GOAL']

    away_has_goalie = events['away_line'].str.contains('G', regex=False)
    home_has_goalie = events['home_line'].str.contains('G', regex=False)
    in_window = (events['total_sec'] < window_sec) & (events['period'] == 3)

    # `~` is the boolean NOT for masks; .copy() makes the later column
    # assignment act on an independent frame rather than on a slice.
    goalie_out = events.loc[(~away_has_goalie | ~home_has_goalie) & in_window].copy()
    if goalie_out.empty:
        return None

    # Who is leading at each goalie-out event.
    goalie_out['winning'] = goalie_out['n_row'].apply(lambda x: ahead_team(x, goals_all))

    # Delayed penalty: the side that still has its goalie is the one trailing,
    # i.e. the leading side is the one playing without a goalie.
    away_in_net = goalie_out['away_line'].str.contains('G', regex=False)
    home_in_net = goalie_out['home_line'].str.contains('G', regex=False)
    delayed_penalty = ((home_in_net & (goalie_out['winning'] == 'away'))
                       | (away_in_net & (goalie_out['winning'] == 'home')))
    goalie_out = goalie_out[~delayed_penalty]
    if goalie_out.empty:
        return None

    goalie_out_sec = goalie_out['total_sec'].values[0]

    success = 0
    goals_during_pull = goalie_out[goalie_out['event'] == 'GOAL']
    if len(goals_during_pull):
        # Only the first goal among the goalie-out rows decides the outcome.
        first_goal = goals_during_pull.iloc[0]
        scorer = first_goal['description'].split(' ')[0]
        away_scored_with_goalie = ('G' in first_goal['away_line']
                                   and scorer == first_goal['away_team'])
        home_scored_with_goalie = ('G' in first_goal['home_line']
                                   and scorer == first_goal['home_team'])
        success = -1 if (away_scored_with_goalie or home_scored_with_goalie) else 1

    return [goalie_out_sec, success]

In [48]:
%%time
pull_time=[]
success=[]
game=[]
goal_number=[]

# sorted() gives a reproducible processing (and therefore output) order;
# os.listdir() alone returns entries in arbitrary order.
for file in sorted(os.listdir('./data/game/')):

    game_data = pd.read_csv('./data/game/'+file)

    # Run the expensive detection once per game and reuse its result
    # instead of calling it a second time just to unpack the values.
    pull_result = get_goalie_pulls(game_data)
    if pull_result is not None:
        game_pull_time,game_success=pull_result

        pull_time.append(game_pull_time)
        success.append(game_success)
        game.append(file)

    #keep number of goals outside the last 6 minutes in the third period
    goal_number.append(len(game_data[(game_data['event']=='GOAL')
                                   &(game_data['period']==3)&(game_data['minute']>5)]))


Wall time: 8min 20s


In [50]:
#save the processed data
processed_path = './data/processed/goalie_pull_data_2011_2021.csv'
# Create the output folder if it is missing so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
pd.DataFrame({'game': game, 'total_sec': pull_time, 'success': success}).to_csv(processed_path, index=False)


In [51]:
# Goals per second (both teams combined) in the 14 minutes (14*60 s) of the third
# period outside the final six, averaged over the games actually processed.
# len(goal_number) is used rather than re-listing the directory so the
# denominator always matches the games that contributed to the numerator.
print('Probablity of normal goal per second for two teams was',
      round(np.sum(goal_number)/(14*60)/len(goal_number),5))

Probablity of normal goal per second for two teams was 0.00141
