In [1]:
# import standard modules
import pandas as pd
import numpy as np
import re
from datetime import datetime

# import web modules
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# import ML modules
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix, accuracy_score

In [2]:
# ignore warnings (for deprecation warnings)
# NOTE(review): no `category` is passed, so this filter silences every warning
# category for the rest of the session, not only deprecation warnings.
import warnings
warnings.filterwarnings('ignore')

## Prepare Game Results

In [3]:
# import '21-'22 season game schedule
# (path is relative to the notebook's working directory; the following cells
# expect the columns Date, Home, Visitor, Score and Score.1 in this file)
df_games_season = pd.read_csv('../data/games/21-22_season.csv')

In [4]:
# add 'season' and 'home_or_away' columns so the frame has the same columns as
# the historical data; every row is 'HOME' because team 1 is always the home
# team in this season file
df_games_season = df_games_season.assign(season=2021, home_or_away='HOME')

In [5]:
# rename columns to conform to historical data
historical_column_names = {
    "Date": "gameDate",
    "Home": "playerTeam",
    "Visitor": "opposingTeam",
    "Score.1": "goalsFor",
    "Score": "goalsAgainst",
}
game_results = df_games_season.rename(columns=historical_column_names)

In [6]:
# define function to create column of game results
# 'team 1' when goalsFor is greater than goalsAgainst
# 'team 2' when goalsFor is less than goalsAgainst
# None when neither comparison holds (equal or missing scores)
def game_result_label_race(row):
    """Label one game row with the side that scored more goals.

    Parameters
    ----------
    row : mapping with 'goalsFor' and 'goalsAgainst' entries.

    Returns
    -------
    'team 1', 'team 2', or None when the scores are equal or not comparable
    (NaN compares False both ways, so rows without scores yield None).
    """
    goals_for, goals_against = row['goalsFor'], row['goalsAgainst']
    if goals_for > goals_against:
        return 'team 1'
    elif goals_for < goals_against:
        return 'team 2'
    return None

# define functions that encode the home team for t1 and t2
# each returns 1 when its own team is the home team and 0 when it is not
def home_team_t1_label_race(row):
    """Home flag for team 1: 1 if row['home_or_away'] is 'HOME', 0 if 'AWAY'.

    Any other value yields None.
    """
    return {'HOME': 1, 'AWAY': 0}.get(row['home_or_away'])
    
    
def home_team_t2_label_race(row):
    """Home flag for team 2: 0 if row['home_or_away'] is 'HOME', 1 if 'AWAY'.

    Any other value yields None.
    """
    return {'HOME': 0, 'AWAY': 1}.get(row['home_or_away'])

In [7]:
# create placeholder columns for the encoded home/away flags
for placeholder in ('home_or_away_t1', 'home_or_away_t2'):
    game_results[placeholder] = np.nan

# label every game row with its result and fill in both home/away flags
game_results['result'] = game_results.apply(game_result_label_race, axis=1)
game_results['home_or_away_t1'] = game_results.apply(home_team_t1_label_race, axis=1)
game_results['home_or_away_t2'] = game_results.apply(home_team_t2_label_race, axis=1)

In [8]:
# rename the team columns, keep only the modelling columns (this also discards
# goalsFor, goalsAgainst and home_or_away), and order rows chronologically.
# The date is converted to a pandas datetime *before* sorting so the order does
# not depend on how the raw date strings happen to compare as text.
game_results = (
    game_results
    .rename(columns={'playerTeam': 'team1', 'opposingTeam': 'team2'})
    .loc[:, ['gameDate', 'season', 'team1', 'team2', 'result', 'home_or_away_t1', 'home_or_away_t2']]
    .assign(gameDate=lambda frame: pd.to_datetime(frame['gameDate']))
    .sort_values('gameDate')
    .reset_index(drop=True)
)

In [9]:
# full team name paired with its abbreviation; both lists below are derived
# from this one table so their positions always line up
team_name_table = [
    ('Anaheim Ducks', 'ANA'), ('Arizona Coyotes', 'ARI'),
    ('Boston Bruins', 'BOS'), ('Buffalo Sabres', 'BUF'),
    ('Calgary Flames', 'CGY'), ('Carolina Hurricanes', 'CAR'),
    ('Chicago Blackhawks', 'CHI'), ('Colorado Avalanche', 'COL'),
    ('Columbus Blue Jackets', 'CBJ'), ('Dallas Stars', 'DAL'),
    ('Detroit Red Wings', 'DET'), ('Edmonton Oilers', 'EDM'),
    ('Florida Panthers', 'FLA'), ('Los Angeles Kings', 'LAK'),
    ('Minnesota Wild', 'MIN'), ('Montreal Canadiens', 'MTL'),
    ('Nashville Predators', 'NSH'), ('New Jersey Devils', 'NJD'),
    ('New York Islanders', 'NYI'), ('New York Rangers', 'NYR'),
    ('Ottawa Senators', 'OTT'), ('Philadelphia Flyers', 'PHI'),
    ('Pittsburgh Penguins', 'PIT'), ('San Jose Sharks', 'SJS'),
    ('Seattle Kraken', 'SEA'), ('St Louis Blues', 'STL'),
    ('Tampa Bay Lightning', 'TBL'), ('Toronto Maple Leafs', 'TOR'),
    ('Vancouver Canucks', 'VAN'), ('Vegas Golden Knights', 'VGK'),
    ('Washington Capitals', 'WSH'), ('Winnipeg Jets', 'WPG'),
]

# create list of team names
teams = [full_name for full_name, _abv in team_name_table]

# create list of team abbreviations
teams_abv = [abv for _full_name, abv in team_name_table]

# create functions that replace team names with team abbreviations
def t1_abv_label_race(row):
    """Return the abbreviation for the team named in row['team1'].

    Walks the module-level `teams` / `teams_abv` lists in parallel (whatever
    their length, instead of assuming exactly 32 entries) and returns the
    abbreviation at the position of the first matching name, or None when no
    name matches.
    """
    return next(
        (abv for full_name, abv in zip(teams, teams_abv) if row['team1'] == full_name),
        None,
    )

        
def t2_abv_label_race(row):
    """Return the abbreviation for the team named in row['team2'].

    Walks the module-level `teams` / `teams_abv` lists in parallel (whatever
    their length, instead of assuming exactly 32 entries) and returns the
    abbreviation at the position of the first matching name, or None when no
    name matches.
    """
    return next(
        (abv for full_name, abv in zip(teams, teams_abv) if row['team2'] == full_name),
        None,
    )

In [10]:
# remove periods from team names in df
# regex=False makes '.' a literal period on every pandas version (as a regex it
# would match any character)
for team_col in ('team1', 'team2'):
    game_results[team_col] = game_results[team_col].str.replace('.', '', regex=False)

# apply abbreviation functions to team names in df
game_results['team1'] = game_results.apply(t1_abv_label_race, axis=1)
game_results['team2'] = game_results.apply(t2_abv_label_race, axis=1)

# separate already played games from unplayed games
# a row counts as "upcoming" when ANY of its columns is missing, which includes
# rows with no result label and rows whose team name had no abbreviation match
has_missing = game_results.isna().any(axis=1)
upcoming_games = game_results[has_missing]
game_results = game_results[~has_missing]

In [11]:
# today's date as a 'YYYY-MM-DD' string (local clock of the machine running the notebook)
today = datetime.today().strftime('%Y-%m-%d')
# select today's games; 'result' is dropped because these rows have no result yet
is_today = upcoming_games['gameDate'] == today
todays_games = upcoming_games.loc[is_today].drop(columns=['result']).reset_index(drop=True)

## Web

In [13]:
# line/pairing data
# query MoneyPuck for up-to-date line/pairing data
req = Request('https://moneypuck.com/moneypuck/playerData/seasonSummary/2021/regular/lines.csv')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')

# create list of tracked statistics
# (first four entries are identifiers/icetime, the remaining sixteen are the stats)
tracked_statistics1 = ['name', 'season', 'team', 'icetime',
                      'flurryScoreVenueAdjustedxGoalsFor', 'xOnGoalFor', 'reboundxGoalsFor',
                      'penaltiesAgainst', 'takeawaysFor',
                      'lowDangerxGoalsFor', 'mediumDangerxGoalsFor','highDangerxGoalsFor',
                      'flurryScoreVenueAdjustedxGoalsAgainst', 'xOnGoalAgainst', 'reboundxGoalsAgainst',
                      'penaltiesFor', 'takeawaysAgainst',
                      'lowDangerxGoalsAgainst', 'mediumDangerxGoalsAgainst','highDangerxGoalsAgainst']

# import queried csv and filter for tracked stats
# the context manager closes the HTTP response as soon as the csv is parsed;
# .copy() gives an independent frame so the column writes below are unambiguous
with urlopen(req) as content:
    df_lines = pd.read_csv(content)
df_lines = df_lines[tracked_statistics1].reset_index(drop=True).copy()

# select stats to regularize by icetime (everything after the identifier columns)
lines_reg = tracked_statistics1[4:]

# clean team names ('.' treated as a literal period, not a regex wildcard)
df_lines['team'] = df_lines['team'].str.replace('.', '', regex=False)

# regularize stats to icetime: each stat becomes "per 50000 units of icetime"
# NOTE(review): rows with icetime == 0 would produce inf/NaN here -- confirm the
# source never contains such rows
df_lines[lines_reg] = df_lines[lines_reg].divide(df_lines['icetime'], axis='index').multiply(50000, axis='index')

# stats to weigh (the weights themselves are applied later, when lineups are built)
major_stats = ['flurryScoreVenueAdjustedxGoalsFor', 'flurryScoreVenueAdjustedxGoalsAgainst', 
               'penaltiesFor', 'penaltiesAgainst',
               'takeawaysFor', 'takeawaysAgainst',
               'mediumDangerxGoalsFor', 'mediumDangerxGoalsAgainst',
               'highDangerxGoalsFor', 'highDangerxGoalsAgainst']

minor_stats = ['xOnGoalFor', 'xOnGoalAgainst', 
               'reboundxGoalsFor', 'reboundxGoalsAgainst',
               'lowDangerxGoalsFor', 'lowDangerxGoalsAgainst']

In [14]:
# skater data
# query MoneyPuck for up-to-date skater data
req = Request('https://moneypuck.com/moneypuck/playerData/seasonSummary/2021/regular/skaters.csv')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')

# create list of tracked statistics
tracked_statistics2 = ['name', 'position', 'season', 'team', 'icetime',
                      'OnIce_F_flurryScoreVenueAdjustedxGoals', 'OnIce_F_xOnGoal', 'OnIce_F_reboundxGoals',
                      'penaltiesDrawn', 'I_F_takeaways',
                      'OnIce_F_lowDangerxGoals', 'OnIce_F_mediumDangerxGoals', 'OnIce_F_highDangerxGoals', 
                      'OnIce_A_flurryScoreVenueAdjustedxGoals', 'OnIce_A_xOnGoal', 'OnIce_A_reboundxGoals',
                      'penalties', 'I_F_giveaways',
                      'OnIce_A_lowDangerxGoals', 'OnIce_A_mediumDangerxGoals', 'OnIce_A_highDangerxGoals']

# import queried csv (closing the HTTP response once parsed) and filter for 5on5 stats
# .copy() so the 'name' rewrite below acts on an independent frame, not a slice
with urlopen(req) as content:
    df_skaters = pd.read_csv(content)
df_skaters = df_skaters.loc[df_skaters['situation'] == '5on5'].copy()

# isolate last name from full name
# the LAST whitespace-separated token is used so that names with more than two
# words reduce the same way as the scraped lineup names, which also keep the
# last token
df_skaters['name'] = df_skaters['name'].str.split().str[-1]

# keep only the tracked statistics, in the order listed above
# NOTE(review): these column names differ from the line-data column names; they
# agree only by position once 'position' is removed -- confirm downstream code
# relies on position, not on labels
df_skaters = df_skaters[tracked_statistics2]

In [15]:
# goalie data
# query MoneyPuck for up-to-date goalie data
req1 = Request('https://moneypuck.com/moneypuck/playerData/seasonSummary/2021/regular/goalies.csv')
req1.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')

# query MoneyPuck for goalie data column names due to occasional uploads with no headers
req2 = Request('https://moneypuck.com/moneypuck/playerData/seasonSummary/2020/regular/goalies.csv')
req2.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')

# import queried csvs and apply the 2020 header to the up-to-date data
# both HTTP responses are closed as soon as the csvs are parsed
# (the current file is read with header=None, so if it does carry a header row
# that row becomes a data row; the situation filter below drops it again because
# its 'situation' value is not '5on5')
with urlopen(req1) as content1, urlopen(req2) as content2:
    df_goalies = pd.read_csv(content1, header=None)
    df_goalies_labels = pd.read_csv(content2)
df_goalies = pd.DataFrame(data=df_goalies.values, columns=df_goalies_labels.columns)

# create list of tracked statistics
tracked_statistics3 = ['name', 'team', 'season', 'games_played',
                      'xRebounds',  'xOnGoal', 'xPlayContinuedInZone',
                      'lowDangerxGoals', 'mediumDangerxGoals', 'highDangerxGoals',
                      'xPlayStopped', 'xPlayContinuedOutsideZone', 'xFreeze']


# stats that get scaled and weighted (everything after the identifier columns)
goalie_reg = tracked_statistics3[4:]

# isolate 5on5 goalie stats and keep only the tracked columns
# .copy() so the column writes below act on an independent frame, not a slice
df_goalies = df_goalies.loc[df_goalies['situation'] == '5on5', tracked_statistics3].copy()
# '.' treated as a literal period, not a regex wildcard
df_goalies['team'] = df_goalies['team'].str.replace('.', '', regex=False)

# scale goalie stats to [0, 1] for weighting
# NOTE(review): the stats are scaled as-is; nothing here divides by games_played
scaler = MinMaxScaler()
df_goalies[goalie_reg] = scaler.fit_transform(df_goalies[goalie_reg])

# weight stats
## weight minor stats (first six of goalie_reg, weight 1)
goalie_weighting = goalie_reg[:6]
df_goalies[goalie_weighting] = df_goalies[goalie_weighting] * 1
## weight major stats (last three of goalie_reg, weight 2)
goalie_weighting = goalie_reg[6:]
df_goalies[goalie_weighting] = df_goalies[goalie_weighting] * 2

# create goalie metric and drop old stats
# 'goalie_strength_neg' is the mean of xPlayStopped..xFreeze and enters the sum as-is;
# 'goalie_strength_pos' is the mean of xRebounds..highDangerxGoals multiplied by -0.5
df_goalies['goalie_strength_neg'] = df_goalies.loc[:,'xPlayStopped':'xFreeze'].mean(axis = 1)
df_goalies['goalie_strength_pos'] = df_goalies.loc[:,'xRebounds':'highDangerxGoals'].mean(axis = 1).mul(-.5)
df_goalies['goalie_strength'] = df_goalies['goalie_strength_neg'] + df_goalies['goalie_strength_pos']
df_goalies = df_goalies.drop(columns=goalie_reg + ['goalie_strength_neg', 'goalie_strength_pos', 'games_played'])

# apply value to 'season' column to resolve column summing issues
df_goalies['season'] = 2021

# keep only the last name
# the LAST whitespace-separated token is used so the result matches the scraped
# lineup names, which also keep the last token
df_goalies['name'] = df_goalies['name'].str.split().str[-1]

In [16]:
# team data
# query MoneyPuck for up-to-date team data
req1 = Request('https://moneypuck.com/moneypuck/playerData/seasonSummary/2021/regular/teams.csv')
req1.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')

# query MoneyPuck for team data column names due to occasional uploads with no headers
req2 = Request('https://moneypuck.com/moneypuck/playerData/seasonSummary/2020/regular/teams.csv')
req2.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')

# import queried csvs and apply the 2020 header to the up-to-date data
# both HTTP responses are closed as soon as the csvs are parsed
with urlopen(req1) as content1, urlopen(req2) as content2:
    df_teams = pd.read_csv(content1, header=None)
    df_teams_labels = pd.read_csv(content2)
df_teams = pd.DataFrame(data=df_teams.values, columns=df_teams_labels.columns)

# create list of tracked statistics
tracked_statistics = ['name', 'season',
                      'xGoalsPercentage',
                      'flurryScoreVenueAdjustedxGoalsFor', 'penaltiesFor', 'dZoneGiveawaysAgainst',
                      'flurryScoreVenueAdjustedxGoalsAgainst', 'penaltiesAgainst', 'dZoneGiveawaysFor']


def _team_situation_stats(df, situation, stat_columns, keep_xgoals_pct):
    """Build the strength metric for one game situation (e.g. '5on5').

    Filters `df` to rows whose 'situation' equals `situation`, keeps
    `stat_columns`, suffixes every column except 'name'/'season' with
    '_<situation>', and adds 'team_strength_<situation>'. The suffixed
    xGoalsPercentage column is dropped unless `keep_xgoals_pct` is true.
    """
    suffix = f'_{situation}'
    stats = (
        df[stat_columns]
        .loc[df['situation'] == situation]
        .add_suffix(suffix)
        .rename(columns={f'name{suffix}': 'name', f'season{suffix}': 'season'})
    )
    first_for = f'flurryScoreVenueAdjustedxGoalsFor{suffix}'
    last_for = f'dZoneGiveawaysAgainst{suffix}'
    first_against = f'flurryScoreVenueAdjustedxGoalsAgainst{suffix}'

    stats['team_strength_pos'] = stats.loc[:, first_for:last_for].mean(axis = 1)
    # NOTE(review): this slice is open-ended, so it runs through the last column,
    # which at this point is the 'team_strength_pos' column added just above.
    # Kept exactly as before so the features stay comparable with existing
    # training data -- confirm whether that inclusion is intended.
    stats['team_strength_neg'] = stats.loc[:, first_against:].mean(axis = 1).mul(-.5)
    # the last two columns are team_strength_pos and team_strength_neg
    stats[f'team_strength{suffix}'] = stats.iloc[:, -2:].sum(axis=1)

    helper_columns = ['team_strength_pos', 'team_strength_neg']
    if not keep_xgoals_pct:
        helper_columns.append(f'xGoalsPercentage{suffix}')
    return stats.drop(columns=helper_columns)


# filter tracked stats for all major game situations and build each metric
# (xGoalsPercentage is kept for 5on5 only)
team_stats_5on5 = _team_situation_stats(df_teams, '5on5', tracked_statistics, keep_xgoals_pct=True)
team_stats_5on4 = _team_situation_stats(df_teams, '5on4', tracked_statistics, keep_xgoals_pct=False)
team_stats_4on5 = _team_situation_stats(df_teams, '4on5', tracked_statistics, keep_xgoals_pct=False)

# merge situation stats to single df
team_stats = pd.merge(team_stats_5on5, team_stats_5on4, how='left', on=['name', 'season'])
team_stats = pd.merge(team_stats, team_stats_4on5, how='left', on=['name', 'season'])

# clean team names ('.' as a literal period) and fill any potential nans with median
team_stats['name'] = team_stats['name'].str.replace('.', '', regex=False)
team_stats = team_stats.fillna(team_stats.median())

# df filtered to only used stats
df_teams = team_stats[['name', 'season', 'xGoalsPercentage_5on5', 'team_strength_5on5', 'team_strength_5on4', 'team_strength_4on5']]

In [17]:
# time series data
# query MoneyPuck for historical game data
req = Request('https://moneypuck.com/moneypuck/playerData/careers/gameByGame/all_teams.csv')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')

# import queried game data (closing the HTTP response once parsed) and sort by date
with urlopen(req) as content:
    ts = pd.read_csv(content)
ts = ts.sort_values('gameDate')

# filter df by situation, playoff game status and clean team names
# .copy() so the column writes below act on an independent frame, not a slice;
# '.' is treated as a literal period, not a regex wildcard
ts = ts.loc[(ts['situation'] == '5on5') & (ts['playoffGame'] == 0)].copy()
ts['team'] = ts['team'].str.replace('.', '', regex=False)

# convert 'gameDate' column to datetime dtype
ts['gameDate'] = pd.to_datetime(ts['gameDate'], format='%Y%m%d')

# filter out unused columns
ts = ts[['team', 'gameDate', 'xGoalsPercentage']]

# create list of unique team names
teams_list = list(ts['team'].unique())

# one frame per team, each with rolling means of xGoalsPercentage
# closed="left" leaves the current row out of its own window, so every value is
# the mean of the preceding games only; windows that are not yet full are
# filled with .5
timeseries_teams = {}
for team_name in teams_list:
    df_ts_team = ts.loc[ts['team'] == team_name].copy()
    for window in (3, 5, 10):
        df_ts_team[f'xGoalsPercentage_last_{window}'] = (
            df_ts_team['xGoalsPercentage']
            .rolling(window=window, closed="left")
            .mean()
            .fillna(.5)
        )
    timeseries_teams[team_name] = df_ts_team

# create df from dict items
df_ts = pd.concat(timeseries_teams.values(), ignore_index=True).drop(['xGoalsPercentage'], axis=1)

# fix team names (two-letter codes to the three-letter abbreviations used elsewhere)
df_ts = df_ts.replace(['TB', 'NJ', 'LA', 'SJ'], ['TBL', 'NJD', 'LAK', 'SJS'])

# keep only the most recent row per team
df_ts = df_ts.sort_values(['team', 'gameDate']).drop_duplicates('team', keep='last')

In [18]:
# full team names, in the same order as teams_abv below
team_full_names = [
    'Anaheim Ducks', 'Arizona Coyotes', 'Boston Bruins', 'Buffalo Sabres',
    'Calgary Flames', 'Carolina Hurricanes', 'Chicago Blackhawks', 'Colorado Avalanche',
    'Columbus Blue Jackets', 'Dallas Stars', 'Detroit Red Wings', 'Edmonton Oilers',
    'Florida Panthers', 'Los Angeles Kings', 'Minnesota Wild', 'Montreal Canadiens',
    'Nashville Predators', 'New Jersey Devils', 'New York Islanders', 'New York Rangers',
    'Ottawa Senators', 'Philadelphia Flyers', 'Pittsburgh Penguins', 'San Jose Sharks',
    'Seattle Kraken', 'St Louis Blues', 'Tampa Bay Lightning', 'Toronto Maple Leafs',
    'Vancouver Canucks', 'Vegas Golden Knights', 'Washington Capitals', 'Winnipeg Jets',
]

# lower-case, hyphen-separated team names (e.g. 'anaheim-ducks'), the form
# interpolated into the line-combination URLs
# note: this rebinds `teams`, replacing the earlier list of full names
teams = [full_name.lower().replace(" ", "-") for full_name in team_full_names]

# create list of team abbreviations 
teams_abv = ['ANA', 'ARI', 'BOS', 'BUF', 'CGY', 'CAR', 'CHI', 'COL', 
             'CBJ', 'DAL', 'DET', 'EDM', 'FLA', 'LAK', 'MIN', 'MTL',
             'NSH', 'NJD', 'NYI', 'NYR', 'OTT', 'PHI', 'PIT', 'SJS', 
             'SEA', 'STL', 'TBL', 'TOR', 'VAN', 'VGK', 'WSH', 'WPG']

In [19]:
# conform line names in df to the scraped lineup data: split each hyphen-joined
# name into its parts, sort the parts, and join them again with hyphens so the
# same group of players always yields the same key
df_lines['name'] = (
    df_lines['name']
    .str.split('-')
    .apply(lambda parts: '-'.join(map(str, sorted(parts))))
)

In [20]:
# headless firefox setup
# (the browser started here is reused for every page request in the lineup loop)
# NOTE(review): no driver.quit() is visible in this file -- confirm the browser
# process is shut down once scraping is finished
options = FirefoxOptions()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

# initialise lineup dict
# (filled by the lineup loop, keyed by team abbreviation)
lineups = {}

In [21]:
# create daily lineups
# for every team: scrape the current line combinations, attach line/pair stats
# (falling back to averaged individual skater stats when a line is unknown),
# reduce them to forward/defense strength, and store them with the starting
# goalie's and the team's stats in `lineups`
for i in range(len(teams)):
    # query DailyFaceoff for single team line combos
    driver.get(f"https://www.dailyfaceoff.com/teams/{teams[i]}/line-combinations/")
    content = driver.page_source
    soup = BeautifulSoup(content)
    # collect every player-name span on the page as raw HTML
    players = soup.findAll('span', attrs={'class':'player-name'})
    players = [str(span) for span in players]
    
    # isolate player names
    for j in range(len(players)):
        s = players[j]
        # keep only the last whitespace-separated token of the span text
        players[j] = re.search('<span class="player-name">(.*)</span>', s).group(1).split()[-1]
            
    # separate skaters from goalies and create lines from skaters
    # (first 18 names: four forward lines of three, then three defense pairs of two;
    # positions 36 and 37 are taken as the two goalies -- NOTE(review): these
    # indices depend on the page layout, confirm they still hold)
    skaters = players[:18] + players[-2:]
    lines = [skaters[:3], skaters[3:6], skaters[6:9], skaters[9:12], skaters[12:14], skaters[14:16], skaters[16:18]]
    goalies = [players[36], players[37]]
    
    # create lines dataframe: one row per line/pair, named by its sorted,
    # hyphen-joined player names (assigned as a whole column rather than through
    # a chained slice, which pandas does not guarantee to write back)
    df_s = pd.DataFrame({'name': lines})
    df_s['name'] = df_s['name'].apply(sorted).apply(lambda x: '-'.join(map(str, x)))
    df_s = pd.merge(df_s, df_lines, how='left', on='name')
    
    # create line stat approximations for lines that aren't in line list
    # NOTE(review): only the first seven rows are checked; assumes the merge
    # above produced exactly one row per line -- confirm
    for k in range(7):
        
        # check for null lines
        if df_s.iloc[k].isna().sum() != 0:
            # rows 0-3 are forward lines (three skaters), rows 4-6 defense pairs (two)
            n_players = 3 if k < 4 else 2
            # pull stats for players in dummy line/pairing
            line = df_s['name'][k].split('-')
            line_players = pd.concat([
                df_skaters.loc[(df_skaters['name'] == player) & (df_skaters['team'] == teams_abv[i])]
                for player in line[:n_players]
            ])
            # mean individual stats, stored under index label 0 and kept as the only row
            line_players.loc[0] = line_players.mean()
            line_players = line_players.loc[[0]]
            # create line/pairing name and drop unused columns
            line_players['name'] = '-'.join(line)
            line_players['team'] = teams_abv[i]
            line_players.drop(['position'], axis=1, inplace=True)
            # write dummy line/pairing to lines df
            df_s.iloc[k] = line_players.iloc[0]
                     
    # weight stats
    df_s[major_stats] = df_s[major_stats] * 2
    df_s[minor_stats] = df_s[minor_stats] * 1

    # fix names and remove duplicates (keeping the row with the most icetime)
    df_s['name'] = df_s['name'].str.split('-').apply(sorted)
    df_s['name'] = df_s['name'].apply(lambda x: '-'.join(map(str, x)))
    df_s = df_s.sort_values('icetime', ascending=False)
    df_s = df_s.drop_duplicates(subset='name', keep="first")
    
    # create line/pair metrics and drop old stats
    df_s['strength_pos'] = df_s.loc[:,'flurryScoreVenueAdjustedxGoalsFor':'highDangerxGoalsFor'].mean(axis = 1)
    # NOTE(review): this slice is open-ended, so it also spans the 'strength_pos'
    # column added just above; kept as-is so features stay comparable with
    # existing training data -- confirm whether that inclusion is intended
    df_s['strength_neg'] = df_s.loc[:,'flurryScoreVenueAdjustedxGoalsAgainst':].mean(axis = 1).mul(-.5)
    # the last two columns are strength_pos and strength_neg
    df_s['strength'] = df_s.iloc[:, -2:].sum(axis=1)
    df_s.drop(['strength_pos', 'strength_neg', 'icetime'], axis=1, inplace=True)
    
    # restore the original line order, fill gaps with each column's most frequent value
    df_s = df_s[['name', 'season', 'team', 'strength']].sort_index().reset_index(drop=True).fillna(df_s.mode().iloc[0])
    # single-row frame that collects the team-level strengths
    # (.copy() so the column writes below act on an independent frame, not a slice)
    df_l = df_s[['season', 'team']].iloc[:1].copy()

    # weight lines by average icetime
    df_l['line1_strength'] = df_s['strength'][0] * .32
    df_l['line2_strength'] = df_s['strength'][1] * .27
    df_l['line3_strength'] = df_s['strength'][2] * .22
    df_l['line4_strength'] = df_s['strength'][3] * .19
    df_l['forward_strength'] = df_l.loc[:, 'line1_strength':'line4_strength'].sum(axis=1)
    
    # weight pairing by average icetime
    df_l['pair1_strength'] = df_s['strength'][4] * .39
    df_l['pair2_strength'] = df_s['strength'][5] * .33
    df_l['pair3_strength'] = df_s['strength'][6] * .28
    df_l['defense_strength'] = df_l.loc[:, 'pair1_strength':'pair3_strength'].sum(axis=1)
    
    # drop unused columns
    df_s = df_l.drop(['line1_strength', 'line2_strength', 'line3_strength', 'line4_strength',
                              'pair1_strength', 'pair2_strength', 'pair3_strength'], axis=1).reset_index(drop=True)
    
    # create goalie df; the last row (second listed goalie) is dropped so only
    # the first listed goalie remains
    df_g = pd.DataFrame({'name': goalies, 'team': [teams_abv[i], teams_abv[i]]})
    df_g = pd.merge(df_g, df_goalies, how='left', on=['name', 'team']).drop(['name'], axis=1).reset_index(drop=True)
    df_g.drop(df_g.tail(1).index,inplace=True)    
    df_g['team'] = teams_abv[i]
    df_g['season'] = 2021
    
    # create team df
    df_t = df_teams.loc[df_teams['name'] == teams_abv[i]]
    df_t = df_t.rename(columns={'name': 'team'})
     
    # create lineup dictionary
    lineups[f'{teams_abv[i]}'] = {'skaters': df_s, 'goalies': df_g, 'team': df_t}


In [22]:
# per-team feature frames
# collected under their own name so `teams` keeps holding the team names used by
# the lineup-scraping loop and that loop stays re-runnable
team_frames = []

# the time-series frame is joined on ['team', 'gameDate'], so stamp it with
# today's date once, up front, instead of on every loop iteration
df_ts['gameDate'] = today

# creates todays stats for every team
for team in lineups:
    # set columns to correct dtypes
    lineups[team]['team']['season'] = lineups[team]['team']['season'].astype('int')
    lineups[team]['goalies']['season'] = lineups[team]['goalies']['season'].astype('int')
    # merge stats to single df
    df = pd.merge(lineups[team]['team'], lineups[team]['goalies'], how='left', on=['team', 'season'])
    df = pd.merge(df, lineups[team]['skaters'], how='left', on=['team', 'season'])
    # set 'gameDate' to todays date so it matches the time-series frame
    df['gameDate'] = today
    # merge team stats with timeseries stats
    df = pd.merge(df, df_ts, how='left', on=['team', 'gameDate'])
    df = df.drop(columns=['gameDate'])
    # collect this team's frame
    team_frames.append(df)

# fill missing values and drop duplicates
# NOTE(review): every missing value, in any column, is filled with the same
# constant .75 -- confirm that is appropriate for all features
teams_df = pd.concat(team_frames).fillna(.75)
# NOTE(review): keep=False removes EVERY row that has an exact duplicate rather
# than keeping one copy, so a duplicated team disappears entirely -- confirm
teams_df.drop_duplicates(keep=False,inplace=True)

In [23]:
# the same per-team stats, keyed once as team1 and once as team2
t1_team_stats = teams_df.rename(columns={'team': 'team1'})
t2_team_stats = teams_df.rename(columns={'team': 'team2'})

# attach team1's and then team2's stats to today's games; columns present on
# both sides of the second merge get the '_1' / '_2' suffixes
input_df_2 = (
    todays_games
    .merge(t1_team_stats, how='left', on=['team1', 'season'])
    .merge(t2_team_stats, how='left', on=['team2', 'season'], suffixes=('_1', '_2'))
)

# collapse the two team columns into one column of (team1, team2) tuples
input_df_2['teams'] = input_df_2[['team1', 'team2']].apply(tuple, axis=1)
input_df_2 = input_df_2.drop(columns=['team1', 'team2', 'season'])

# change dtype of columns affected by merging
for xgoals_col in ('xGoalsPercentage_5on5_1', 'xGoalsPercentage_5on5_2'):
    input_df_2[xgoals_col] = input_df_2[xgoals_col].astype('float')

In [24]:
# save as csv for input into the model
# (no `index` argument is passed, so the row index is written as the first column)
input_df_2.to_csv('../data/output/input_df_2.csv')

In [25]:
# check shape: (number of today's games, number of columns)
input_df_2.shape

(6, 24)

In [26]:
# check for null values in columns (count of missing values per column)
input_df_2.isnull().sum()

gameDate                      0
home_or_away_t1               0
home_or_away_t2               0
xGoalsPercentage_5on5_1       0
team_strength_5on5_1          0
team_strength_5on4_1          0
team_strength_4on5_1          0
goalie_strength_1             0
forward_strength_1            0
defense_strength_1            0
xGoalsPercentage_last_3_1     0
xGoalsPercentage_last_5_1     0
xGoalsPercentage_last_10_1    0
xGoalsPercentage_5on5_2       0
team_strength_5on5_2          0
team_strength_5on4_2          0
team_strength_4on5_2          0
goalie_strength_2             0
forward_strength_2            0
defense_strength_2            0
xGoalsPercentage_last_3_2     0
xGoalsPercentage_last_5_2     0
xGoalsPercentage_last_10_2    0
teams                         0
dtype: int64

In [27]:
# display rows with null values (an empty frame means no row has a missing value)
input_df_2[input_df_2.isna().any(axis=1)]

Unnamed: 0,gameDate,home_or_away_t1,home_or_away_t2,xGoalsPercentage_5on5_1,team_strength_5on5_1,team_strength_5on4_1,team_strength_4on5_1,goalie_strength_1,forward_strength_1,defense_strength_1,...,team_strength_5on5_2,team_strength_5on4_2,team_strength_4on5_2,goalie_strength_2,forward_strength_2,defense_strength_2,xGoalsPercentage_last_3_2,xGoalsPercentage_last_5_2,xGoalsPercentage_last_10_2,teams


In [28]:
# load the historical training set
input_df = pd.read_csv('../data/output/input_df.csv')
# remove shootouts
input_df = input_df[input_df.result != 'shootout']

# set data and labels as X and y; Z holds today's games with exactly the
# training columns, in the training column order
X = input_df.drop(['teams', 'result', 'gameDate'], axis=1)
y = input_df['result']
Z = input_df_2.drop(['teams'], axis=1)
Z = Z[X.columns]

# encode labels as integers 0..n_classes-1 (order given by label_encoder.classes_)
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(y)

# fit a scaler on the training features
# NOTE(review): the scaled values were never applied to X or Z, so the model is
# trained on unscaled features; only the fitted scaler is kept here
scaler = StandardScaler()
scaler.fit(X)

# the model is fitted on the full data set; the encoded labels are used so the
# classifier receives integer classes (use_label_encoder=False disables
# XGBoost's own encoding)
# NOTE(review): the evaluation set is the training data itself, so early
# stopping monitors training loss, not held-out loss -- confirm this is intended
X_train = X
y_train = label_encoded_y
X_test = X
y_test = label_encoded_y

# construct base of XGBoost model
model = xgb.XGBClassifier(
    n_jobs=-1,
    tree_method='gpu_hist',
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    
    colsample_bytree=.73,
    learning_rate=.102,
    max_depth=2,
    reg_lambda=.0775,
    subsample=.975
).fit(
    X_train, y_train,
    verbose=False,
    early_stopping_rounds=10, 
    eval_set=[(X_test, y_test)],
)

# class probabilities for today's games; columns follow label_encoder.classes_
prediction = model.predict_proba(Z) 

In [29]:
# pair each matchup with its predicted class probabilities
# .copy() gives an independent frame, so adding the column does not write into a
# slice of input_df_2
final_predictions = input_df_2[['teams']].copy()
final_predictions['predictions'] = prediction.tolist()

In [30]:
# display today's matchups with their predicted probabilities
final_predictions

Unnamed: 0,teams,predictions
0,"(BUF, WPG)","[0.5014413595199585, 0.4985586404800415]"
1,"(DET, NYR)","[0.6648868322372437, 0.33511313796043396]"
2,"(EDM, LAK)","[0.5280585289001465, 0.4719415009021759]"
3,"(VAN, STL)","[0.6662984490394592, 0.33370155096054077]"
4,"(ARI, SJS)","[0.5035085678100586, 0.4964914321899414]"
5,"(SEA, VGK)","[0.5941003561019897, 0.40589961409568787]"
