<a href="https://colab.research.google.com/github/Darknight899/Comprehensive-Assessment-/blob/main/Capstone_Project12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Load Data & Libraries
Import all NHL shootout games from the 2020-2021 through 2024-2025 seasons, pulled from the NHL API.

In [14]:
##Final Capstone Project -- NHL Shootouts

#Import libraries and data
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seasons and teams
# Each season ID is the start year and end year joined together.
seasons = ["20202021", "20212022", "20222023", "20232024", "20242025"]
# NOTE(review): there is no "ARI" entry even though "UTA" is listed; presumably
# those games are still picked up through the opponent's schedule -- confirm.
teams = ["ANA", "UTA", "BOS", "BUF", "CGY", "CAR", "CHI", "COL", "CBJ", "DAL",
    "DET", "EDM", "FLA", "LAK", "MIN", "MTL", "NSH", "NJD", "NYI", "NYR",
    "OTT", "PHI", "PIT", "SJS", "SEA", "STL", "TBL", "TOR", "VAN", "VGK", "WSH", "WPG"]

# Seconds to wait on a single schedule request before giving up on it.
SCHEDULE_REQUEST_TIMEOUT = 10

shootout_game_ids = set() # Use a set to avoid duplicates (2 teams per game)
for season in seasons:
    for team in teams:
        url = f"https://api-web.nhle.com/v1/club-schedule-season/{team}/{season}"
        try:
            response = requests.get(url, timeout=SCHEDULE_REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # Still best-effort: one unavailable schedule must not stop the scan,
            # but the failure is reported instead of being silently swallowed.
            print(f"Skipping schedule for {team} {season}: {e}")
            continue

        games = data.get('games', []) if isinstance(data, dict) else []
        for game in games:
            # `or {}` also covers a 'gameOutcome' key that is present but null.
            outcome = game.get('gameOutcome') or {}
            game_id = game.get('id')
            if outcome.get('lastPeriodType') == 'SO' and game_id is not None:
                shootout_game_ids.add(game_id)

shootout_game_ids = list(shootout_game_ids)
print(f"Done! Found {len(shootout_game_ids)} shootout games between 2020 and 2025.")

Done! Found 440 shootout games between 2020 and 2025.


Import player and goalie data for the 440 shootout games from the NHL API.

In [15]:
import requests

def _parse_shootout_attempts(game_id, data):
    """Build one record per shootout attempt from a decoded play-by-play payload."""
    # Roster map: turns IDs like 8471214 into "Alex Ovechkin"
    roster = {
        p['playerId']: f"{p['firstName']['default']} {p['lastName']['default']}"
        for p in data.get('rosterSpots', [])
    }

    attempts = []
    for play in data.get('plays', []):
        # Only plays from the shootout period ('SO') are of interest
        if play.get('periodDescriptor', {}).get('periodType') != 'SO':
            continue

        details = play.get('details', {})
        # Code 505 = Goal, 506 = Save, 507 = Miss
        type_code = play.get('typeCode')

        # Goals use 'scoringPlayerId', other attempts use 'shootingPlayerId'
        shooter_id = details.get('scoringPlayerId') or details.get('shootingPlayerId')
        goalie_id = details.get('goalieInNetId')

        # Plays without both a shooter and a goalie are not shot attempts
        if not (shooter_id and goalie_id):
            continue

        attempts.append({
            'game_id': game_id,
            'shooter_id': shooter_id,
            # Fall back to a placeholder when the player is not in the roster map
            'shooter_name': roster.get(shooter_id, f"ID_{shooter_id}"),
            'goalie_id': goalie_id,
            'goalie_name': roster.get(goalie_id, f"ID_{goalie_id}"),
            'is_goal': 1 if type_code == 505 else 0,
            'shot_type': details.get('shotType', 'unknown'),
            'x': details.get('xCoord'),
            'y': details.get('yCoord')
        })
    return attempts


def extract_shootout_data(game_id, timeout=10):
    """
    Takes a game ID and returns a list of dictionaries,
    each representing one shootout attempt.

    Parameters:
        game_id: NHL game identifier used to build the play-by-play URL.
        timeout: seconds to wait for the API before the request is abandoned,
            so a single stalled connection cannot hang the whole download loop.

    Returns:
        A list of attempt dicts with the keys game_id, shooter_id, shooter_name,
        goalie_id, goalie_name, is_goal, shot_type, x and y. An empty list is
        returned (and a message printed) when the request fails, the body is
        not valid JSON, or the payload cannot be parsed; nothing is raised.
    """
    url = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)

        # Decode separately: depending on the Requests version a decode failure
        # may also be a RequestException, which would otherwise be reported as
        # a network error by the handler below.
        try:
            data = response.json()
        except ValueError as e:
            print(f"JSON decoding error for {game_id}: {e}")
            return []

        return _parse_shootout_attempts(game_id, data)
    except requests.exceptions.RequestException as e:
        print(f"Network or HTTP error for {game_id}: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred for {game_id}: {e}")
        return []


# Flatten the per-game attempt lists into one list of records, then load
# them into a single DataFrame (one row per shootout attempt).
all_so_attempts = []

for game_id in shootout_game_ids:
    all_so_attempts += extract_shootout_data(game_id)

df_so = pd.DataFrame(all_so_attempts)
print(df_so.head(5))

      game_id  shooter_id      shooter_name  goalie_id      goalie_name  \
0  2022021129     8477944       Jakub Vrana    8476433  Magnus Hellberg   
1  2022021129     8480459        Pius Suter    8480981       Joel Hofer   
2  2022021129     8479385      Jordan Kyrou    8476433  Magnus Hellberg   
3  2022021129     8474102      David Perron    8480981       Joel Hofer   
4  2022021129     8477402  Pavel Buchnevich    8476433  Magnus Hellberg   

   is_goal shot_type   x  y  
0        0     wrist  84 -7  
1        0     wrist -73  2  
2        0     wrist  82 -6  
3        0     wrist -67 -1  
4        0  backhand  70 -2  


## 2. Data Cleaning & Wrangling
Examine and understand the structure of the data

In [16]:
# Structural audit of the raw attempts: preview, schema, nulls, duplicates, stats.
print("First 5 rows of df_so:", df_so.head(), sep="\n")

print("\nDataFrame Info:")
df_so.info()  # info() writes its own report, so it is not wrapped in print()

print("\nMissing values per column:", df_so.isna().sum(), sep="\n")

print("\nNumber of duplicate rows:", df_so.duplicated().sum(), sep="\n")

print("\nDescriptive statistics for numerical columns:", df_so.describe(), sep="\n")

First 5 rows of df_so:
      game_id  shooter_id      shooter_name  goalie_id      goalie_name  \
0  2022021129     8477944       Jakub Vrana    8476433  Magnus Hellberg   
1  2022021129     8480459        Pius Suter    8480981       Joel Hofer   
2  2022021129     8479385      Jordan Kyrou    8476433  Magnus Hellberg   
3  2022021129     8474102      David Perron    8480981       Joel Hofer   
4  2022021129     8477402  Pavel Buchnevich    8476433  Magnus Hellberg   

   is_goal shot_type   x  y  
0        0     wrist  84 -7  
1        0     wrist -73  2  
2        0     wrist  82 -6  
3        0     wrist -67 -1  
4        0  backhand  70 -2  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3115 entries, 0 to 3114
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   game_id       3115 non-null   int64 
 1   shooter_id    3115 non-null   int64 
 2   shooter_name  3115 non-null   object
 3   goalie_i

The dataset contains 3,115 shootout attempts, of which 31.75% were successful. There are no missing values, and each attempt is described by 9 columns.

In [17]:
# How many attempts fall under each recorded shot type
print("\nDistribution of shot_type:", df_so['shot_type'].value_counts(), sep="\n")


Distribution of shot_type:
shot_type
wrist           2095
backhand         629
snap             323
unknown           45
slap              17
poke               4
deflected          1
between-legs       1
Name: count, dtype: int64


In [18]:
# Drop the shot types that occur only a handful of times in the data
shot_types_to_remove = ['poke', 'deflected', 'between-legs']
is_rare_shot = df_so['shot_type'].isin(shot_types_to_remove)
df_so = df_so.loc[~is_rare_shot]

print("DataFrame after removing specific shot types:", df_so.head(), sep="\n")
print(f"New DataFrame shape: {df_so.shape}")

DataFrame after removing specific shot types:
      game_id  shooter_id      shooter_name  goalie_id      goalie_name  \
0  2022021129     8477944       Jakub Vrana    8476433  Magnus Hellberg   
1  2022021129     8480459        Pius Suter    8480981       Joel Hofer   
2  2022021129     8479385      Jordan Kyrou    8476433  Magnus Hellberg   
3  2022021129     8474102      David Perron    8480981       Joel Hofer   
4  2022021129     8477402  Pavel Buchnevich    8476433  Magnus Hellberg   

   is_goal shot_type   x  y  
0        0     wrist  84 -7  
1        0     wrist -73  2  
2        0     wrist  82 -6  
3        0     wrist -67 -1  
4        0  backhand  70 -2  
New DataFrame shape: (3109, 9)


In [19]:
# Confirm the rare categories are gone
print("\nDistribution of shot_type after removal:", df_so['shot_type'].value_counts(), sep="\n")


Distribution of shot_type after removal:
shot_type
wrist       2095
backhand     629
snap         323
unknown       45
slap          17
Name: count, dtype: int64


The shot types 'poke', 'deflected', and 'between-legs' were removed from the DataFrame because they are extremely rare in the dataset. With only 4, 1, and 1 observations respectively, these categories are too sparse to support any meaningful analysis. Removing them lets the analysis focus on the more common and analytically valuable shot types.

Furthermore, I noticed that there is no "deke" category, even though a deke is a common move for players to perform in a shootout. Therefore, I will use the shot coordinates to create one: any snap, wrist, backhand, or slap shot taken within 10 feet of the net (x between 79 and 89 on either side of the ice, with the net at x = ±89) is recategorized as a "deke".

In [20]:
# Preview the raw shot locations before they are used to derive a category
coordinate_preview = df_so.loc[:, ['x', 'y']].head(10)
print("First 10 shot coordinates (x, y):", coordinate_preview, sep="\n")

First 10 shot coordinates (x, y):
    x  y
0  84 -7
1 -73  2
2  82 -6
3 -67 -1
4  70 -2
5 -67  0
6  81 -8
7 -75  8
8 -76  0
9  81 -7


In [22]:
# Rebuild from the raw attempts so this cell produces the same result every
# time it is run, regardless of what earlier cells did to df_so.
df_so = pd.DataFrame(all_so_attempts)

shot_types_to_remove = ['poke', 'deflected', 'between-legs']
# .copy() gives an independent frame, so the .loc assignment further down
# unambiguously writes to df_so itself rather than to a filtered selection.
df_so = df_so[~df_so['shot_type'].isin(shot_types_to_remove)].copy()

allowed_shot_types = ['snap', 'wrist', 'backhand', 'slap']

# The net is treated as sitting at x = +/-89; a 'deke' is any allowed shot
# released within DEKE_RANGE_FT of it, on either side of the ice.
NET_X = 89
DEKE_RANGE_FT = 10

# abs() folds both ends of the rink into one range check (79 <= |x| <= 89)
deke_condition = (
    df_so['shot_type'].isin(allowed_shot_types) &
    df_so['x'].abs().between(NET_X - DEKE_RANGE_FT, NET_X)
)
# The original, misspelled name is kept as an alias in case another cell uses it.
deoke_condition = deke_condition

# Apply the 'deke' category to the selected shots
df_so.loc[deke_condition, 'shot_type'] = 'deke'

print("Distribution of shot_type after redefining 'deke' category (10 feet):")
print(df_so['shot_type'].value_counts())

Distribution of shot_type after redefining 'deke' category (10 feet):
shot_type
wrist       1538
deke        1086
snap         279
backhand     145
unknown       45
slap          16
Name: count, dtype: int64


In [25]:
# Drop the two remaining categories that are not carried into the analysis
shot_types_to_remove_final = ['unknown', 'slap']
is_excluded_shot = df_so['shot_type'].isin(shot_types_to_remove_final)
df_so = df_so.loc[~is_excluded_shot]

print("Distribution of shot_type after removing 'unknown' and 'slap':", df_so['shot_type'].value_counts(), sep="\n")
print(f"New DataFrame shape: {df_so.shape}")

Distribution of shot_type after removing 'unknown' and 'slap':
shot_type
wrist       1538
deke        1086
snap         279
backhand     145
Name: count, dtype: int64
New DataFrame shape: (3048, 9)


Include player and goalie handedness as features.

In [26]:
# Distinct shooter and goalie IDs, so each player is only looked up once later
unique_shooter_ids, unique_goalie_ids = (
    df_so[id_column].unique() for id_column in ('shooter_id', 'goalie_id')
)

print(f"Number of unique shooter IDs: {unique_shooter_ids.size}")
print(f"Number of unique goalie IDs: {unique_goalie_ids.size}")

Number of unique shooter IDs: 464
Number of unique goalie IDs: 126


In [27]:
import requests

def fetch_player_data(player_id, timeout=10):
    """
    Fetch the landing-page payload for one player from the NHL API.

    Parameters:
        player_id: NHL player identifier used to build the request URL.
        timeout: seconds to wait for the API before the request is abandoned,
            so one stalled connection cannot hang the whole lookup loop.

    Returns:
        The decoded JSON payload, or None (after printing a message) when the
        request fails or the response body is not valid JSON.
    """
    url = f"https://api-web.nhle.com/v1/player/{player_id}/landing"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON on Requests versions
        # where that failure is not a RequestException.
        print(f"Error fetching data for player ID {player_id}: {e}")
        return None

player_handedness = {}

# Shooters first, then goalies. dict.fromkeys drops repeated IDs while keeping
# that order, so an ID present in both arrays is only requested once.
all_player_ids = list(dict.fromkeys([*unique_shooter_ids, *unique_goalie_ids]))

for player_id in all_player_ids:
    data = fetch_player_data(player_id)
    # A failed lookup (None/empty payload) or a payload without the
    # 'shootsCatches' field is recorded as 'Unknown'.
    player_handedness[player_id] = data.get('shootsCatches', 'Unknown') if data else 'Unknown'

print(f"Fetched handedness for {len(player_handedness)} unique players/goalies.")

Fetched handedness for 590 unique players/goalies.


In [28]:
# Attach the looked-up handedness to each attempt, once per role
for role in ('shooter', 'goalie'):
    df_so[f'{role}_handedness'] = df_so[f'{role}_id'].map(player_handedness)

print("DataFrame with handedness information:", df_so.head(), sep="\n")

DataFrame with handedness information:
      game_id  shooter_id      shooter_name  goalie_id      goalie_name  \
0  2022021129     8477944       Jakub Vrana    8476433  Magnus Hellberg   
1  2022021129     8480459        Pius Suter    8480981       Joel Hofer   
2  2022021129     8479385      Jordan Kyrou    8476433  Magnus Hellberg   
3  2022021129     8474102      David Perron    8480981       Joel Hofer   
4  2022021129     8477402  Pavel Buchnevich    8476433  Magnus Hellberg   

   is_goal shot_type   x  y shooter_handedness goalie_handedness  
0        0      deke  84 -7                  L                 L  
1        0     wrist -73  2                  L                 L  
2        0      deke  82 -6                  R                 L  
3        0     wrist -67 -1                  R                 L  
4        0  backhand  70 -2                  L                 L  


Verify the data



In [29]:
# Check the schema and confirm neither new handedness column contains nulls
print("DataFrame Info after adding handedness:")
df_so.info()

handedness_columns = ['shooter_handedness', 'goalie_handedness']
print("\nMissing values in handedness columns:", df_so[handedness_columns].isna().sum(), sep="\n")

DataFrame Info after adding handedness:
<class 'pandas.core.frame.DataFrame'>
Index: 3048 entries, 0 to 3114
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   game_id             3048 non-null   int64 
 1   shooter_id          3048 non-null   int64 
 2   shooter_name        3048 non-null   object
 3   goalie_id           3048 non-null   int64 
 4   goalie_name         3048 non-null   object
 5   is_goal             3048 non-null   int64 
 6   shot_type           3048 non-null   object
 7   x                   3048 non-null   int64 
 8   y                   3048 non-null   int64 
 9   shooter_handedness  3048 non-null   object
 10  goalie_handedness   3048 non-null   object
dtypes: int64(6), object(5)
memory usage: 285.8+ KB

Missing values in handedness columns:
shooter_handedness    0
goalie_handedness     0
dtype: int64



Converting the numerical 'is_goal' column (0 and 1) to descriptive string labels ('No Goal' and 'Goal') enhances the clarity and interpretability of the dataset.

In [34]:
goal_labels = {0: 'No Goal', 1: 'Goal'}

# Only relabel while the column does not already consist entirely of the
# labels. Without this guard, re-running the cell would push the strings
# through the 0/1 mapping and turn the whole column into NaN.
already_labelled = df_so['is_goal'].isin(list(goal_labels.values())).all()
if not already_labelled:
    df_so['is_goal'] = df_so['is_goal'].map(goal_labels)

print("Value counts for 'is_goal' after transformation:")
print(df_so['is_goal'].value_counts())

print("\nFirst 5 rows with updated 'is_goal' column:")
print(df_so[['shooter_name', 'goalie_name', 'shot_type', 'is_goal']].head())

Value counts for 'is_goal' after transformation:
is_goal
No Goal    2066
Goal        982
Name: count, dtype: int64

First 5 rows with updated 'is_goal' column:
       shooter_name      goalie_name shot_type  is_goal
0       Jakub Vrana  Magnus Hellberg      deke  No Goal
1        Pius Suter       Joel Hofer     wrist  No Goal
2      Jordan Kyrou  Magnus Hellberg      deke  No Goal
3      David Perron       Joel Hofer     wrist  No Goal
4  Pavel Buchnevich  Magnus Hellberg  backhand  No Goal


Sample of 5 shootout attempts

In [35]:
# Four attempts that did not score, taken in their existing row order ...
non_goal_samples = df_so.loc[df_so['is_goal'].eq('No Goal')].head(4)

# ... plus the first attempt that did, so the sample always contains a goal
first_goal_sample = df_so.loc[df_so['is_goal'].eq('Goal')].head(1)

# Stack them: non-goals first, the goal last
combined_samples = pd.concat([non_goal_samples, first_goal_sample])

# Show only the descriptive columns
sample_columns = [
    'shooter_name', 'shooter_handedness',
    'goalie_name', 'goalie_handedness',
    'shot_type', 'is_goal',
]
print(combined_samples[sample_columns])

    shooter_name shooter_handedness      goalie_name goalie_handedness  \
0    Jakub Vrana                  L  Magnus Hellberg                 L   
1     Pius Suter                  L       Joel Hofer                 L   
2   Jordan Kyrou                  R  Magnus Hellberg                 L   
3   David Perron                  R       Joel Hofer                 L   
7  Lucas Raymond                  R       Joel Hofer                 L   

  shot_type  is_goal  
0      deke  No Goal  
1     wrist  No Goal  
2      deke  No Goal  
3     wrist  No Goal  
7     wrist     Goal  


## 3. Exploratory Data Analysis