# Web Scraping Project
### Michael Benno
### 11/12/25

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### How many instances are there of players playing at least 83 games in a single regular season in the NHL and which players have the most?
https://www.hockey-reference.com/leaders/games_played_season.html

In [None]:
#Download the single-season games-played leaderboard and parse it.
url='https://www.hockey-reference.com/leaders/games_played_season.html'
#A timeout keeps the cell from hanging forever if the site never answers.
response = requests.get(url, timeout=30)
status = response.status_code
if status == 200:
    page = response.text
    #Name the parser explicitly so the page is parsed the same way on every
    #machine and bs4 does not have to guess one.
    soup = bs(page, 'html.parser')
    print("It worked!")
else:
    #Clear any soup left over from an earlier run so the following cells fail
    #loudly instead of silently scraping an old page.
    soup = None
    print(f"Error: Received status code {status}")

In [None]:
#Reset the list that will hold the scraped rows, then locate the <tbody>
#inside the element whose id is "all_stats_NHL".
player_list = []
stats_table = soup.find(attrs={'id': 'all_stats_NHL'}).find('tbody')

In [None]:
#Walk the table rows in page order and record the games-played value of each
#one, stopping at the first row that shows 82 games.
#NOTE(review): this assumes the page lists rows from most to fewest games, so
#everything before the first '82' row is an 83+ game season - confirm.
for row in stats_table.find_all('tr'):
    #The second "right"-classed cell of the row is read as the games-played
    #total (find_all replaces the deprecated findAll alias).
    gamesPlayed = row.find_all(class_='right')[1].text
    if gamesPlayed == '82':
        break
    player_list.append({'Games_Played':int(gamesPlayed)})
print(player_list)

In [None]:
#Build a dataframe from the list of row dictionaries and display it
player_df = pd.DataFrame(data=player_list)
player_df

In [None]:
#Count how many rows share each games-played value, largest count first
counts = player_df['Games_Played'].value_counts(ascending=False)
counts

In [None]:
#Gemini helped me get this step together.
#Turn the counts Series into a two-column dataframe: the index becomes
#'Games_Played' and the counts become 'Player_Count'.
df = counts.rename_axis('Games_Played').reset_index(name='Player_Count')

In [None]:
#Draw one bar per games-played value, with bar height equal to the number of
#entries that reached that value.
bright_palette = sns.color_palette('bright')
plot = sns.barplot(data=df,
                   x='Games_Played',
                   y='Player_Count',
                   palette=bright_palette)
#Did some research to learn how to set the title
plot.set_title('Total Number Of Players Vs Number Of Games Played In A Season')
plt.show()

In [None]:
len(player_list) #Total number of rows collected before the first 82-game row

In [None]:
#Collect the text of the first "left"-classed cell of every table row (read
#here as the player name) and preview the first five entries.
players = [
    {'Player': row.find(class_='left').text}
    for row in stats_table.find_all('tr')
]
pd.DataFrame(players).head()

#### There are 185 instances where players played at least 83 games in a season and Jimmy Carson, Bob Kudelski, and Glenn Anderson have the most amongst all players.

# For each player, what is the percentage of their entries among the top 50 single-season assist totals?
https://www.hockey-reference.com/leaders/assists_season.html

In [None]:
#Download the single-season assists leaderboard and parse it.
url='https://www.hockey-reference.com/leaders/assists_season.html'
#A timeout keeps the cell from hanging forever if the site never answers.
response = requests.get(url, timeout=30)
status = response.status_code
if status == 200:
    page = response.text
    #Name the parser explicitly so the page is parsed the same way on every
    #machine and bs4 does not have to guess one.
    soup = bs(page, 'html.parser')
    print("It worked!")
else:
    #Clear the soup from the previous page so the following cells fail
    #loudly instead of silently scraping the wrong leaderboard.
    soup = None
    print(f"Error: Received status code {status}")

In [None]:
#Reset the list that will hold the scraped rows, then locate the <tbody>
#inside the element whose id is "all_stats_NHL".
player_list = []
stats_table = soup.find(attrs={'id': 'all_stats_NHL'}).find('tbody')

In [None]:
#Walk the table rows in page order and record each player name, stopping at
#the row whose rank cell reads '57.'.
#NOTE(review): '57.' is presumably the first rank label printed after the
#top-50 group once tied rows are included - confirm against the live page.
for row in stats_table.find_all('tr'):
    #The first "right"-classed cell of the row is read as the rank label
    #(find_all replaces the deprecated findAll alias).
    rank = row.find_all(class_='right')[0].text
    if rank == '57.':
        break
    #Strip any '*' marker from the name; replace() is already a no-op when
    #the marker is absent, so no membership check is needed.
    playerName = row.find(class_='left').text.replace('*', '')
    player_list.append({'Player_Name':playerName})
print(player_list)

In [None]:
#Build a dataframe from the list of row dictionaries and display it
playersDF = pd.DataFrame(data=player_list)
playersDF

In [None]:
#Count how many collected rows belong to each player, largest count first
counts = playersDF['Player_Name'].value_counts(ascending=False)
counts

In [None]:
#Pair every player name with its count in a plain dictionary
counts_dict = dict(zip(counts.index, counts.tolist()))
counts_dict

### This link helped me for this plot
https://www.geeksforgeeks.org/python/how-to-create-a-pie-chart-in-seaborn/#

In [None]:
#Plots each player's share of the collected entries as a pie chart.
#The dictionary views are copied into lists because matplotlib expects
#sequences for the wedge sizes and labels.
plt.pie(list(counts_dict.values()),
        labels=list(counts_dict.keys()),
        colors = sns.color_palette('bright'),
        autopct='%.0f%%')
#Give the chart a title and render it explicitly, like the bar plot above, so
#the cell shows only the figure instead of echoing plt.pie's return value.
plt.title('Share Of Top Single-Season Assist Totals By Player')
plt.show()

#### Wayne Gretzky makes up 25% of the top 50, Mario Lemieux makes up 11% of the top 50, Bobby Orr makes up 7% of the top 50, while Nikita Kucherov, Paul Coffey, and Denis Savard make up 5% each of the top 50.
#### Connor McDavid, Joe Thornton, Adam Oates, Bobby Clarke, Doug Gilmour, and Nathan MacKinnon make up 4% each of the top 50.
#### Ron Francis, Pat LaFontaine, Peter Stastny, Steve Yzerman, Bryan Trottier, Jaromir Jagr, Peter Forsberg, Jonathan Huberdeau, Marcel Dionne, Mark Messier, and Sidney Crosby make up 1% each of the top 50.

# How many unique goalies are in the top 100 in total saves in a single season in the NHL?
https://www.hockey-reference.com/leaders/saves_season.html

In [None]:
#Download the single-season saves leaderboard and parse it.
url='https://www.hockey-reference.com/leaders/saves_season.html'
#A timeout keeps the cell from hanging forever if the site never answers.
response = requests.get(url, timeout=30)
status = response.status_code
if status == 200:
    page = response.text
    #Name the parser explicitly so the page is parsed the same way on every
    #machine and bs4 does not have to guess one.
    soup = bs(page, 'html.parser')
    print("It worked!")
else:
    #Clear the soup from the previous page so the following cells fail
    #loudly instead of silently scraping the wrong leaderboard.
    soup = None
    print(f"Error: Received status code {status}")

In [None]:
#Reset the list that will hold the scraped rows, then locate the <tbody>
#inside the element whose id is "all_stats_NHL".
player_list = []
stats_table = soup.find(attrs={'id': 'all_stats_NHL'}).find('tbody')

In [None]:
#Walk the table rows in page order and record each goalie name, stopping at
#the row whose rank cell reads '101.' so only the rows before it are kept.
for row in stats_table.find_all('tr'):
    #The first "right"-classed cell of the row is read as the rank label
    #(find_all replaces the deprecated findAll alias).
    rank = row.find_all(class_='right')[0].text
    if rank == '101.':
        break
    #Strip any '*' marker from the name; replace() is already a no-op when
    #the marker is absent, so no membership check is needed.
    playerName = row.find(class_='left').text.replace('*', '')
    player_list.append({'Player_Name':playerName})
print(player_list)

In [None]:
#Build a dataframe from the list of row dictionaries and display it
playersDF = pd.DataFrame(data=player_list)
playersDF

In [None]:
#Number of distinct names in the Player_Name column
playersDF['Player_Name'].nunique()

#### There are 51 different goalies in the top 100 of single-season saves.

# How many times does Wayne Gretzky appear in the top 51 of leaders in short-handed goals in the regular season?

https://www.hockey-reference.com/leaders/goals_sh_season.html

In [None]:
#Download the single-season short-handed goals leaderboard and parse it.
url='https://www.hockey-reference.com/leaders/goals_sh_season.html'
#A timeout keeps the cell from hanging forever if the site never answers.
response = requests.get(url, timeout=30)
status = response.status_code
if status == 200:
    page = response.text
    #Name the parser explicitly so the page is parsed the same way on every
    #machine and bs4 does not have to guess one.
    soup = bs(page, 'html.parser')
    print("It worked!")
else:
    #Clear the soup from the previous page so the following cells fail
    #loudly instead of silently scraping the wrong leaderboard.
    soup = None
    print(f"Error: Received status code {status}")

In [None]:
#Reset the list that will hold the scraped rows, then locate the <tbody>
#inside the element whose id is "all_stats_NHL".
player_list = []
stats_table = soup.find(attrs={'id': 'all_stats_NHL'}).find('tbody')

In [None]:
#Walk the table rows in page order and record each player name, stopping at
#the row whose rank cell reads '52.' so only the rows before it are kept.
for row in stats_table.find_all('tr'):
    #The first "right"-classed cell of the row is read as the rank label
    #(find_all replaces the deprecated findAll alias).
    rank = row.find_all(class_='right')[0].text
    if rank == '52.':
        break
    #Strip any '*' marker from the name; replace() is already a no-op when
    #the marker is absent, so no membership check is needed.
    playerName = row.find(class_='left').text.replace('*', '')
    player_list.append({'Player_Name':playerName})
print(player_list)

In [None]:
#Build a dataframe from the list of row dictionaries and display it
playersDF = pd.DataFrame(data=player_list)
playersDF

In [None]:
#Counts how many rows of the dataframe carry each player name; the result is
#displayed so a single player's number of appearances can be read off.
playersDF.value_counts()

#### Wayne Gretzky appears three times in the top 51 of leaders in short-handed goals in the regular season.