In [5]:
# imports
import os
import pandas as pd

In [6]:
def _detect_workbook_format(xls_file):
    """Classify a spreadsheet file as 'xlsx', 'xml', 'html' or 'xls' from its leading bytes."""
    # Read a small header and close the handle before pandas re-opens the path.
    with open(xls_file, 'rb') as f:
        head = f.read(512)

    # ZIP container signature: what a real .xlsx workbook starts with.
    if head.startswith(b'PK\x03\x04'):
        return 'xlsx'

    # Markup may follow a UTF-8 BOM and/or whitespace, in any letter case.
    if head.startswith(b'\xef\xbb\xbf'):
        head = head[3:]
    markup = head.lstrip().lower()
    if markup.startswith(b'<?xml'):
        return 'xml'
    if markup.startswith((b'<html', b'<!doctype html', b'<table')):
        return 'html'

    # Anything else is treated as a legacy binary .xls workbook.
    return 'xls'


def _read_tables(xls_file):
    """Load every table/sheet of the file as an ordered ``{label: DataFrame}`` mapping."""
    kind = _detect_workbook_format(xls_file)
    if kind == 'html':
        # HTML tables have no names, so label them 1, 2, ... in document order.
        return {str(i): table for i, table in enumerate(pd.read_html(xls_file), start=1)}

    # NOTE(review): the '<?xml' case keeps the original choice of openpyxl;
    # openpyxl targets zip-based workbooks, so confirm it can read these files.
    engine = 'xlrd' if kind == 'xls' else 'openpyxl'
    return pd.read_excel(xls_file, sheet_name=None, engine=engine)


def _csv_name(csv_file, label, multiple):
    """Build the output file name; the label is appended only when there are several tables."""
    if not multiple:
        return f"{csv_file}.csv"
    # Replace characters that are unsafe in file names (path separators etc.).
    safe_label = "".join(ch if ch.isalnum() or ch in " -_." else "_" for ch in str(label))
    return f"{csv_file}_{safe_label}.csv"


def convert_xls_to_csv(xls_file, csv_file, output_folder):
    """Convert one spreadsheet export (.xls/.xlsx, possibly HTML in disguise) to CSV.

    The real format is sniffed from the file's first bytes rather than trusted
    from its extension.

    Parameters
    ----------
    xls_file : str
        Path of the spreadsheet file to read.
    csv_file : str
        Base name (without extension) for the CSV output.
    output_folder : str
        Folder that receives the CSV file(s); created if it is missing.

    Output naming
    -------------
    A file with a single sheet/table is written to ``<csv_file>.csv``.
    A file with several is written to ``<csv_file>_<label>.csv``, one per
    sheet/table, so they no longer overwrite each other. ``<label>`` is the
    sheet name, or the 1-based table index for HTML.

    Returns
    -------
    None
        Read failures are reported with ``print`` and the file is skipped.
        Errors raised while writing the CSV files are not caught.
    """
    try:
        tables = _read_tables(xls_file)
    except Exception as e:
        print(f"Error processing {xls_file}: {e}")
        return

    os.makedirs(output_folder, exist_ok=True)

    # Convert each sheet/table to its own CSV file
    multiple = len(tables) > 1
    for label, data in tables.items():
        print(f"Converting sheet: {label}")
        data.to_csv(os.path.join(output_folder, _csv_name(csv_file, label, multiple)), index=False)

    print(f"Conversion complete for {xls_file}.")

def convert_all_xls_in_folder(folder_path, output_folder):
    """Convert every .xls/.xlsx file found directly in ``folder_path`` to CSV.

    Parameters
    ----------
    folder_path : str
        Folder to scan (not recursive).
    output_folder : str
        Folder that receives the CSV files; created if it does not exist.

    Each matching file is handed to ``convert_xls_to_csv`` together with its
    base name (the file name minus its final extension).
    """
    # Ensure the output folder exists (no error if it is already there)
    os.makedirs(output_folder, exist_ok=True)

    # Iterate over all files in the folder, in a deterministic order
    for file_name in sorted(os.listdir(folder_path)):
        # splitext strips only the last extension, so dots inside the name
        # (e.g. "2013.14 stats.xls") are preserved in the CSV base name.
        csv_file, extension = os.path.splitext(file_name)
        if extension.lower() in ('.xls', '.xlsx'):
            xls_file = os.path.join(folder_path, file_name)
            convert_xls_to_csv(xls_file, csv_file, output_folder)

In [7]:
# Turn every spreadsheet export under 'season stats' into a CSV under 'season stats csv'.
folder_path, output_folder = 'season stats', 'season stats csv'
convert_all_xls_in_folder(folder_path=folder_path, output_folder=output_folder)

In [12]:
# Function to get df of games from season
def get_df_games(url, year, output_folder='season results'):
    """Read one season's game table from ``url``, tidy it, save it as CSV and return it.

    Parameters
    ----------
    url : str
        Page to read with ``pandas.read_html``; its first table is used.
    year : int
        Year used to build the season label: ``2014`` gives ``"2013-14"``.
    output_folder : str, default 'season results'
        Folder that receives the CSV file; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        The cleaned game table. Columns are renamed to ``G_Visitor``,
        ``G_Home``, ``OT_SO`` and ``Attendance``; ``Date`` is parsed to
        datetimes; ``Time`` is converted to ``datetime.time`` values; and a
        ``Day`` column holding the weekday name is added.

    Side effects
    ------------
    Writes ``<output_folder>/<season> season results.csv`` and prints its path.
    """
    # get tables from webpage
    tables = pd.read_html(url)

    # first table has regular season data
    # NOTE(review): 'Unnamed: 6' assumes the overtime/shootout flag is the 7th
    # column of that table — confirm if the page layout changes.
    games = tables[0].rename(columns={
        'G': 'G_Visitor',
        'G.1': 'G_Home',
        'Unnamed: 6': 'OT_SO',
        'Att.': 'Attendance'
    })

    # convert date entries to datetime
    games['Date'] = pd.to_datetime(games['Date'])

    # add weekday column
    games['Day'] = games['Date'].dt.day_name()

    # convert time entries (e.g. "7:00 PM") to time-of-day values
    games['Time'] = pd.to_datetime(games['Time'], format='%I:%M %p').dt.time

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Save the dataframe as a CSV file
    season_str = f"{year - 1}-{str(year)[2:]}"
    file_name = f"{season_str} season results.csv"
    file_path = os.path.join(output_folder, file_name)
    games.to_csv(file_path, index=False)

    print(f"Season results saved as {file_path}")

    return games

In [14]:
# Page template for one season's game list; {year} is filled in per season.
url_base = 'https://www.hockey-reference.com/leagues/NHL_{year}_games.html'

# Fetch and save each season for year = 2014 ... 2024 (range's upper bound is exclusive).
for year in range(2014, 2025):
    url = url_base.format(year=year)
    get_df_games(url=url, year=year)

Season results saved as season results/2013-14 season results.csv
Season results saved as season results/2014-15 season results.csv
Season results saved as season results/2015-16 season results.csv
Season results saved as season results/2016-17 season results.csv
Season results saved as season results/2017-18 season results.csv
Season results saved as season results/2018-19 season results.csv
Season results saved as season results/2019-20 season results.csv
Season results saved as season results/2020-21 season results.csv
Season results saved as season results/2021-22 season results.csv
Season results saved as season results/2022-23 season results.csv
Season results saved as season results/2023-24 season results.csv
