In [101]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [102]:
import pandas as pd
from extract import open_driver, close_driver, wait_for_element
from transforms import parse_record, parse_game_date

from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait

In [103]:
def download(url, driver):
    driver.get(url)
    root_element = wait_for_element(source=driver, search_by=By.ID, target="root", unique_element=True)
    data_table = wait_for_element(source=root_element, search_by=By.CLASS_NAME, target="rt-table", unique_element=True)
    
    table_headers = get_table_columns(data_table)
    table_data = get_table_rows(data_table, table_headers)
    
    return table_headers, table_data

In [104]:
def get_table_columns(source):
    table_headers = wait_for_element(source=source, search_by=By.CLASS_NAME, target="tableHeaderDiv", unique_element=True)
    headers = wait_for_element(source=table_headers, search_by=By.CLASS_NAME, target="rt-th")
    header_names = [header.text for header in headers]
    
    return header_names

In [105]:
def get_table_rows(source, headers):
    table_rows = wait_for_element(source=source, search_by=By.CLASS_NAME, target="rt-tbody", unique_element=True)
    rows = wait_for_element(source=table_rows, search_by=By.CLASS_NAME, target="rt-tr")
    table_data = []
    
    for row in rows:
        cells = wait_for_element(source=row, search_by=By.CLASS_NAME, target="rt-td")
        row_values = [cell.text for cell in cells]
        
        values_map = list(zip(headers, row_values))
        map_dict = parse_record(values_map)
        
        table_data.append(map_dict)
    
    return table_data

In [106]:
def scrape_page(url):
    driver = open_driver()
    headers, table_data = download(url, driver)
    close_driver(driver)
    
    return headers, table_data

In [107]:
url = "https://www.nhl.com/stats/teams?aggregate=0&reportType=game&dateFrom=2021-10-12&dateTo=2022-04-26&gameType=2&filter=gamesPlayed,gte,1&sort=points,wins&page=0&pageSize=50"

In [108]:
headers, table_data = scrape_page(url=url)

2021/12/29@ BUF
2021/11/06@ SJS
2021/10/23vs BUF
2021/12/08vs PHI
2021/11/20@ TBL
2022/01/06vs CBJ
2022/03/22vs NYR
2022/02/28vs VAN
2021/11/09vs FLA
2022/02/24@ PIT
2022/04/12@ ARI
2022/01/02@ WSH
2021/10/30@ PIT
2022/03/12vs ANA
2022/03/06vs STL
2021/11/11vs NYI
2021/11/28vs PHI
2022/04/09@ DAL
2022/03/27vs MTL
2022/02/10@ STL
2022/04/18@ VGK
2022/03/08vs COL
2021/10/19vs SEA
2021/10/15vs CHI
2022/02/08@ MTL
2021/12/31vs EDM
2022/01/22vs CAR
2022/02/09@ VAN
2022/04/12vs PIT
2022/01/17vs PHI
2022/03/17@ NYR
2022/01/25vs PHI
2021/10/24@ VGK
2021/11/04@ MTL
2021/12/30vs BUF
2022/03/31vs CBJ
2022/04/01@ NYR
2022/03/10vs CBJ
2022/03/29@ CBJ
2022/04/15@ MTL
2021/12/07@ OTT
2021/12/16vs BOS
2022/03/24vs DET
2021/11/06@ WPG
2022/04/08@ CAR
2022/02/27@ ANA
2021/10/19@ CHI
2022/02/17vs BOS
2022/01/01vs EDM
2022/01/21vs ARI


In [109]:
table_data

[{'i': 1,
  'opponent': 'BUF',
  'home_game': False,
  'game_date': datetime.datetime(2021, 12, 29, 0, 0),
  'GP': 1,
  'W': 1,
  'L': 0,
  'OT': 0,
  'P': 2,
  'P%': 1.0,
  'RW': 1,
  'ROW': 1,
  'S/O Win': 0,
  'GF': 4,
  'GA': 3,
  'GF/GP': 4.0,
  'GA/GP': 3.0,
  'PP%': 50.0,
  'PK%': 100.0,
  'Net PP%': 50.0,
  'Net PK%': 100.0,
  'Shots/GP': 42.0,
  'SA/GP': 22.0,
  'FOW%': 51.3},
 {'i': 2,
  'opponent': 'SJS',
  'home_game': False,
  'game_date': datetime.datetime(2021, 11, 6, 0, 0),
  'GP': 1,
  'W': 1,
  'L': 0,
  'OT': 0,
  'P': 2,
  'P%': 1.0,
  'RW': 0,
  'ROW': 0,
  'S/O Win': 1,
  'GF': 2,
  'GA': 2,
  'GF/GP': 2.0,
  'GA/GP': 2.0,
  'PP%': 0.0,
  'PK%': 66.7,
  'Net PP%': 0.0,
  'Net PK%': 66.7,
  'Shots/GP': 28.0,
  'SA/GP': 27.0,
  'FOW%': 54.2},
 {'i': 3,
  'opponent': 'BUF',
  'home_game': True,
  'game_date': datetime.datetime(2021, 10, 23, 0, 0),
  'GP': 1,
  'W': 1,
  'L': 0,
  'OT': 0,
  'P': 2,
  'P%': 1.0,
  'RW': 0,
  'ROW': 1,
  'S/O Win': 0,
  'GF': 2,
  'GA'