In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from extract import open_driver, close_driver, wait_for_element
from transforms import parse_record, parse_game_date
from selenium.webdriver.common.by import By
from DML import db_connect, close_db_connect
from sql_queries import summary_table_insert
from psycopg2.extras import execute_batch

In [3]:
def download(url, driver):
    """Open ``url`` in ``driver`` and return the parsed rows of its stats table.

    The driver is navigated to the page, the ``root`` container and the
    ``rt-table`` element inside it are located, and the table is handed to
    ``get_table_columns`` and ``get_table_rows``.

    Args:
        url: Address of the page to load.
        driver: Browser driver; must expose ``get`` and is used as the search
            root for the first element lookup.

    Returns:
        Whatever ``get_table_rows`` returns for the located table (a list with
        one entry per table row).
    """
    driver.get(url)
    root = wait_for_element(source=driver, search_by=By.ID, target="root", unique_element=True)
    table = wait_for_element(source=root, search_by=By.CLASS_NAME, target="rt-table", unique_element=True)

    # The pagination controls are looked up but their results are never used.
    # The lookups are kept, in the original order, so behavior is unchanged.
    # NOTE(review): presumably these act as a "page fully rendered" wait, or are
    # leftovers from planned multi-page support — confirm. Only the page that
    # is currently displayed gets read.
    for pager_class in ("-pageJump", "-totalPages"):
        wait_for_element(source=root, search_by=By.CLASS_NAME, target=pager_class, unique_element=True)

    column_names = get_table_columns(table)
    return get_table_rows(table, column_names)

In [4]:
def get_table_columns(source):
    """Collect the header captions of the table found under ``source``.

    Args:
        source: Element to search within for the ``tableHeaderDiv`` container.

    Returns:
        list: The ``.text`` of every ``rt-th`` element inside the header
        container, in the order ``wait_for_element`` yields them.
    """
    header_container = wait_for_element(
        source=source,
        search_by=By.CLASS_NAME,
        target="tableHeaderDiv",
        unique_element=True,
    )

    names = []
    for header_cell in wait_for_element(source=header_container, search_by=By.CLASS_NAME, target="rt-th"):
        names.append(header_cell.text)
    return names

In [5]:
def get_table_rows(source, headers, season="2021/22"):
    """Turn every row of the table under ``source`` into an ordered value list.

    For each ``rt-tr`` element inside the ``rt-tbody`` container, the text of
    its ``rt-td`` cells is paired with ``headers``, passed to ``parse_record``,
    and the parsed values are laid out in a fixed order with the season label
    inserted as the second element.

    Args:
        source: Element to search within for the ``rt-tbody`` container.
        headers: Column captions, paired positionally with each row's cells.
            ``zip`` stops at the shorter of the two sequences, so a row with
            fewer cells than headers yields fewer pairs.
        season: Season label stored in every record. Defaults to ``"2021/22"``,
            the value that was previously hard-coded.

    Returns:
        list[list]: One 23-element list per table row, ordered as
        ``Team``, ``season``, then the keys in ``stat_keys`` below.

    Raises:
        KeyError: Presumably, if ``parse_record`` returns a mapping that lacks
            one of the expected keys — depends on the type it returns.
    """
    # Keys read from the parsed record after "Team" and the season label.
    # NOTE(review): this order presumably has to match the placeholders of the
    # insert statement the rows are later used with — confirm against it.
    stat_keys = (
        "game_date",
        "home_game",
        "opponent",
        "W",
        "L",
        "T",
        "OT",
        "P",
        "P%",
        "RW",
        "ROW",
        "S/O Win",
        "GF",
        "GA",
        "PP%",
        "PK%",
        "Net PP%",
        "Net PK%",
        "Shots/GP",
        "GA/GP",
        "FOW%",
    )

    table_body = wait_for_element(source=source, search_by=By.CLASS_NAME, target="rt-tbody", unique_element=True)
    rows = wait_for_element(source=table_body, search_by=By.CLASS_NAME, target="rt-tr")
    table_data = []

    for row in rows:
        cells = wait_for_element(source=row, search_by=By.CLASS_NAME, target="rt-td")
        row_values = [cell.text for cell in cells]

        # parse_record receives a list of (header, cell text) pairs.
        parsed = parse_record(list(zip(headers, row_values)))

        table_data.append([parsed["Team"], season, *(parsed[key] for key in stat_keys)])

    return table_data

In [6]:
def scrape_page(url):
    """Scrape the stats table at ``url`` and batch-insert its rows.

    Opens a browser driver, downloads the table rows via ``download``, then
    opens a database connection and runs ``summary_table_insert`` once per row
    through ``execute_batch``.

    The driver and the database connection are released in ``finally`` blocks,
    so a failure while scraping or inserting no longer leaves a browser or a
    connection open. Exceptions still propagate to the caller unchanged.

    Args:
        url: Address of the page to scrape.

    Returns:
        None.
    """
    driver = open_driver()
    try:
        table_data = download(url, driver)
    finally:
        # Always release the browser, even if the page failed to load or parse.
        close_driver(driver)

    # Batch upload of the scraped rows.
    conn, cur = db_connect()
    try:
        execute_batch(cur, summary_table_insert, table_data)
    finally:
        # NOTE(review): whether the insert gets committed depends on what
        # close_db_connect does, which is not visible here — confirm.
        close_db_connect(conn, cur)

In [8]:
# Stats page to scrape. The query string sets seasonFrom/seasonTo to 20212022
# and asks for page=0 with pageSize=100, sorted by points then wins.
# NOTE(review): download() reads only the table of the page that loads, so any
# rows beyond this first page are not collected — confirm that is intended.
url = "https://www.nhl.com/stats/teams?aggregate=0&reportType=game&seasonFrom=20212022&seasonTo=20212022&dateFromSeason&gameType=2&filter=gamesPlayed,gte,1&sort=points,wins&page=0&pageSize=100"

In [9]:
# Scrape the page at `url` and insert its rows into the database.
scrape_page(url=url)

['New Jersey Devils', '2021/22', datetime.datetime(2022, 4, 18, 0, 0), False, 'VGK', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 3, 2, 0.0, 100.0, 0.0, 100.0, 28.0, 2.0, 49.2]
['New Jersey Devils', '2021/22', datetime.datetime(2021, 12, 8, 0, 0), True, 'PHI', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 3, 0, 33.3, 100.0, 33.3, 100.0, 27.0, 0.0, 32.2]
['New Jersey Devils', '2021/22', datetime.datetime(2021, 12, 29, 0, 0), False, 'BUF', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 4, 3, 50.0, 100.0, 50.0, 100.0, 42.0, 3.0, 51.3]
['New Jersey Devils', '2021/22', datetime.datetime(2022, 2, 8, 0, 0), False, 'MTL', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 7, 1, 50.0, 100.0, 50.0, 100.0, 34.0, 1.0, 50.8]
['New Jersey Devils', '2021/22', datetime.datetime(2022, 2, 28, 0, 0), True, 'VAN', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 7, 2, 50.0, 100.0, 50.0, 100.0, 33.0, 2.0, 47.7]
['New Jersey Devils', '2021/22', datetime.datetime(2022, 4, 9, 0, 0), False, 'DAL', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 3, 1, 0.0, 83.3, 0.0, 83.3, 23.0, 1.0, 44.8]
['New Jersey 

['New York Islanders', '2021/22', datetime.datetime(2022, 4, 26, 0, 0), False, 'WSH', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 4, 1, 100.0, 100.0, 100.0, 125.0, 26.0, 1.0, 47.9]
['New York Islanders', '2021/22', datetime.datetime(2022, 4, 15, 0, 0), False, 'MTL', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 3, 0, 0.0, 100.0, 0.0, 100.0, 20.0, 0.0, 39.6]
['New York Islanders', '2021/22', datetime.datetime(2021, 12, 30, 0, 0), True, 'BUF', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 4, 1, 50.0, 50.0, 50.0, 50.0, 32.0, 1.0, 66.0]
['New York Islanders', '2021/22', datetime.datetime(2022, 4, 8, 0, 0), False, 'CAR', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 2, 1, 0.0, 100.0, 0.0, 100.0, 21.0, 1.0, 43.2]
['New York Islanders', '2021/22', datetime.datetime(2022, 4, 12, 0, 0), True, 'PIT', 1, 0, None, 0, 2, 1.0, 0, 0, 1, 4, 4, 33.3, 80.0, 33.3, 100.0, 37.0, 4.0, 56.9]
['New York Islanders', '2021/22', datetime.datetime(2022, 3, 13, 0, 0), True, 'ANA', 1, 0, None, 0, 2, 1.0, 1, 1, 0, 4, 3, 0.0, 100.0, 0.0, 100.0, 29.0, 3.0, 55.4]
['New Yo