In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import pandas as pd
from extract import open_driver, close_driver, wait_for_element
from transforms import parse_record, parse_game_date, game_season
from selenium.webdriver.common.by import By
from DML import db_connect, close_db_connect
from psycopg2.extras import execute_batch
from sql_queries import summary_table_insert
import urllib.parse
from datetime import datetime

In [24]:
def get_table_columns(table_element):
    header_container = wait_for_element(source=table_element, search_by=By.CLASS_NAME, target="table-header-container", unique_element=True)
    header_elements = wait_for_element(source=header_container, search_by=By.CLASS_NAME, target="rt-th")
    header_names = [header.text for header in header_elements]
    
    return header_names

def get_table_rows(table_element):
    table_rows = wait_for_element(source=table_element, search_by=By.CLASS_NAME, target="rt-tbody", unique_element=True)
    rows = wait_for_element(source=table_rows, search_by=By.CLASS_NAME, target="rt-tr")
    
    return rows

In [25]:
class Scraper:
    def __init__(self):
        self.driver = None
        self.conn = None
        self.cur = None
        self.table_data = None
        self.table_headers = None
        self.table_insert = ""
        self.staged_data = []
        
        self.url_dict = {
            "aggregate": 0,
            "reportType": "game",
            "pageSize": 100
        }
    
    def open_driver(self):
        self.driver = open_driver()
    
    def close_driver(self):
        if self.driver:
            close_driver(self.driver)
            
    def db_connect(self):
        self.conn, self.cur = db_connect()
    
    def close_db_connect(self):
        if self.conn and self.cur:
            close_db_connect(self.conn, self.cur)
            
    def extract(self, url):
        """sends requests to the url and scrapes the table elements (header and rows)"""
        self.url_dict["page"] = 0
        
        while True:
            url = self.base_url + urllib.parse.urlencode(self.url_dict)

            self.driver.get(url)
            # scrape the page's main table
            root_element = wait_for_element(source=self.driver, search_by=By.ID, target="root", unique_element=True)
        
            pagination = wait_for_element(source=root_element, search_by=By.CLASS_NAME, target="pagination", unique_element=True)
            page_number_field = wait_for_element(source=pagination, search_by=By.TAG_NAME, target="input", unique_element=True)
            total_pages = int(page_number_field.get_attribute('max'))
            
#             print("Total pages: ", total_pages)
#             print("Current page: ", self.url_dict["page"])

            data_table = wait_for_element(source=root_element, search_by=By.CLASS_NAME, target="rt-table", unique_element=True)
            
            header_elements = get_table_columns(data_table)
            row_elements = get_table_rows(data_table)
            
            self.url_dict["page"] += 1
            
            if self.url_dict["page"] > total_pages:
                break
        
        
#         print(total_pages)
        
#         data_table = wait_for_element(source=root_element, search_by=By.CLASS_NAME, target="rt-table", unique_element=True)

#         header_elements = get_table_columns(data_table)
#         row_elements = get_table_rows(data_table)
        
#         return header_elements, row_elements
    

    def load(self):
        execute_batch(self.cur, self.table_insert, self.staged_data)

In [26]:
class SummaryScraper(Scraper):
    def __init__(self):
        super().__init__()
        self.table_insert = summary_table_insert
        self.base_url = "https://www.nhl.com/stats/teams?"
        self.url_dict = {
            "aggregate": 0,
            "reportType": "game",
            "page": 0,
            "pageSize": 100,
            "gameType": 2
        }
    
    
    def build_url(self, start=None, end=None):
        if (isinstance(start, str) and start.isnumeric()) or isinstance(start, int):
            start = int(start)
            self.url_dict["seasonFrom"] = f"{start}{start+1}"
            self.url_dict["seasonTo"] = f"{start}{start+1}"
            
            self.url_dict["dateFromSeason"] = []
            
        elif isinstance(start, datetime):            
            self.url_dict["dateFrom"] = start.strftime("%Y-%m-%d")
            
            if isinstance(end, datetime):
                self.url_dict["dateTo"] = end.strftime("%Y-%m-%d")
            else:
                self.url_dict["dateTo"] = datetime.now().date()        
    
    
    def transform(self, headers, row_elements):        
        for row in row_elements:
            row_cells = wait_for_element(source=row, search_by=By.CLASS_NAME, target="rt-td")
            row_values = [cell.text for cell in row_cells]
            
            values_map = list(zip(headers, row_values))
            map_dict = parse_record(values_map)
            
            self.cur.execute("SELECT abbreviation FROM teams WHERE team = %s;", (map_dict["Team"],))
            team_abbreviation = self.cur.fetchone()[0]
            
            ordered_data_list = [
                team_abbreviation,
                map_dict["season"],
                map_dict["game_date"],
                map_dict["home_game"],
                map_dict["opponent"],
                map_dict["W"],
                map_dict["L"],
                map_dict["T"],
                map_dict["OT"],
                map_dict["P"],
                map_dict["P%"],
                map_dict["RW"],
                map_dict["ROW"],
                map_dict["S/O Win"],
                map_dict["GF"],
                map_dict["GA"],
                map_dict["PP%"],
                map_dict["PK%"],
                map_dict["Net PP%"],
                map_dict["Net PK%"],
                map_dict["Shots/GP"],
                map_dict["GA/GP"],
                map_dict["FOW%"]
            ]
                    
            self.staged_data.append(ordered_data_list)
            
            
    def extract(self, url):
        """sends requests to the url and scrapes the table elements (header and rows)"""
        self.url_dict["page"] = 0
        
        while True:
            url = self.base_url + urllib.parse.urlencode(self.url_dict)

            self.driver.get(url)
            # scrape the page's main table
            root_element = wait_for_element(source=self.driver, search_by=By.ID, target="root", unique_element=True)
        
            pagination = wait_for_element(source=root_element, search_by=By.CLASS_NAME, target="pagination", unique_element=True)
            page_number_field = wait_for_element(source=pagination, search_by=By.TAG_NAME, target="input", unique_element=True)
            total_pages = int(page_number_field.get_attribute('max'))
            
#             print("Total pages: ", total_pages)
#             print("Current page: ", self.url_dict["page"])

            data_table = wait_for_element(source=root_element, search_by=By.CLASS_NAME, target="rt-table", unique_element=True)
            
            header_elements = get_table_columns(data_table)
            row_elements = get_table_rows(data_table)
            
            self.transform(table_headers, table_rows)
            
            
            self.url_dict["page"] += 1
            
            if self.url_dict["page"] > total_pages:
                break
                
    
    def etl(self, start):
        # the chrome driver and database cursor are used in multiple scripts
        self.open_driver()
        self.db_connect()
        
        self.build_url(start=start)
        self.extract(start)
#         url = self.build_url(start=start)
#         table_headers, table_rows = self.extract(url)

#         self.transform(table_headers, table_rows)        
        
#         execute batch loading
#         self.load()
        
        # close the driver and the cursor
        self.close_driver()
        self.close_db_connect()
        
#         return parsed_data

In [27]:
scraper = SummaryScraper()

In [28]:
summary_table_data = scraper.etl(start=2020)

https://www.nhl.com/stats/teams?aggregate=0&reportType=game&page=0&pageSize=100&gameType=2&seasonFrom=20202021&seasonTo=20202021&dateFromSeason=%5B%5D
Total pages:  18
Current page:  0
https://www.nhl.com/stats/teams?aggregate=0&reportType=game&page=1&pageSize=100&gameType=2&seasonFrom=20202021&seasonTo=20202021&dateFromSeason=%5B%5D
Total pages:  18
Current page:  1
https://www.nhl.com/stats/teams?aggregate=0&reportType=game&page=2&pageSize=100&gameType=2&seasonFrom=20202021&seasonTo=20202021&dateFromSeason=%5B%5D
Total pages:  18
Current page:  2
https://www.nhl.com/stats/teams?aggregate=0&reportType=game&page=3&pageSize=100&gameType=2&seasonFrom=20202021&seasonTo=20202021&dateFromSeason=%5B%5D
Total pages:  18
Current page:  3
https://www.nhl.com/stats/teams?aggregate=0&reportType=game&page=4&pageSize=100&gameType=2&seasonFrom=20202021&seasonTo=20202021&dateFromSeason=%5B%5D
Total pages:  18
Current page:  4
https://www.nhl.com/stats/teams?aggregate=0&reportType=game&page=5&pageSize

AttributeError: 'list' object has no attribute 'find_element'