In [43]:
# Reload edited project modules automatically before each cell runs, so changes
# to the imported helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
import pandas as pd
from extract import open_driver, close_driver, wait_for_element
from transforms import parse_record, parse_game_date, game_season
from selenium.webdriver.common.by import By
from DML import db_connect, close_db_connect
from psycopg2.extras import execute_batch
from sql_queries import summary_table_insert
import urllib.parse
from datetime import datetime
import time

In [45]:
def get_table_columns(table_element):
    """Return the header text of each column found in ``table_element``.

    Looks up the "table-header-container" element inside ``table_element``
    (requested with ``unique_element=True``), then collects the ``.text`` of
    every "rt-th" element located inside that container.

    Args:
        table_element: element to search within; passed to ``wait_for_element``
            as ``source``.

    Returns:
        list: the ``.text`` value of each header element, in the order returned
        by ``wait_for_element``.
    """
    container = wait_for_element(
        source=table_element,
        search_by=By.CLASS_NAME,
        target="table-header-container",
        unique_element=True,
    )
    header_cells = wait_for_element(
        source=container,
        search_by=By.CLASS_NAME,
        target="rt-th",
    )

    column_names = []
    for cell in header_cells:
        column_names.append(cell.text)

    return column_names

def get_table_rows(table_element):
    """Return the row elements contained in the body of ``table_element``.

    Looks up the "rt-tbody" element inside ``table_element`` (requested with
    ``unique_element=True``) and returns whatever ``wait_for_element`` finds
    for the "rt-tr" class inside it.

    Args:
        table_element: element to search within; passed to ``wait_for_element``
            as ``source``.

    Returns:
        The result of the "rt-tr" lookup, unmodified.
    """
    body = wait_for_element(
        source=table_element,
        search_by=By.CLASS_NAME,
        target="rt-tbody",
        unique_element=True,
    )
    return wait_for_element(source=body, search_by=By.CLASS_NAME, target="rt-tr")

In [46]:
class Scraper:
    """Base class holding the browser driver, database handles and staged rows.

    Subclasses are expected to set ``table_insert`` to the INSERT statement for
    their target table and to fill ``staged_data`` with one parameter sequence
    per row before calling ``load``.
    """

    def __init__(self):
        # Handles are created lazily by open_driver() / db_connect().
        self.driver = None
        self.conn = None
        self.cur = None
        self.table_data = None
        self.table_headers = None
        # SQL statement executed by load(); empty until a subclass sets it.
        self.table_insert = ""
        # Rows waiting to be written by load().
        self.staged_data = []

        # Default query-string parameters for the report URL.
        self.url_dict = {
            "aggregate": 0,
            "reportType": "game",
            "pageSize": 100
        }

    def open_driver(self):
        """Create a browser driver via the imported ``open_driver`` helper."""
        self.driver = open_driver()

    def close_driver(self):
        """Close the driver if one is open, then forget it.

        The handle is cleared even when closing raises, so a repeated call (or a
        later ``open_driver``/``close_driver`` cycle on the same instance) never
        tries to close the same driver twice.
        """
        if self.driver:
            try:
                close_driver(self.driver)
            finally:
                self.driver = None

    def db_connect(self):
        """Open a database connection and cursor via the imported ``db_connect`` helper."""
        self.conn, self.cur = db_connect()

    def close_db_connect(self):
        """Close the connection and cursor if both are set, then forget them.

        As with ``close_driver``, the handles are cleared even when closing
        raises so the instance never keeps references to closed resources.
        """
        if self.conn and self.cur:
            try:
                close_db_connect(self.conn, self.cur)
            finally:
                self.conn = None
                self.cur = None

    def load(self):
        """Execute ``table_insert`` once per staged row using ``execute_batch``.

        NOTE(review): no commit is issued here; presumably the connection
        autocommits or ``close_db_connect`` commits — confirm against DML.
        """
        execute_batch(self.cur, self.table_insert, self.staged_data)
        print(f"Loaded {len(self.staged_data)} records")

In [47]:
class SummaryScraper(Scraper):
    """Scrapes the paginated team report at ``base_url`` and stages rows for insert.

    Typical use is ``etl(start)``, which opens the driver and database
    connection, builds the query parameters, scrapes every page, loads the
    staged rows and always releases the driver and connection afterwards.
    """

    # Consecutive failed attempts on a single page before that page is skipped.
    MAX_PAGE_ERRORS = 3

    # Keys read from each parsed record, in the order they are staged (after the
    # team abbreviation). Presumably this matches the column order expected by
    # summary_table_insert — confirm against sql_queries.
    _STAGED_KEYS = (
        "season",
        "game_date",
        "home_game",
        "opponent",
        "W",
        "L",
        "T",
        "OT",
        "P",
        "P%",
        "RW",
        "ROW",
        "S/O Win",
        "GF",
        "GA",
        "PP%",
        "PK%",
        "Net PP%",
        "Net PK%",
        "Shots/GP",
        "GA/GP",
        "FOW%",
    )

    def __init__(self):
        super().__init__()
        self.table_insert = summary_table_insert
        self.base_url = "https://www.nhl.com/stats/teams?"
        self.url_dict = {
            "aggregate": 0,
            "reportType": "game",
            "page": 0,
            "pageSize": 100,
            "gameType": 2
        }

    def build_url(self, start=None, end=None):
        """Set the season or date-range filters in ``url_dict``.

        Args:
            start: either a season start year (``int`` or numeric ``str``), which
                selects the single season ``<start><start+1>``, or a ``datetime``
                marking the first date of a date range. Any other value leaves
                ``url_dict`` untouched.
            end: optional ``datetime`` closing the date range; only used when
                ``start`` is a ``datetime``. Defaults to today's date.
        """
        if (isinstance(start, str) and start.isnumeric()) or isinstance(start, int):
            start = int(start)

            # Drop date-range filters left over from an earlier call on this instance.
            for stale_key in ("dateFrom", "dateTo"):
                self.url_dict.pop(stale_key, None)

            self.url_dict["seasonFrom"] = f"{start}{start+1}"
            self.url_dict["seasonTo"] = f"{start}{start+1}"

            # urlencode renders this empty list as the literal text "[]".
            # NOTE(review): presumably the site only needs the key to be present — confirm.
            self.url_dict["dateFromSeason"] = []

        elif isinstance(start, datetime):
            # Drop season filters left over from an earlier call on this instance.
            for stale_key in ("seasonFrom", "seasonTo", "dateFromSeason"):
                self.url_dict.pop(stale_key, None)

            self.url_dict["dateFrom"] = start.strftime("%Y-%m-%d")

            # Store dateTo as text in the same format as dateFrom.
            range_end = end if isinstance(end, datetime) else datetime.now()
            self.url_dict["dateTo"] = range_end.strftime("%Y-%m-%d")

    def transform(self, headers, row_elements):
        """Parse one page of table rows and append them to ``staged_data``.

        Rows are collected locally and only added to ``staged_data`` once the
        whole page has been parsed, so a page that fails midway and is retried
        by ``extract`` cannot leave duplicate rows behind.

        Args:
            headers: column names for the table, aligned with each row's cells.
            row_elements: iterable of row elements; each is searched for "rt-td" cells.

        Raises:
            TypeError: if a team name has no match in the ``teams`` table
                (``fetchone()`` returns ``None``).
        """
        page_rows = []
        # Team name -> abbreviation, so each team is looked up once per page.
        abbreviations = {}

        for row in row_elements:
            row_cells = wait_for_element(source=row, search_by=By.CLASS_NAME, target="rt-td")
            row_values = [cell.text for cell in row_cells]

            values_map = list(zip(headers, row_values))
            map_dict = parse_record(values_map)

            team = map_dict["Team"]
            if team not in abbreviations:
                self.cur.execute("SELECT abbreviation FROM teams WHERE team = %s;", (team,))
                abbreviations[team] = self.cur.fetchone()[0]

            page_rows.append(
                [abbreviations[team]] + [map_dict[key] for key in self._STAGED_KEYS]
            )

        self.staged_data.extend(page_rows)

    def extract(self, url):
        """Load every page of the report and stage its rows.

        The page URL is always rebuilt from ``base_url`` and ``url_dict``; the
        ``url`` argument is accepted for compatibility but not used.

        A page that raises ``AttributeError`` while being scraped is retried;
        after ``MAX_PAGE_ERRORS`` consecutive failures it is skipped and
        scraping continues with the next page.

        Raises:
            RuntimeError: if the page count has never been read successfully
                and the current page has exhausted its retries, since the loop
                would otherwise have no way of knowing when to stop.
        """
        self.url_dict["page"] = 0
        self.staged_data = []
        errors = 0
        # Unknown until the pagination control has been read at least once.
        total_pages = None

        while total_pages is None or self.url_dict["page"] < total_pages:
            page_url = self.base_url + urllib.parse.urlencode(self.url_dict)

            self.driver.get(page_url)
            try:
                # scrape the page's main table
                root_element = wait_for_element(source=self.driver, search_by=By.ID, target="root", unique_element=True)

                pagination = wait_for_element(source=root_element, search_by=By.CLASS_NAME, target="pagination", unique_element=True)
                page_number_field = wait_for_element(source=pagination, search_by=By.TAG_NAME, target="input", unique_element=True)
                total_pages = int(page_number_field.get_attribute('max'))

                data_table = wait_for_element(source=root_element, search_by=By.CLASS_NAME, target="rt-table", unique_element=True)

                table_headers = get_table_columns(data_table)
                row_elements = get_table_rows(data_table)

                self.transform(table_headers, row_elements)

            except AttributeError as exc:
                errors += 1

                if errors < self.MAX_PAGE_ERRORS:
                    # Retry the same page.
                    continue

                if total_pages is None:
                    raise RuntimeError(
                        f"Could not read the page count after {errors} attempts: {page_url}"
                    ) from exc

                # Report the page being skipped before moving past it.
                print("Skipped page ", self.url_dict["page"])

            # Page finished (scraped or skipped): advance and reset the retry counter.
            self.url_dict["page"] += 1
            errors = 0

    def etl(self, start, end=None):
        """Run the full extract-transform-load cycle for one season or date range.

        Args:
            start: season start year or range start ``datetime``; see ``build_url``.
            end: optional range end ``datetime``; see ``build_url``.
        """
        try:
            # the chrome driver and database cursor are used in multiple scripts
            self.open_driver()
            self.db_connect()

            self.build_url(start=start, end=end)
            self.extract(start)
            self.load()
        finally:
            # close the driver and the cursor even when scraping or loading fails
            try:
                self.close_driver()
            finally:
                self.close_db_connect()

In [42]:
# Scrape and load each season in the range; a season is identified by its start year.
scraper = SummaryScraper()
seasons = range(2017, 2018)
for i in seasons:
    scraper.etl(start=i)
    # Pause between seasons only: sleeping after the final season just delays
    # the cell by two minutes without spacing out any further requests.
    if i != seasons[-1]:
        time.sleep(120)

WebDriverException: Message: disconnected: Unable to receive message from renderer
  (Session info: headless chrome=101.0.4951.64)
Stacktrace:
0   chromedriver                        0x00000001061d43d9 chromedriver + 5104601
1   chromedriver                        0x0000000106164bf3 chromedriver + 4647923
2   chromedriver                        0x0000000105d55b48 chromedriver + 392008
3   chromedriver                        0x0000000105d40c12 chromedriver + 306194
4   chromedriver                        0x0000000105d3fb6a chromedriver + 301930
5   chromedriver                        0x0000000105d4006c chromedriver + 303212
6   chromedriver                        0x0000000105d5fa06 chromedriver + 432646
7   chromedriver                        0x0000000105d58498 chromedriver + 402584
8   chromedriver                        0x0000000105d5819e chromedriver + 401822
9   chromedriver                        0x0000000105d588d2 chromedriver + 403666
10  chromedriver                        0x0000000105d58bec chromedriver + 404460
11  chromedriver                        0x0000000105d855fe chromedriver + 587262
12  chromedriver                        0x0000000105da86c2 chromedriver + 730818
13  chromedriver                        0x0000000105d7f635 chromedriver + 562741
14  chromedriver                        0x0000000105da87ce chromedriver + 731086
15  chromedriver                        0x0000000105dbb091 chromedriver + 807057
16  chromedriver                        0x0000000105da85b3 chromedriver + 730547
17  chromedriver                        0x0000000105d7e139 chromedriver + 557369
18  chromedriver                        0x0000000105d7f165 chromedriver + 561509
19  chromedriver                        0x000000010619339d chromedriver + 4838301
20  chromedriver                        0x00000001061abcde chromedriver + 4938974
21  chromedriver                        0x00000001061b0b5e chromedriver + 4959070
22  chromedriver                        0x00000001061ac94a chromedriver + 4942154
23  chromedriver                        0x000000010618833c chromedriver + 4793148
24  chromedriver                        0x00000001061c63b8 chromedriver + 5047224
25  chromedriver                        0x00000001061c653f chromedriver + 5047615
26  chromedriver                        0x00000001061db705 chromedriver + 5134085
27  libsystem_pthread.dylib             0x00007ff80e7ff4e1 _pthread_start + 125
28  libsystem_pthread.dylib             0x00007ff80e7faf6b thread_start + 15
