In [1]:
import hashlib
import io
import logging
import os
import time
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
logging.basicConfig(format='%(asctime)s %(levelname)s %(process)d --- %(name)s %(funcName)20s() : %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)

In [42]:
class GearScraper:
    logger = logging.getLogger('GearScraper')

    def __init__(self):
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
    
    def parse(self, gender):
        
        board_list = []
        
        url_list = self.__get_boards_url(gender, self.driver)
        for url in tqdm(url_list[:2], desc="Getting ratings..."):
            ratings = self.__get_ratings(url, self.driver)
            ratings['meta_data'] = self.__get_meta_data(url, self.driver)
            board_list.append(ratings)
        self.logger.info("All boards have been processed.")
#         driver.close()            

        return board_list
        
    def __get_boards_url(self, gender, browser):
        search_url = f"https://thegoodride.com/snowboard-reviews/?{gender}=1"        
        board_urls = []
        
        browser.get(search_url)
        # click and wait
        button = browser.find_element_by_xpath('/html/body/div[5]/div/div/div/div[3]/form/div[1]/div[42]/a')
        browser.execute_script("arguments[0].click();", button)
        self.logger.info(f"Loading main page for {gender} boards...")
        time.sleep(10)

        elems = browser.find_elements_by_xpath('//*[@id="applications"]/*/a[@href]')
        for elem in tqdm(elems, desc="Fetching links "):
            board_urls.append(elem.get_attribute("href"))

        return board_urls
    
    def __get_ratings(self, url, driver):
        rating_dict = {}
        driver.get(url)

        # top-right table
        elems = driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[2]/div[1]/table/tbody/*')
        for row in elems:
            k = row.find_elements_by_tag_name("td")[0]
            v = row.find_elements_by_tag_name("td")[1]
            rating_dict[k.text] = v.text

        # top-left table
        elems = driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[2]/div[2]/table/tbody/*')
        for item in [e.text.split() for e in elems]:
            rating_dict[item[0]] = item[1]

        # bottom table
        b_table = []
        elems = driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[2]/div[3]/*/table/tbody/*')
        for item in [e.text.split() for e in elems]:
            rating_dict[item[0]] = item[1]
        return rating_dict
    
    def __get_meta_data(self, url, driver):
        
        meta_dict = {}
        
        driver.get(url)
        meta_dict['name'] = url.rsplit('/', 1)[1]
        meta_dict['url'] = url
        
        # Get price data
        for e in driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[1]/div/div[1]'):
            meta_dict['price'] = e.text.rsplit(' ', 1)[1]
        
        # get image url
        for e in driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[1]/div/div[2]/a'):
            meta_dict['image_url'] = e.get_attribute("href")
            
        return meta_dict

In [43]:
scrpr = GearScraper()



18-Jun-21 01:16:49 INFO 26884 --- WDM                  log() : 

Current google-chrome version is 91.0.4472
18-Jun-21 01:16:49 INFO 26884 --- WDM                  log() : Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
18-Jun-21 01:16:49 INFO 26884 --- WDM                  log() : Get LATEST driver version for 91.0.4472
Driver [/Users/imaniai/.wdm/drivers/chromedriver/mac64/91.0.4472.101/chromedriver] found in cache
18-Jun-21 01:16:49 INFO 26884 --- WDM                  log() : Driver [/Users/imaniai/.wdm/drivers/chromedriver/mac64/91.0.4472.101/chromedriver] found in cache


In [44]:
boards_url_list = scrpr.parse('mens')

18-Jun-21 01:16:56 INFO 26884 --- GearScraper     __get_boards_url() : Loading main page for mens boards...
Fetching links : 100%|██████████| 434/434 [00:01<00:00, 269.39it/s]
Getting ratings...: 100%|██████████| 2/2 [00:10<00:00,  5.15s/it]
18-Jun-21 01:17:18 INFO 26884 --- GearScraper                parse() : All boards have been processed.


In [45]:
boards_url_list

[{'Overall Rating': 'Liked it!',
  'Riding Style': 'All Mountain Freestyle',
  'Riding Level': 'Beginner - Expert',
  'Fits Boot size (US)': '8-10',
  'Manufactured in': 'Tunisia by Nidecker',
  'Shape': 'True Twin',
  'Camber Profile': 'Hybrid Camber',
  'Stance': 'Centered',
  'Approx. Weight': 'Feels Normal',
  'Powder': 'Average',
  'Turning': 'Experience',
  'Carving': 'Good',
  'Speed': 'Great',
  'Uneven': 'Terrain',
  'Switch': 'Great',
  'Jumps': 'Good',
  'Jibbing': 'Average',
  'Pipe': 'Great',
  'On': 'Snow',
  'Turn': 'Initiation',
  'Skidded': 'Turns',
  'Flex': 'Medium',
  'Buttering': 'Semi-Easy',
  'Edge': 'Hold',
  'meta_data': {'name': 'alloy-b-bomb-2016-2020-snowboard-review',
   'url': 'https://thegoodride.com/snowboard-reviews/alloy-b-bomb-2016-2020-snowboard-review',
   'price': '$549',
   'image_url': 'https://thegoodride.com/assets/lg-gallery/alloy/snowboards/b-bomb/alloy-b-bomb.jpg'}},
 {'Overall Rating': '',
  'Riding Style': 'Freestyle',
  'Riding Level': 'I