In [1]:
import hashlib
import io
import logging
import os
import time
import hashlib
import base64
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
logging.basicConfig(format='%(asctime)s %(levelname)s %(process)d --- %(name)s %(funcName)20s() : %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)

In [3]:
class GearScraper:
    logger = logging.getLogger('GearScraper')

    def __init__(self):
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
    
    def parse(self, gender):
        
        board_list = []
        single_board_dict = {}
        
        url_list = self.__get_boards_url(gender, self.driver)
        for url in tqdm(url_list[:2], desc="Getting ratings..."):
            single_board_dict['id'] = self.__hashme(url)
            single_board_dict['ratings'] = self.__get_ratings(url, self.driver)
            single_board_dict['meta_data'] = self.__get_meta_data(url, self.driver)
            board_list.append(single_board_dict)
        self.logger.info("All boards have been processed.")
#         driver.close()            

        return board_list

    def __hashme(self, x):
        unique_id =  base64.b64encode(hashlib.sha1(x.encode('UTF-8')).digest())
        return str(unique_id)
        
    def __get_boards_url(self, gender, browser):
        search_url = f"https://thegoodride.com/snowboard-reviews/?{gender}=1"        
        board_urls = []
        
        browser.get(search_url)
        # click and wait
        button = browser.find_element_by_xpath('/html/body/div[5]/div/div/div/div[3]/form/div[1]/div[42]/a')
        browser.execute_script("arguments[0].click();", button)
        self.logger.info(f"Loading main page for \033[1m{gender}\033[0m boards...")
        time.sleep(10)

        elems = browser.find_elements_by_xpath('//*[@id="applications"]/*/a[@href]')
        for elem in tqdm(elems, desc="Fetching links "):
            board_urls.append(elem.get_attribute("href"))

        return board_urls
    
    def __get_ratings(self, url, driver):
        rating_dict = {}
        driver.get(url)

        # top-right table
        elems = driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[2]/div[1]/table/tbody/*')
        for row in elems:
            k = row.find_elements_by_tag_name("td")[0]
            v = row.find_elements_by_tag_name("td")[1]
            rating_dict[k.text] = v.text

        # top-left table
        elems = driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[2]/div[2]/table/tbody/*')
        for item in [e.text.split() for e in elems]:
            rating_dict[item[0]] = item[1]

        # bottom table
        b_table = []
        elems = driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[2]/div[3]/*/table/tbody/*')
        for item in [e.text.split() for e in elems]:
            rating_dict[item[0]] = item[1]
        return rating_dict
    
    def __get_meta_data(self, url, driver):
        
        meta_dict = {}
        
        driver.get(url)
        meta_dict['name'] = url.rsplit('/', 1)[1]
        meta_dict['url'] = url
        
        # Get price data
        for e in driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[1]/div/div[1]'):
            meta_dict['price'] = e.text.rsplit(' ', 1)[1]
        
        # get image url
        for e in driver.find_elements_by_xpath('//*[@id="post-"]/div[1]/div[1]/div/div[2]/a'):
            meta_dict['image_url'] = e.get_attribute("href")
            
        return meta_dict

In [4]:
scrpr = GearScraper()



18-Jun-21 13:17:23 INFO 67527 --- WDM                  log() : 

Current google-chrome version is 91.0.4472
18-Jun-21 13:17:24 INFO 67527 --- WDM                  log() : Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
18-Jun-21 13:17:24 INFO 67527 --- WDM                  log() : Get LATEST driver version for 91.0.4472
Driver [/Users/imaniai/.wdm/drivers/chromedriver/mac64/91.0.4472.101/chromedriver] found in cache
18-Jun-21 13:17:24 INFO 67527 --- WDM                  log() : Driver [/Users/imaniai/.wdm/drivers/chromedriver/mac64/91.0.4472.101/chromedriver] found in cache


In [5]:
boards_url_list = scrpr.parse('mens')

18-Jun-21 13:17:33 INFO 67527 --- GearScraper     __get_boards_url() : Loading main page for [1mmens[0m boards...
Fetching links : 100%|██████████| 434/434 [00:01<00:00, 263.97it/s]
Getting ratings...: 100%|██████████| 2/2 [00:12<00:00,  6.32s/it]
18-Jun-21 13:17:57 INFO 67527 --- GearScraper                parse() : All boards have been processed.


In [None]:
import logging
import boto3
from botocore.exceptions import ClientError

logging.basicConfig(format='%(asctime)s %(levelname)s %(process)d --- %(name)s %(funcName)20s() : %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)


def upload_object(object: bytes, bucket: str, key: str, content_type: str,
                  grant_read: str = None, metadata={}) -> bool:
    """
    Upload an image file to an S3 bucket
    object: The file in bytes to upload to s3
    bucket: Bucket to upload to
    key: The key to save the object as
    content_type: The ContentType of the object
    grant_read: Specify the read-access of the object, default is public read
    metadata: A dict to specify any metadata
    Returns:
        True if file was uploaded, else False
    """
    if grant_read is None:
        grant_read = 'uri="http://acs.amazonaws.com/groups/global/AllUsers"'
    s3_client = boto3.client('s3')
    try:
        s3_client.put_object(Body=object, Bucket=bucket, GrantRead=grant_read, ContentType=content_type, Key=key,
                             Metadata=metadata)
        logging.info(f"Successfully uploaded object to '{bucket}' as '{key}'.")
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [12]:
import json
import boto3  
from botocore.exceptions import ClientError

# s3 = boto3.resource('s3')
# s3object = s3.Object('your-bucket-name', 'your_file.json')

# s3object.put(
#     Body=(bytes(json.dumps(json_data).encode('UTF-8')))
# )

In [6]:
import json
json.dumps(boards_url_list)

'[{"id": "b\'WdfFXZW/dvMLdhtMZs3ihVDCRHI=\'", "ratings": {"Overall Rating": "", "Riding Style": "Freestyle", "Riding Level": "Intermediate - Expert", "Fits Boot size (US)": "8-10", "Manufactured in": "Tunisia by Nidecker", "Shape": "True Twin", "Camber Profile": "Hybrid Camber", "Stance": "Centered", "Approx. Weight": "Feels Normal", "Powder": "Average", "Turning": "Experience", "Carving": "Average", "Speed": "Average", "Uneven": "Terrain", "Switch": "Great", "Jumps": "Good", "Jibbing": "Great", "Pipe": "Good", "On": "Snow", "Turn": "Initiation", "Skidded": "Turns", "Flex": "Medium/Soft", "Buttering": "Easy", "Edge": "Hold"}, "meta_data": {"name": "alloy-b-bomb-gt-2020-snowboard-review", "url": "https://thegoodride.com/snowboard-reviews/alloy-b-bomb-gt-2020-snowboard-review", "price": "$0", "image_url": "https://thegoodride.com/assets/lg-gallery/alloy/snowboards/b-bomb-gt/alloy-b-bomb-gt.jpg"}}, {"id": "b\'WdfFXZW/dvMLdhtMZs3ihVDCRHI=\'", "ratings": {"Overall Rating": "", "Riding Style