In [12]:
import selenium
from selenium.webdriver import Chrome
from webdriver_manager.chrome import ChromeDriverManager
#import requests
from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common import service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
#from time import sleep, time
import time
from bs4 import BeautifulSoup as bs
#import pandas as pd
from pathlib import Path
import os
import csv
import uuid
import urllib


'''
This module defines the Scraper class, a Selenium-based helper that opens a
website in Chrome, runs a search, and collects result links and image URLs.
'''

class Scraper:
    """Selenium-driven helper that searches a site and collects links/images.

    Typical call order (as used by the script entry point of this file):
    ``accept_cookies`` -> ``search`` -> ``get_list_links`` ->
    ``get_img_links`` -> optionally ``create_id`` / ``collate_info`` /
    ``download_images``.

    Attributes set by the methods:
        driver: the Chrome WebDriver instance.
        url: the start URL opened by the constructor.
        search_term: the upper-cased search term.
        link_list: result-page URLs collected by ``get_list_links``.
        img_list: one list of image URLs per result, from ``get_img_links``.
        link_id / link_uuid: identifiers built by ``create_id``.
        info: the dict built by ``collate_info``.
    """

    def __init__(self, url, search_term, headless=False):
        """Start Chrome and open ``url``.

        Args:
            url: page to load immediately after the browser starts.
            search_term: text to search for; stored upper-cased.
            headless: when true, Chrome is started with ``--headless``.
        """
        # Local import: the module-level import of Service is commented out,
        # and Selenium 4 expects the driver path wrapped in a Service object
        # (passing the path positionally is deprecated and later removed).
        from selenium.webdriver.chrome.service import Service

        options = Options()
        if headless:
            options.add_argument('--headless')
        service = Service(ChromeDriverManager().install())
        self.driver = Chrome(service=service, options=options)
        self.url = url
        self.search_term = search_term.upper()
        # Defined up front so later steps work even if an earlier one found
        # nothing.
        self.link_list = []
        self.img_list = []
        self.driver.get(self.url)

    def open_url(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def search(self, name: str = 'query'):
        """Type the search term into the input named ``name`` and press Enter.

        Args:
            name: value of the search box's ``name`` attribute. Defaults to
                ``'query'``, the value the script entry point passes.

        Raises:
            NoSuchElementException: if no element with that name exists.
        """
        search_bar = self.driver.find_element(By.NAME, name)
        search_bar.click()
        search_bar.send_keys(self.search_term)
        search_bar.send_keys(Keys.ENTER)

    def click_button(self, XPATH):
        """Click the first element matching ``XPATH``.

        Raises:
            NoSuchElementException: if nothing matches ``XPATH``.
        """
        self.driver.find_element(By.XPATH, XPATH).click()

    def scroll_up_top(self):
        """Scroll the window back to the top of the page."""
        # document.body.scrollTop is the *current* offset, not the top, so
        # scroll to the fixed origin instead.
        self.driver.execute_script("window.scrollTo(0, 0)")

    def scroll_down_bottom(self):
        """Scroll the window to the current bottom of the page."""
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

    def accept_cookies(self, frame_id, XPATH):
        """Best-effort click on a cookie-banner button.

        Args:
            frame_id: id/name of the iframe holding the banner, or ``None``
                when the button lives in the main document.
            XPATH: locator of the button to click.

        A missing frame or button is not an error: the banner may simply not
        be shown, so those cases are ignored.
        """
        try:
            if frame_id is not None:
                self.switch_frame(frame_id)
            self.wait_for(XPATH)
            self.click_button(XPATH)
        except (NoSuchElementException, TimeoutException):
            pass
        finally:
            if frame_id is not None:
                # Return to the main document so later lookups are not
                # scoped to the banner's iframe.
                self.driver.switch_to.default_content()

    def wait_for(self, XPATH, delay=10):
        """Wait up to ``delay`` seconds for an element matching ``XPATH``.

        A timeout is reported on stdout rather than raised.
        """
        try:
            WebDriverWait(self.driver, delay).until(
                EC.presence_of_element_located((By.XPATH, XPATH))
            )
        except TimeoutException:
            print('Loading took too long. Timeout occurred.')

    def switch_frame(self, frame_id, delay=10):
        """Wait for the frame ``frame_id`` and switch the driver into it.

        Args:
            frame_id: id or name of the frame.
            delay: maximum number of seconds to wait for the frame.

        Raises:
            TimeoutException: if the frame does not become available.
        """
        WebDriverWait(self.driver, delay).until(
            EC.frame_to_be_available_and_switch_to_it(frame_id)
        )

    def quit(self):
        """Close the browser and end the WebDriver session."""
        self.driver.quit()

    def next_page(self, url):
        """Open ``url``; an alias of ``open_url`` for pagination."""
        self.open_url(url)

    def see_more(self, XPATH):
        """Scroll to the bottom and click the "see more" control at ``XPATH``.

        Raises:
            NoSuchElementException: if the control is not present.
        """
        self.scroll_down_bottom()
        self.click_button(XPATH)

    def explore_product_ideas(self, XPATH1, XPATH2):
        """Click the element at ``XPATH1`` and then the one at ``XPATH2``."""
        self.click_button(XPATH1)
        self.click_button(XPATH2)

    def infinite_scroll(self, pause=3):
        """Keep scrolling down until the page height stops growing.

        Args:
            pause: seconds to wait after each scroll before re-measuring the
                page height.
        """
        height_script = "return document.body.scrollHeight"
        last_height = self.driver.execute_script(height_script)
        while True:
            self.scroll_down_bottom()
            time.sleep(pause)
            new_height = self.driver.execute_script(height_script)
            if new_height == last_height:
                break
            last_height = new_height

    def get_list_links(self, XPATH_container, XPATH_search_results, delay=10,
                       XPATH_see_more='//*[@id="search-more"]/a'):
        """Collect the first link of every search result into ``link_list``.

        Args:
            XPATH_container: locator of the element wrapping all results.
            XPATH_search_results: locator, relative to the container, of the
                individual result elements.
            delay: unused; kept so existing callers keep working.
            XPATH_see_more: locator of the optional "see more" control that
                is clicked before scrolling to load every result.

        ``link_list`` is always (re)set, so it is an empty list when the
        container is not found. The list and its length are printed.
        """
        self.link_list = []
        try:
            self.scroll_down_bottom()
            try:
                self.see_more(XPATH_see_more)
                self.infinite_scroll()
            except NoSuchElementException:
                pass  # no "see more" control: all results are already shown
            container = self.driver.find_element(By.XPATH, XPATH_container)
        except NoSuchElementException:
            print('No results found. Try another search term.')
            return

        for result in container.find_elements(By.XPATH, XPATH_search_results):
            # find_elements returns [] instead of raising, so a result
            # without an anchor is skipped rather than aborting the run.
            anchors = result.find_elements(By.TAG_NAME, 'a')
            if not anchors:
                continue
            link = anchors[0].get_attribute('href')
            if link:
                self.link_list.append(link)

        print(self.link_list)
        print(len(self.link_list))

    def get_img_links(self, XPATH_main_image, XPATH_thumbnail_container, XPATH_thumbnails):
        """Visit every collected link and gather its image URLs.

        Args:
            XPATH_main_image: locator of the element wrapping the main image.
            XPATH_thumbnail_container: locator of the thumbnail tray.
            XPATH_thumbnails: locator, relative to the tray, of each thumbnail.

        For each link a list is appended to ``img_list``: the main image
        ``src`` followed by each thumbnail ``src``. When an expected element
        is missing on a page, ``'N/A'`` is appended to that page's list and
        processing continues with the next link. The result is printed.
        """
        self.img_list = []
        for link in getattr(self, 'link_list', []):
            self.open_url(link)
            self.individual_img_list = []
            try:
                main_image = self.driver.find_element(By.XPATH, XPATH_main_image)
                img_tag = main_image.find_element(By.TAG_NAME, 'img')
                self.individual_img_list.append(img_tag.get_attribute('src'))

                tray = self.driver.find_element(By.XPATH, XPATH_thumbnail_container)
                for thumbnail in tray.find_elements(By.XPATH, XPATH_thumbnails):
                    img_tag = thumbnail.find_element(By.TAG_NAME, 'img')
                    self.individual_img_list.append(img_tag.get_attribute('src'))
            except NoSuchElementException:
                # Mark this page as incomplete but keep going with the rest.
                self.individual_img_list.append('N/A')
            self.img_list.append(self.individual_img_list)
        print(self.img_list)

    def create_id(self):
        """Build ``link_id`` and ``link_uuid`` in parallel with ``link_list``.

        The id of a link is its last 12 characters; the uuid is a fresh
        random ``uuid.uuid4()`` per link.
        """
        self.link_id = [link[-12:] for link in self.link_list]
        self.link_uuid = [uuid.uuid4() for _ in self.link_list]

    def collate_info(self):
        """Bundle ids, uuids and URLs into ``self.info``, print and return it.

        Requires ``create_id`` to have been called first.

        Returns:
            dict with keys ``"id"``, ``"uuid"`` and ``"URL"``.
        """
        self.info = {
            "id": self.link_id,
            "uuid": self.link_uuid,
            "URL": self.link_list,
        }
        print(self.info)
        return self.info

    def download_images(self, path='.'):
        """Download every URL in ``img_list`` into ``<path>/<search_term>/``.

        Args:
            path: parent directory; the sub-directory named after the search
                term is created if needed.

        ``img_list`` entries may be plain URL strings (saved as
        ``<term><i>.png``) or lists of URLs as produced by ``get_img_links``
        (saved as ``<term><i>_<j>.png``). Empty values and the ``'N/A'``
        placeholder are skipped. The ``.png`` suffix is kept for
        compatibility with earlier output names; it is not derived from the
        actual image format.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises on a failed
            download; errors are not swallowed.
        """
        # `import urllib` alone does not guarantee the `request` submodule
        # is loaded, so import it explicitly here.
        import urllib.request

        target_dir = os.path.join(path, self.search_term)
        os.makedirs(target_dir, exist_ok=True)

        for i, entry in enumerate(self.img_list):
            if isinstance(entry, str):
                jobs = [(entry, f'{self.search_term}{i}.png')]
            else:
                jobs = [
                    (img_url, f'{self.search_term}{i}_{j}.png')
                    for j, img_url in enumerate(entry)
                ]
            for img_url, filename in jobs:
                if not img_url or img_url == 'N/A':
                    continue
                urllib.request.urlretrieve(img_url, os.path.join(target_dir, filename))



if __name__ == '__main__':

    def web_scraper():
        """Ask for a search term and scrape the matching LEGO Ideas projects.

        Steps: dismiss the cookie banner, run the search, collect the result
        links, then collect the image URLs of every result. The browser is
        always closed at the end, even if a step fails.
        """
        # Locators for the pages visited below.
        reject_cookies_button = '//button[@aria-label="Reject cookies"]'
        results_container = '//*[@id="search_results"]'
        main_image = '//div[@class="image-sizing-wrapper"]'
        thumbnail_tray = '//div[@class="thumbnails-tray"]'

        bot = Scraper(
            'https://ideas.lego.com',
            input('I would like to search for... '),
        )
        try:
            bot.accept_cookies(None, reject_cookies_button)
            bot.search(name='query')
            bot.get_list_links(results_container, './div')
            time.sleep(2)
            bot.get_img_links(
                XPATH_main_image=main_image,
                XPATH_thumbnail_container=thumbnail_tray,
                XPATH_thumbnails='./div',
            )
            # NOTE: explore_product_ideas(), create_id(), collate_info() and
            # a manual scroll + see_more() pass are not part of this run.
        finally:
            bot.quit()

    web_scraper()





Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [/Users/ESheldon/.wdm/drivers/chromedriver/mac64/100.0.4896.60/chromedriver] found in cache
  self.driver = Chrome(ChromeDriverManager().install())


['https://ideas.lego.com/projects/9193e8c1-0546-4e73-99a2-2f89c5e2ddd3', 'https://ideas.lego.com/projects/4fab9083-ec71-46bf-80d9-3d5129626a93', 'https://ideas.lego.com/projects/8529a4b5-f36a-4779-943e-55c196e772e8', 'https://ideas.lego.com/projects/aa4cfa30-e9a2-418d-a1a0-79638e50a54f', 'https://ideas.lego.com/projects/1ef44bb0-7e5a-4484-94b6-2fee44fe3dc6']
5
[['https://ideascdn.lego.com/media/generate/lego_ci/5bd2c0e4-5612-414d-9d13-69a88a25dfcf/resize:950:633/webp', 'https://ideascdn.lego.com/media/generate/lego_ci/5bd2c0e4-5612-414d-9d13-69a88a25dfcf/resize:128:85/webp', 'https://ideascdn.lego.com/media/generate/lego_ci/5d460562-b2d6-433e-9da3-78cfcf3a8f73/resize:128:85/webp', 'https://ideascdn.lego.com/media/generate/lego_ci/f542a1da-565b-42d6-beee-27356daed1df/resize:128:85/webp'], ['https://ideascdn.lego.com/media/generate/lego_ci/2ffa0dce-e1e5-4a93-997d-d0f5ef238435/resize:950:633/webp', 'https://ideascdn.lego.com/media/generate/lego_ci/2ffa0dce-e1e5-4a93-997d-d0f5ef238435/resi