In [2]:
import selenium
from selenium.webdriver import Chrome
from webdriver_manager.chrome import ChromeDriverManager
import requests
from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common import service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
#from time import sleep, time
import time
from bs4 import BeautifulSoup as bs
#import pandas as pd
from pathlib import Path
import os
import json
import csv
import uuid
import urllib


'''
This module contains the scraper class and its methods.
'''

class Scraper:
    def __init__(self, url, search_term, headless=False):
        options = Options()
        if headless:
            options.add_argument('--headless')
            self.driver = Chrome(ChromeDriverManager().install(), options=options)
        else:
            self.driver = Chrome(ChromeDriverManager().install())
        self.url = url
        self.search_term = search_term.upper()
        self.driver.get(self.url)
   
    def open_url(self, url):
        self.driver.get(url)
    
    def search(self, name=str):
        search_bar = self.driver.find_element(By.NAME, name)
        search_bar.click()
        search_bar.send_keys(self.search_term)
        search_bar.send_keys(u'\ue007')

    def click_button(self, XPATH):
        button = self.driver.find_element(By.XPATH, XPATH)
        button.click()

    def scroll_up_top(self):
        self.driver.execute_script("window.scrollTo(0,document.body.scrollTop)")

    def scroll_down_bottom(self):
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

    def accept_cookies(self, frame_id, XPATH):
        #time.sleep(2)
        try:
            if frame_id!=None:
                self.switch_frame(frame_id)
            else: pass
            self.wait_for(XPATH)
            self.click_button(XPATH)
        except NoSuchElementException:
            pass

    def wait_for(self, XPATH, delay = 10):
        try:    
            WebDriverWait(self.driver, delay).until(EC.presence_of_element_located((By.XPATH, XPATH)))
        except TimeoutException:
            print('Loading took too long. Timeout occurred.')

    def switch_frame(self, frame_id):
        self.wait_for(frame_id)
        self.driver.switchTo().frame(frame_id)

    def quit(self):
        self.driver.quit()

    def next_page(self, url):
        self.open_url(url)

    def see_more(self, XPATH):
        self.scroll_down_bottom()
        self.click_button(XPATH)
        
    def explore_product_ideas(self, XPATH1, XPATH2):
        self.click_button(XPATH1)
        self.click_button(XPATH2)
    
    def infinite_scroll(self):
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.scroll_down_bottom()
            time.sleep(3)   
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def get_list_links(self, XPATH_container, XPATH_search_results, delay=10):
        try: 
            self.scroll_down_bottom()
            try:
                self.see_more('//*[@id="search-more"]/a')
                self.infinite_scroll()
                pass
            except NoSuchElementException:
                pass
            container = self.driver.find_element(By.XPATH, XPATH_container)
            search_list = container.find_elements(By.XPATH, XPATH_search_results)

            self.link_list = []

            for result in search_list:
                a_tag = result.find_element(By.TAG_NAME, 'a')
                link = a_tag.get_attribute('href')
                self.link_list.append(link)
            
            # print(self.link_list)
            # print(len(self.link_list))

        except NoSuchElementException:
            print('No results found. Try another search term.')
            pass #don't want it to pass. Want it to start again.
    
    def get_img_links(self, XPATH_main_image, XPATH_thumbnail_container, XPATH_thumbnails):
        self.img_list = []
        try:
            for link in self.link_list:
                self.open_url(link)
                individual_img_list = []
                main_image = self.driver.find_element(By.XPATH, XPATH_main_image)
                img_tag = main_image.find_element(By.TAG_NAME, 'img')
                img_link = img_tag.get_attribute('src')
                individual_img_list.append(img_link)
                thumbnail_container = self.driver.find_element(By.XPATH, XPATH_thumbnail_container)
                thumbnail_list = thumbnail_container.find_elements(By.XPATH, XPATH_thumbnails)
                for thumbnail in thumbnail_list:
                    img_tag = thumbnail.find_element(By.TAG_NAME, 'img')
                    thumbnail_link = img_tag.get_attribute('src')
                    individual_img_list.append(thumbnail_link)
                self.img_list.append(individual_img_list)  
        except NoSuchElementException:
            self.individual_img_list.append('N/A')
            self.img_list.append(self.individual_img_list)
            pass
        #print(self.img_list)
   
    # def get_info(self, XPATH_name, XPATH_date, XPATH_creator, XPATH_supporters, XPATH_days):
    #     self.name_list = []
    #     self.date_list = []
    #     self.creator_list =[]
    #     self.num_supporters_list = []
    #     self.num_days_remaining_list = []
    #     for link in self.link_list:
    #         self.open_url(link)
    #         self.get_details(lst_name = self.name_list, XPATH= XPATH_name)
    #         self.get_details(lst_name = self.date_list, XPATH= XPATH_date)
    #         self.get_details(lst_name = self.creator_list, XPATH= XPATH_creator)
    #         self.get_details(lst_name = self.num_supporters_list, XPATH= XPATH_supporters)
    #         self.get_details(lst_name = self.num_days_remaining_list, XPATH= XPATH_days)
    #         # name = self.driver.find_element(By.XPATH, XPATH_name)
    #         # self.name_list.append(name)
    #         # date = self.driver.find_element(By.XPATH, XPATH_date)
    #         # self.name_list.append(date)
    
    # def get_details(self, XPATH, lst_name):
    #     detail = self.driver.find_element(By.XPATH, XPATH)
    #     lst_name.append(detail)
    
    def create_id(self):
        self.link_id = []
        self.link_uuid = []
        for i in range(len(self.link_list)):
            ID = self.link_list[i][-12:]
            UUID = str(uuid.uuid4())
            self.link_id.append(ID)
            self.link_uuid.append(UUID)
    
    # def collate_info(self):
    #     self.info = {"id": self.link_id,
    #             "uuid": self.link_uuid,
    #             "URL": self.link_list,
    #             "idea_name": self.name_list,
    #             "date": self.date_list,
    #             "creator": self.creator_list,
    #             "number_of_supporters": self.num_supporters_list,
    #             "number_of_days_remaining": self.num_days_remaining_list,
    #             "image_links": self.img_list}
    #     print(self.info) #delete later
    #     return self.info

    def get_html(self, url):
        r = requests.get(url)
        self.soup = bs(r.text, 'html.parser')

    def find_in_html(self, tag, attribute, attribute_name):
        self.soup.find(tag, {attribute: attribute_name}).text

    # def get_info_from_html(self):
    #     self.name_list = []
    #     self.date_list = []
    #     self.creator_list =[]
        
    #     for link in self.link_list:
    #         self.get_html(link)
    #         name = self.soup.find('h1').text
    #         self.name_list.append(name)

    #         date = self.soup.find('span', {"class":"published-date"}).text
    #         self.date_list.append(date)

    #         creator_name = self.soup.find('a', {'data-axl':"alias"}).text
    #         self.creator_list.append(creator_name)

    # def get_info_from_java(self):
    #     self.num_supporters_list = []
    #     self.num_days_remaining_list = []
    #     for link in self.link_list:
    #         self.driver.get(link)
    #         soup = bs(self.driver.page_source, 'html.parser')
    #         numbers = soup.findAll('div', class_= "count")
    #         self.num_supporters_list.append(numbers[0].text)
    #         self.num_days_remaining_list.append(numbers[1].text)

    def download_raw_data(self,path='.'):
        if not os.path.exists(f'{path}/raw_data'):
            os.makedirs(f'{path}/raw_data')
        with open (f'{path}/raw_data/data.json', 'w') as f:
            json.dump(self.info, f, indent="")


    def download_images(self, path='.'):
        if not os.path.exists(f'{path}/{self.search_term}'):
            os.makedirs(f'{path}/{self.search_term}')

        for i, lst in enumerate(self.img_list):
            for j, img in enumerate(lst):
                urllib.request.urlretrieve(img, f'{path}/{self.search_term}/{self.search_term}{i}.{j}.webp')

class LegoScraper(Scraper):
    def get_info_from_java(self):
        self.num_supporters_list = []
        self.num_days_remaining_list = []
        for link in self.link_list:
            self.driver.get(link)
            soup = bs(self.driver.page_source, 'html.parser')
            numbers = soup.findAll('div', class_= "count")
            self.num_supporters_list.append(numbers[0].text)
            self.num_days_remaining_list.append(numbers[1].text)
        print(self.num_supporters_list)
    
    def get_info_from_html(self):
        self.name_list = []
        self.date_list = []
        self.creator_list =[]
        
        for link in self.link_list:
            self.get_html(link)
            name = self.soup.find('h1').text
            self.name_list.append(name)

            date = self.soup.find('span', {"class":"published-date"}).text
            self.date_list.append(date)

            creator_name = self.soup.find('a', {'data-axl':"alias"}).text
            self.creator_list.append(creator_name)

    def collate_info(self):
        self.info = {"id": self.link_id,
                "uuid": self.link_uuid,
                "URL": self.link_list,
                "idea_name": self.name_list,
                "date": self.date_list,
                "creator": self.creator_list,
                "number_of_supporters": self.num_supporters_list,
                "number_of_days_remaining": self.num_days_remaining_list,
                "image_links": self.img_list}
        return self.info



if __name__ == '__main__': 

    def web_scraper():
        #search_term = input('I would like to search for... ')
        search_term = 'violin'
        scraper = LegoScraper('https://ideas.lego.com', search_term)
        try:
            scraper.accept_cookies(frame_id=None, XPATH= '//button[@aria-label="Reject cookies"]')
            #scraper.explore_product_ideas('//a[@class="sub-menu"][1]', '//div[@class="header-link"][1]')
            scraper.search(name='query')
            scraper.get_list_links('//*[@id="search_results"]', './div')
            time.sleep(2)
            # scraper.get_img_links(XPATH_main_image='//div[@class="image-sizing-wrapper"]', XPATH_thumbnail_container='//div[@class="thumbnails-tray"]', XPATH_thumbnails='./div')
            # scraper.create_id()
            #scraper.get_html()
            # scraper.get_info_from_html()
            scraper.get_info_from_java()
            # #scraper.get_info()
            # scraper.collate_info()
            # scraper.download_raw_data()
            #scraper.download_images()
            #scraper.create_uuid()
            # scraper.scroll_down_bottom()
            # time.sleep(2)
            # scraper.see_more('//*[@id="search-more"]/a')
            # #scraper.scroll_up_top()
            # time.sleep(4)
        finally: scraper.quit()


    #web_scraper()





Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [/Users/ESheldon/.wdm/drivers/chromedriver/mac64/100.0.4896.60/chromedriver] found in cache
  self.driver = Chrome(ChromeDriverManager().install())


['\n                338\n            ', '\n                873\n            ', '\n                1,377\n            ', '\n                243\n            ', '\n                1,358\n            ']
