In [17]:
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 18 13:01:02 2020

@author: OHyic
"""
#import selenium drivers
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, WebDriverException, SessionNotCreatedException   
from webdriver_manager.chrome import ChromeDriverManager 

#import helper libraries
import time
import urllib.request
import os
import requests
import io
from PIL import Image

#custom patch libraries
from selenium.webdriver.common.keys import Keys
import sys
import re
import zipfile
import stat
from sys import platform

def download_lastest_chromedriver(current_chrome_version=""):
    def get_platform_filename():
        filename = ''
        is_64bits = sys.maxsize > 2**32
    
        if platform == "linux" or platform == "linux2":
            # linux
            filename += 'linux'
            filename += '64' if is_64bits else '32'
        elif platform == "darwin":
            # OS X
            filename += 'mac64'
        elif platform == "win32":
            # Windows...
            filename += 'win32'
    
        filename += '.zip'
    
        return filename
    
    # Find the latest chromedriver, download, unzip, set permissions to executable.
    
    result = False
    try:
        url = 'https://chromedriver.chromium.org/downloads'
        base_driver_url = 'https://chromedriver.storage.googleapis.com/'
        file_name = 'chromedriver_' + get_platform_filename()
        pattern = 'https://.*?path=(\d+\.\d+\.\d+\.\d+)'
    
        # Download latest chromedriver.
        stream = urllib.request.urlopen(url)
        content = stream.read().decode('utf8')
    
        # Parse the latest version.
        all_match = re.findall(pattern, content)
        
        if all_match:
            # Version of latest driver.
            if(current_chrome_version!=""):
                print("[INFO] updating chromedriver")
                all_match = list(set(re.findall(pattern, content)))
                current_chrome_version = ".".join(current_chrome_version.split(".")[:-1])
                version_match = [i for i in all_match if re.search("^%s"%current_chrome_version,i)]
                version = version_match[0]
            else:
                print("[INFO] installing new chromedriver")
                version = all_match[1]
            driver_url = base_driver_url + version + '/' + file_name
    
            # Download the file.
            print('[INFO] downloading chromedriver ver: %s: %s'% (version, driver_url))
            app_path = os.path.dirname(os.path.realpath(__file__))
            chromedriver_path = os.path.normpath(app_path+"\\webdriver\\chromedriver.exe")
            file_path = os.path.normpath(app_path + '\\webdriver\\' + file_name)
            urllib.request.urlretrieve(driver_url, file_path)
    
            # Unzip the file into folder
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(os.path.normpath(app_path + '\\webdriver\\'))
    
            st = os.stat(chromedriver_path)
            os.chmod(chromedriver_path, st.st_mode | stat.S_IEXEC)
            print('[INFO] lastest chromedriver downloaded')
            # Cleanup.
            os.remove(file_path)
            result = True
    except Exception:
        print("[WARN] unable to download lastest chromedriver. the system will use the local version instead.")
    
    return result


class GoogleImageScraper():
    def __init__(self,image_path, search_key="cat",number_of_images=1,headless=False,min_resolution=(0,0),max_resolution=(1920,1080)):
        #check parameter types
        image_path += "\\"+search_key
        if (type(number_of_images)!=int):
            print("[Error] Number of images must be integer value.")
            return
        if not os.path.exists(image_path):
            print("[INFO] Image path not found. Creating a new folder.")
            os.makedirs(image_path)
        #check if chromedriver is updated
        #try going to www.google.com
        options = Options()
        if(headless):
            options.add_argument('--headless')
        driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
        driver.set_window_size(1400,1050)
        driver.get("https://www.google.com")
            
        self.driver = driver
        self.search_key = search_key
        self.number_of_images = number_of_images
        self.image_path = image_path
        self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)
        self.headless=headless
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        
    def find_image_urls(self):
        """
            This function search and return a list of image urls based on the search key.
            Example:
                google_image_scraper = GoogleImageScraper("image_path","search_key",number_of_photos)
                image_urls = google_image_scraper.find_image_urls()
                
        """
        print("[INFO] Scraping for image link... Please wait.")
        image_urls=[]
        count = 0
        missed_count = 0
        self.driver.get(self.url)
        time.sleep(5)
        indx = 1
        while self.number_of_images >= count:
            try:
                #find and click image
                imgurl = self.driver.find_element_by_xpath('//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'%(str(indx)))
                imgurl.click()
                missed_count = 0 
            except Exception:
                #print("[-] Unable to click this photo.")
                missed_count = missed_count + 1
                if (missed_count>10):
                    print("[INFO] No more photos.")
                    break
                 
            try:
                #select image from the popup
                time.sleep(1)
                class_names = ["n3VNCb"]
                images = [self.driver.find_elements_by_class_name(class_name) for class_name in class_names if len(self.driver.find_elements_by_class_name(class_name)) != 0 ][0]
                for image in images:
                    #only download images that starts with http
                    if(image.get_attribute("src")[:4].lower() in ["http"]):
                        print("[INFO] %d. %s"%(count,image.get_attribute("src")))
                        image_urls.append(image.get_attribute("src"))
                        count +=1
                        break
            except Exception:
                print("[INFO] Unable to get link")   
                
            try:
                #scroll page to load next image
                if(count%3==0):
                    self.driver.execute_script("window.scrollTo(0, "+str(indx*60)+");")
                element = self.driver.find_element_by_class_name("mye4qd")
                element.click()
                print("[INFO] Loading more photos")
                time.sleep(5)
            except Exception:  
                time.sleep(1)
            indx += 1

        
        self.driver.quit()
        print("[INFO] Google search ended")
        return image_urls

    def save_images(self,image_urls):
        #save images into file directory
        """
            This function takes in an array of image urls and save it into the prescribed image path/directory.
            Example:
                google_image_scraper = GoogleImageScraper("image_path","search_key",number_of_photos)
                image_urls=["https://example_1.jpg","https://example_2.jpg"]
                google_image_scraper.save_images(image_urls)
                
        """
        print("[INFO] Saving Image... Please wait...")
        for indx,image_url in enumerate(image_urls):
            try:
                print("[INFO] Image url:%s"%(image_url))
                search_string = ''.join(e for e in self.search_key if e.isalnum())
                image = requests.get(image_url,timeout=5)
                if image.status_code == 200:
                    with Image.open(io.BytesIO(image.content)) as image_from_web:
                        try:
                            filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower())
                            file_name, file_extension = os.path.splitext(filename)
                            if file_extension not in [".jpeg", ".png",".jpg"]:
                                if file_extension == ".webp":
                                    filename = file_name + ".jpeg"
                                else :
                                    print(filename, " not in allowed formats")
                                    continue
                            image_path = os.path.join(self.image_path, filename)
                            print("[INFO] %d .Image saved at: %s"%(indx,image_path))
                            image_from_web.save(image_path)
                        except OSError:
                            rgb_im = image_from_web.convert('RGB')
                            rgb_im.save(image_path)
                        image_resolution = image_from_web.size
                        if image_resolution != None:
                            if image_resolution[0]<self.min_resolution[0] or image_resolution[1]<self.min_resolution[1] or image_resolution[0]>self.max_resolution[0] or image_resolution[1]>self.max_resolution[1]:
                                image_from_web.close()
                                #print("GoogleImageScraper Notification: %s did not meet resolution requirements."%(image_url))
                                os.remove(image_path)

                        image_from_web.close()
            except Exception as e:
                print("[ERROR] Failed to be downloaded",e)
                pass
        print("[INFO] Download Completed. Please note that some photos are not downloaded as it is not in the right format (e.g. jpg, jpeg, png)")


In [18]:
#Define file path (Don't change)
CH_DIR = 'C:\\Users\\avuat\\Python Projects\\Motorcycle Classifier'

image_path = str(CH_DIR+"\\photos")

#Add new search key into array ["cat","t-shirt","apple","orange","pear","fish"]
search_keys= ["yamaha xsr 700","honda cb 500 1975"]

#Parameters
number_of_images = 10
headless = True
min_resolution=(0,0)
max_resolution=(1920,1080)

#Main program
for search_key in search_keys:
    image_path += "\\"+search_key
    image_scrapper = GoogleImageScraper(image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
    image_urls = image_scrapper.find_image_urls()
    image_scrapper.save_images(image_urls)



[WDM] - 

[WDM] - Current google-chrome version is 95.0.4638
[WDM] - Get LATEST driver version for 95.0.4638
[WDM] - Driver [C:\Users\avuat\.wdm\drivers\chromedriver\win32\95.0.4638.54\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
  driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)


[INFO] Scraping for image link... Please wait.


  imgurl = self.driver.find_element_by_xpath('//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'%(str(indx)))
  images = [self.driver.find_elements_by_class_name(class_name) for class_name in class_names if len(self.driver.find_elements_by_class_name(class_name)) != 0 ][0]
  element = self.driver.find_element_by_class_name("mye4qd")


[INFO] 0. https://www.audemar.com/18223-large_default/xsr700.jpg
[INFO] 1. https://static.s2-concept.com/3136-medium_default/xsr-700-vintage-17166.jpg
[INFO] 2. https://www.motoplanete.com/yamaha/zoom-700px/Yamaha-XSR-700-2021-700px.webp
[INFO] 3. https://www.planete-yam.com/1427707-large_default/yamaha-xsr-700.jpg
[INFO] 4. https://m.media-amazon.com/images/I/8182kdr2GNL._AC_SX679_.jpg
[INFO] 5. https://www.equipmoto.fr/301688-large_default/ligne-complete-d-echappement-ixrace-mk2-yamaha-xsr-700-2017-2020.jpg
[INFO] 6. https://static.s2-concept.com/3135-large_default/xsr-700-vintage-17166.jpg
[INFO] 7. https://moto-station.com/wp-content/uploads/2018/02/Yamaha_XSR700_st2pz-1.jpg
[INFO] 8. https://www.streetmotorbike.com/media/catalog/product/cache/1/image/9df78eab33525d08d6e5fb8d27136e95/s/i/silencieux-ligne-compl_te-moto-mivv-ghibli-s-black-yamaha-xsr-700.jpg
[INFO] 9. https://lh3.googleusercontent.com/proxy/kyHzr3PlyOoH0hu0CA3vmKOZbk8GBCqjMdtg2XtiYN96iLlparO_sLUDeQGNk4ZndvZHNt0UsL63n

KeyboardInterrupt: 