# Imports

In [None]:
import flickrapi
import os
import urllib.request
import sys
import re
import zipfile
import stat
import time
import requests
import io
from pathlib import Path
from jmd_imagescraper.core import *
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
from selenium.common.exceptions import NoSuchElementException
from sys import platform
from urllib.parse import urlparse
from PIL import Image

# Main variables

### Flickr API

In [None]:
# Flickr credentials as one string. flickr_parse() splits this value on ':'
# and passes the first two parts to flickrapi.FlickrAPI, so it is expected
# in the form 'KEY:SECRET'.
flickr_api = 'YOUR_API_KEY'

### Variables

In [None]:
# Search terms; every parser is run once per entry.
queries = ['cat', 'boy', 'awesome image']
# Root output directory handed to every parser.
out_path = './out'
# Upper bound on the number of images requested from each source per query.
max_num = 5000

# Flickr Parser

In [None]:
def flickr_parse(flickr_api, query, max_num, out_path):
    """Download up to ``max_num`` Flickr photos matching ``query``.

    Images are written to ``<out_path>/<query>/<n>.jpg`` where ``n`` counts
    the successfully downloaded photos, starting at 0.

    Args:
        flickr_api: Credentials as a single ``'KEY:SECRET'`` string.
        query: Search text; also used as tag filter and sub-folder name.
        max_num: Maximum number of images to download.
        out_path: Root output directory; created when missing.
    """
    query_out_path = out_path + f'/{query}'
    # makedirs (not mkdir): out_path itself may not exist yet on the first run.
    os.makedirs(query_out_path, exist_ok=True)

    credentials = flickr_api.split(':')
    try:
        flickr = flickrapi.FlickrAPI(credentials[0], credentials[1], cache=True)
    except Exception:
        # Also reached when the string has no ':' (IndexError on credentials[1]).
        print('Your API key is invalid')
        # Kept from the original: malformed credentials stop the whole run.
        exit()

    counter = 0
    try:
        if max_num > 0:
            # NOTE(review): per_page=1 presumably makes walk() issue one API
            # request per photo; a larger page size is likely faster - confirm
            # against the flickrapi documentation before changing it.
            photos = flickr.walk(text=query, tag_mode='all', tags=query, extras='url_c', per_page=1, sort='relevance')
            for photo in photos:
                url = photo.get('url_c')
                if not url:
                    # No 'url_c' attribute on this photo: nothing to download.
                    continue
                try:
                    urllib.request.urlretrieve(url, f'{query_out_path}/{counter}.jpg')
                except Exception as err:
                    # Best effort: report the failure and try the next photo.
                    print(f'Skipping {url}: {err}')
                    continue
                counter += 1
                if counter >= max_num:
                    break
        print(f'{counter} Images downloaded from Flickr to {query_out_path}')
    except Exception:
        print(f'Cannot parse {query} from Flickr')

# DuckDuckGo Parser

In [None]:
def duckduckgo_parser(query, max_num, out_path):
    """Fetch up to ``max_num`` DuckDuckGo image results for ``query``.

    Thin wrapper around ``duckduckgo_search``: ``out_path`` is passed first
    and ``query`` is passed as both the second and third positional argument.
    """
    # Presumably the second argument is the label (sub-folder) and the third
    # the search keywords - verify against the jmd_imagescraper docs.
    label = query
    search_terms = query
    duckduckgo_search(out_path, label, search_terms, max_results=max_num)

# Google Parser

### Scary things (chromedriver downloader and Selenium-based Google Images scraper)

In [None]:
def download_lastest_chromedriver(current_chrome_version=""):
    """Download a chromedriver archive and unpack it into a ``webdriver`` folder.

    The download page is scanned for version numbers. When
    ``current_chrome_version`` is given, the newest listed build sharing its
    first three version components is chosen; otherwise the second version
    listed on the page is used. The archive is extracted into
    ``<app dir>/webdriver``, the extracted chromedriver file is marked
    executable and the archive is deleted.

    Args:
        current_chrome_version: Installed Chrome version such as
            ``"96.0.4664.110"``, or ``""`` to install without matching.

    Returns:
        bool: True when a driver was downloaded and extracted, False otherwise
        (a warning is printed and no exception is propagated).
    """
    def get_platform_filename():
        """Return the platform-specific archive suffix, e.g. ``linux64.zip``."""
        filename = ''
        is_64bits = sys.maxsize > 2**32
        if platform in ("linux", "linux2"):
            filename += 'linux'
            filename += '64' if is_64bits else '32'
        elif platform == "darwin":
            filename += 'mac64'
        elif platform == "win32":
            filename += 'win32'
        filename += '.zip'
        return filename

    def version_key(version):
        """Turn ``'1.2.3.4'`` into ``(1, 2, 3, 4)`` for numeric comparison."""
        return tuple(int(part) for part in version.split('.'))

    result = False

    try:
        url = 'https://chromedriver.chromium.org/downloads'
        base_driver_url = 'https://chromedriver.storage.googleapis.com/'
        file_name = 'chromedriver_' + get_platform_filename()
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        pattern = r'https://.*?path=(\d+\.\d+\.\d+\.\d+)'
        with urllib.request.urlopen(url) as stream:
            content = stream.read().decode('utf8')
        all_match = re.findall(pattern, content)
        if all_match:
            if current_chrome_version != "":
                print("[INFO] updating chromedriver")
                # Compare on major.minor.build only; the patch number differs
                # between Chrome and chromedriver.
                version_prefix = ".".join(current_chrome_version.split(".")[:-1])
                # Plain prefix test: the dots must match literally, not as
                # regex wildcards.
                version_match = [v for v in set(all_match) if v.startswith(version_prefix)]
                if not version_match:
                    raise LookupError('no chromedriver build for Chrome %s' % current_chrome_version)
                # Deterministic choice: newest matching build.
                version = max(version_match, key=version_key)
            else:
                print("[INFO] installing new chromedriver")
                # The second listed version is used when available (as before);
                # fall back to the only one instead of raising IndexError.
                version = all_match[1] if len(all_match) > 1 else all_match[0]
            driver_url = base_driver_url + version + '/' + file_name
            print('[INFO] downloading chromedriver ver: %s: %s'% (version, driver_url))
            try:
                app_path = os.path.dirname(os.path.realpath(__file__))
            except NameError:
                # __file__ is not defined in a notebook / interactive session.
                app_path = os.getcwd()

            webdriver_dir = os.path.normpath(os.path.join(app_path, 'webdriver'))
            # urlretrieve cannot create the target directory itself.
            os.makedirs(webdriver_dir, exist_ok=True)
            file_path = os.path.join(webdriver_dir, file_name)

            urllib.request.urlretrieve(driver_url, file_path)
            try:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(webdriver_dir)
                    members = zip_ref.namelist()
            finally:
                # Always drop the archive, even if extraction failed.
                os.remove(file_path)

            # Mark the extracted driver executable (the file that was just
            # unpacked, not a fixed path somewhere else on disk).
            for member in members:
                if os.path.basename(member).startswith('chromedriver'):
                    driver_file = os.path.join(webdriver_dir, member)
                    st = os.stat(driver_file)
                    os.chmod(driver_file, st.st_mode | stat.S_IEXEC)
            print('[INFO] lastest chromedriver downloaded')
            result = True

    except Exception:
        print("[WARN] unable to download lastest chromedriver. the system will use the local version instead.")

    return result

In [None]:
class GoogleImageScraper():
    """Collect image URLs from Google Images with Selenium and download them.

    Typical use: construct, call ``find_image_urls()`` (which also quits the
    browser), then pass the result to ``save_images()``.
    """

    def __init__(self, webdriver_path, image_path, search_key="image", number_of_images=1, headless=True, min_resolution=(0, 0), max_resolution=(1920, 1080), max_missed=10):
        """Start Chrome and prepare the output folder.

        Args:
            webdriver_path: Path to the chromedriver executable.
            image_path: Root folder; images go to ``<image_path>/<search_key>``.
            search_key: Search text.
            number_of_images: Number of image URLs to collect.
            headless: Run Chrome with ``--headless`` when true.
            min_resolution: Smallest accepted ``(width, height)``.
            max_resolution: Largest accepted ``(width, height)``.
            max_missed: Consecutive failed thumbnail clicks tolerated before
                the search stops.

        Raises:
            TypeError: If ``number_of_images`` is not an integer.
        """
        from urllib.parse import quote_plus

        if not isinstance(number_of_images, int):
            # Fail loudly instead of returning a half-initialised object.
            raise TypeError("[Error] Number of images must be integer value.")
        image_path = os.path.join(image_path, search_key)
        if not os.path.exists(image_path):
            print("[INFO] Image path not found. Creating a new folder.")
            os.makedirs(image_path)

        options = Options()
        if headless:
            options.add_argument('--headless')
        driver = self._create_driver(webdriver_path, options)
        try:
            driver.set_window_size(1400, 1050)
            driver.get("https://www.google.com")
            # Click the element with id "L2AGLb" when present (presumably the
            # cookie-consent button).
            consent_buttons = driver.find_elements(By.ID, "L2AGLb")
            if consent_buttons:
                consent_buttons[0].click()
        except Exception:
            # Do not leave a browser process behind when start-up fails.
            driver.quit()
            raise

        self.driver = driver
        self.search_key = search_key
        self.number_of_images = number_of_images
        self.webdriver_path = webdriver_path
        self.image_path = image_path
        # quote_plus keeps spaces, '&', '#', ... inside the q= parameter.
        self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(quote_plus(search_key))
        self.headless=headless
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.max_missed = max_missed

    @staticmethod
    def _create_driver(webdriver_path, options):
        """Create a Chrome driver with either the Selenium 4 or Selenium 3 API."""
        try:
            from selenium.webdriver.chrome.service import Service
            return webdriver.Chrome(service=Service(webdriver_path), options=options)
        except TypeError:
            # Older Selenium: no ``service`` keyword, the path is positional.
            return webdriver.Chrome(webdriver_path, options=options)

    @staticmethod
    def _build_filename(image_url, image_format, indx, search_string, keep_filenames):
        """Return the file name to save an image under.

        With ``keep_filenames`` the name comes from the last path segment of
        ``image_url`` (query string dropped); when that segment is empty, or
        ``keep_filenames`` is false, ``<search_string><indx>`` is used. The
        extension is the lower-cased ``image_format``.
        """
        extension = image_format.lower()
        if keep_filenames:
            o = urlparse(image_url)
            clean_url = o.scheme + "://" + o.netloc + o.path
            name = os.path.splitext(os.path.basename(clean_url))[0]
            if name:
                return "%s.%s" % (name, extension)
        return "%s%s.%s" % (search_string, str(indx), extension)

    def _resolution_ok(self, size):
        """Return True when ``size`` lies within the min/max resolution (inclusive)."""
        width, height = size
        return (self.min_resolution[0] <= width <= self.max_resolution[0]
                and self.min_resolution[1] <= height <= self.max_resolution[1])

    def find_image_urls(self):
        """Click through the result thumbnails and collect full-size image URLs.

        Stops after ``number_of_images`` URLs or after more than
        ``max_missed`` consecutive failed clicks. The browser is always quit
        before returning.

        Returns:
            list[str]: Collected image URLs, without duplicates.
        """
        print("[INFO] Gathering image links")
        image_urls=[]
        seen_urls = set()
        count = 0
        missed_count = 0
        indx = 1
        try:
            self.driver.get(self.url)
            time.sleep(3)
            while self.number_of_images > count:
                try:
                    thumbnail = self.driver.find_element(By.XPATH, '//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'%(str(indx)))
                    thumbnail.click()
                    missed_count = 0
                except Exception:
                    missed_count = missed_count + 1
                    if (missed_count>self.max_missed):
                        print("[INFO] Maximum missed photos reached, exiting...")
                        break

                try:
                    # Give the preview time to load before reading it.
                    time.sleep(1)
                    images = self.driver.find_elements(By.CLASS_NAME, "n3VNCb")
                    if not images:
                        print("[INFO] Unable to get link")
                    for image in images:
                        src_link = image.get_attribute("src")
                        if src_link and ("http" in src_link) and ("encrypted" not in src_link):
                            # After a missed click the preview still shows the
                            # previous image; do not record it twice.
                            if src_link not in seen_urls:
                                print(
                                    f"[INFO] {self.search_key} \t #{count} \t {src_link}")
                                seen_urls.add(src_link)
                                image_urls.append(src_link)
                                count +=1
                            break
                except Exception:
                    print("[INFO] Unable to get link")

                try:
                    if(count%3==0):
                        self.driver.execute_script("window.scrollTo(0, "+str(indx*60)+");")
                    # Element with class "mye4qd" (presumably the "show more
                    # results" button); absent on most iterations.
                    element = self.driver.find_element(By.CLASS_NAME, "mye4qd")
                    element.click()
                    print("[INFO] Loading next page")
                    time.sleep(3)
                except Exception:
                    time.sleep(1)
                indx += 1
        finally:
            # Quit the browser even if navigation raised.
            self.driver.quit()

        print("[INFO] Google search ended")
        return image_urls

    def save_images(self,image_urls, keep_filenames=True):
        """Download ``image_urls`` into ``self.image_path``.

        Images whose size is outside ``min_resolution``/``max_resolution``
        are skipped. Failures are reported and do not stop the loop.

        Args:
            image_urls: Iterable of image URLs.
            keep_filenames: Name files after the URL's last path segment when
                true, otherwise ``<search key><index>``.
        """
        print("[INFO] Saving image, please wait...")
        # Alphanumeric-only search key, used for generated file names.
        search_string = ''.join(e for e in self.search_key if e.isalnum())
        for indx,image_url in enumerate(image_urls):
            try:
                print("[INFO] Image url:%s"%(image_url))
                response = requests.get(image_url,timeout=5)
                if response.status_code != 200:
                    print("[ERROR] Download failed: ", "HTTP status %s" % response.status_code)
                    continue
                with Image.open(io.BytesIO(response.content)) as image_from_web:
                    # Check the size first so rejected images never hit disk.
                    if not self._resolution_ok(image_from_web.size):
                        print(
                            f"[INFO] {self.search_key} \t {indx} \t Skipped, resolution {image_from_web.size} out of range")
                        continue
                    filename = self._build_filename(
                        image_url, image_from_web.format, indx, search_string, keep_filenames)
                    image_path = os.path.join(self.image_path, filename)
                    try:
                        image_from_web.save(image_path)
                    except OSError:
                        # Retry after converting to RGB, as the original did
                        # when the first save failed.
                        image_from_web.convert('RGB').save(image_path)
                    print(
                        f"[INFO] {self.search_key} \t {indx} \t Image saved at: {image_path}")
            except Exception as e:
                print("[ERROR] Download failed: ",e)
        print("--------------------------------------------------")
        print("[INFO] Downloads completed. Please note that some photos were not downloaded as they were not in the correct format (e.g. jpg, jpeg, png)")

### Parse

In [None]:
def google_parser(query, max_num, out_path, webdriver_path='PATH_TO_chromedriver.exe'):
    """Scrape Google Images for ``query`` and save the results under ``out_path``.

    Args:
        query: Search text; also the name of the sub-folder images go to.
        max_num: Number of image URLs to collect.
        out_path: Root output directory; created (with parents) when missing.
        webdriver_path: Path to the chromedriver executable. The default is a
            placeholder that has to be replaced with a real path.
    """
    # makedirs (not mkdir): also works for nested, not-yet-existing paths.
    os.makedirs(out_path, exist_ok=True)

    image_scraper = GoogleImageScraper(
        os.path.normpath(webdriver_path), out_path, query, max_num,
        headless=True, min_resolution=(0, 0), max_resolution=(9999, 9999))
    image_urls = image_scraper.find_image_urls()
    image_scraper.save_images(image_urls)

In [None]:
# Run all three parsers for every search term. Each call receives the same
# out_path and max_num, so max_num applies per source, not in total.
for query in queries:
    flickr_parse(flickr_api, query, max_num, out_path)
    duckduckgo_parser(query, max_num, out_path)
    google_parser(query, max_num, out_path)