# Importing Libraries, Hardcoding Search Engine Information

## Reverse image search inspiration


Gists taken from : https://gist.github.com/erm3nda/1f3819c71a6ca95d0bb32053536bb9f3

and : https://medium.com/geekculture/scraping-images-using-selenium-f35fab26b122

## Importing Libraries

In [1]:
import os, sys, re, io #handling files
import base64
from tqdm import tqdm

#############################################################################

import requests # HTTP
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import webbrowser 

#############################################################################
import numpy as np # math
import json

#############################################################################
import threading #for multiple browsers at the same time
import logging
import time

#############################################################################
import pdb #because this didnt work on first try

###########################################################
import selenium # Bot

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.remote.webelement import WebElement 
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
###############################################################################
#Image processing for testing 
import PIL
from PIL import Image
##
from pyvirtualdisplay import Display

## Check your Python Version
This code needs the swith-case function available in python 3.10 to handle the differences between the different websites simultaneously.

In [2]:
print(sys.version)

3.10.4 (main, Mar 31 2022, 08:41:55) [GCC 7.5.0]


## Graphic or Headless Options & Starting Virtual Display

In [3]:
display = Display(visible=0, size=(1920, 1080))
display.start()

In [4]:
def SetupOptions():
    #enable-lens-fullscreen-search
    #enable-lens-standalone
    options = Options()
    options.headless=False #True
    options.add_argument("--start-maximized");
    options.add_argument("--disable-dev-shm-usage");
    options.add_argument("--no-experiments");
    options.add_argument("--disable-plugins");
    options.add_argument("--incognito");
    options.add_experimental_option("excludeSwitches", ["enable-automation"]);
    options.add_experimental_option('useAutomationExtension', False)
    
    
    return options

### Testing Script

In [5]:
print(SetupOptions().headless)

False


## SearchEngine Class

In [6]:
class SearchEngine():
    def __init__(self, URL:str,ID:int, thumbnail:str, image:str,load:str, options = SetupOptions()):
        self.URL = URL
        self.ID = ID
        self.thumbnail = thumbnail
        self.image = image
        self.load = load
        self.options = options
        

## Building Google SearchEngine:

In [7]:
Google= SearchEngine(URL="https://images.google.com/searchbyimage/upload",
               ID=0,
               thumbnail=(By.CSS_SELECTOR,"img.Q4LuWd"),
               image = (By.XPATH,"//*[@id=\"Sva75c\"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[3]/div/a/img"),
               load = (By.CSS_SELECTOR,".mye4qd"))

## Building Yandex SearchEngine:

In [8]:
Yandex= SearchEngine(URL="https://yandex.com/images/search",
                     ID=1,
                     thumbnail=(By.XPATH,"//div[@class='serp-item__preview']/a/img"),
                     image = (By.XPATH,"//div[@class='MMImageContainer']/img"),
                     load = "")


## Building Sogou SearchEngine:

In [9]:
Sogou= SearchEngine(URL='https://pic.sogou.com/',
                    ID=2,
                    thumbnail=(By.XPATH,"//div[@class='img-layout']/a/img"),
                    image = (By.XPATH, "//div[@id='imgArea']/div[3]/div/div/a/img"),
                    #image = (By.XPATH, "//*[@id=\"imgArea\"]/div[3]/div/div/a/img"),
                    #image = (By.XPATH,"//div[@class='img-box']/a/img"),
                    load = "")

## Engine_List

In [10]:
Engine_List= []
Engine_List.append(Google)
Engine_List.append(Yandex)
Engine_List.append(Sogou)

for engine in Engine_List:
    print(engine.URL,
      "\n",
      "ID:", engine.ID,
      "\n",
      "Thumbnail:", engine.thumbnail,
      "\n",
      "Image:", engine.image,
      "\n",
      "Load_Button:", engine.load,"\n\n")

https://images.google.com/searchbyimage/upload 
 ID: 0 
 Thumbnail: ('css selector', 'img.Q4LuWd') 
 Image: ('xpath', '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[3]/div/a/img') 
 Load_Button: ('css selector', '.mye4qd') 


https://yandex.com/images/search 
 ID: 1 
 Thumbnail: ('xpath', "//div[@class='serp-item__preview']/a/img") 
 Image: ('xpath', "//div[@class='MMImageContainer']/img") 
 Load_Button:  


https://pic.sogou.com/ 
 ID: 2 
 Thumbnail: ('xpath', "//div[@class='img-layout']/a/img") 
 Image: ('xpath', "//div[@id='imgArea']/div[3]/div/div/a/img") 
 Load_Button:  




# Browsing using Selenium

## Definiting Basic Functions to Start and Fetch Pages

In [11]:
def StartDriver(Engine:SearchEngine,initial_wait:int=10):
    driverpath='/usr/local/bin/chromedriver/'
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Engine.options)
        driver.get(Engine.URL)
        WebDriverWait(driver,initial_wait);
        print ("Chrome Driver #",Engine.ID," Initialized")
        return driver
        #driver.quit()
        
    except Exception as e:
        print ("FAILURE: Chrome Driver #",Engine.ID,"Not Initialized","Exception:",e)

## Uploading Photographs to Different Websites

### Google

In [12]:
def GoogleUpload(Engine:SearchEngine, img_path:str, wait_button_click:int=5):
    try:
        # This avoids the annoying "Google Lens" Feature (or Bug rather) present in chrome
        multipart = {'encoded_image': (img_path, open(img_path, 'rb')), 'image_content': ''}
        response = requests.post('http://www.google.com/searchbyimage/upload', files=multipart, allow_redirects=False)
        print(response)
        fetchUrl = response.headers['Location']
        Engine.URL=fetchUrl
        driver=StartDriver(Engine)
        driver.find_element(By.XPATH,"//a[contains(.,'similar images')]").click()
        print('ENGINE #',Engine.ID,' Uploaded File, Now Scraping Similar Images...')
    except Exception as e:
        print (e)
        
    return driver

In [13]:
file_path = '/home/cescollino/Documents/ETS/MAITRISE/Dataset/cat.jpg'

### Yandex

In [14]:
def YandexUpload(Engine:SearchEngine, img_path:str, wait_button_click:int=5):
    try:
        driver=StartDriver(Engine)
        driver.find_element(By.XPATH,"/html/body/header/div/div[1]/div[2]/form/div[1]/span/span/button").click()
        driver.find_element(By.XPATH,"//input[@type='file']").send_keys(img_path);
        WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.XPATH, "//a[contains(.,'Similar images')]"))
        )
        driver.find_element(By.XPATH,"//a[contains(.,'Similar images')]").click()
        WebDriverWait(driver, 2)
        driver.back()
        driver.forward()
        print('ENGINE #',Engine.ID,' Uploaded File, Now Scraping Similar Images...')
    except Exception as e:
        print (e)
    return driver

### Sogou

In [15]:
def SogouUpload(Engine:SearchEngine, img_path:str, wait_button_click:int=5):
    try:
        driver=StartDriver(Engine)
        driver.find_element(By.XPATH,"//input[@type='file']").send_keys(img_path);
        print('ENGINE #',Engine.ID,' Uploaded File, Now Scraping Similar Images...')
    except Exception as e :
        print(e)
        
    return driver

## Upload depending on ID

In [16]:
def UploadPhoto(Engine:SearchEngine , img_path:str):
    print(Engine.ID)
    match Engine.ID:
        case 0:
            
            return GoogleUpload(Engine=Engine, img_path=img_path)
        
        case 1:
            
            return YandexUpload(Engine=Engine, img_path=img_path)
        
        case 2:
            
            return SogouUpload(Engine=Engine, img_path=img_path)

        # If an exact match is not confirmed, this last case will be used if provided
        case _:
            
            return "Something's wrong"

## Scroll to end

In [17]:
def scroll_to_end(wd,sleep:int=5):
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    wd.implicitly_wait(sleep)
    
                    ##############################################################################################################

## Fetching Image Binaries

In [18]:
def Fetch_Image_Data(actual_image):
    try:
        
        ##FETCHING LINK##
        #Make a request to a web page, and return the status code
        #actual_image is a URL
        image_content = requests.get(actual_image.get_attribute('src')).content
        return image_content
        
    except Exception as e:
        
        print(f"ERROR - Could not download {actual_image.get_attribute('src')} - {e}")
        return e
                    ##############################################################################################################

### Test Script
Dummy=Google
Dummy.URL="https://unsplash.com/photos/37Dw48o0fb4"
driver = StartDriver(Dummy)

element_css = 'div.omfF5:nth-child(3) > div:nth-child(3) > div:nth-child(1) > img:nth-child(1)'
element = driver.find_element(By.CSS_SELECTOR, element_css)

result = Fetch_Image_Data(element)

print(type(result))
result = io.BytesIO(result)
photo = PIL.Image.open(result)
photo.show()

"""
Works
"""

## Saving Image and Link information

In [19]:
def Save_Img(actual_image, target_folder:str,ID:int):
    
    if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
        
        try:
            ##NUMBER##
            n = open(os.path.join(target_folder,'count.txt'),'r')
            counts = n.readlines()
            n.close()
            count = int(counts[ID][:-1])+1
            
            
            ##IMAGE##
            path=os.path.join(target_folder, 'jpg' + "_" + str(count) + "_EngineID_"+ str(ID) +".jpg")
            print(path)
            f = open(path, 'wb')
            f.write(Fetch_Image_Data(actual_image))
            f.close()
            counts[ID]=str(count)+('\n')
            
            n = open(os.path.join(target_folder,'count.txt'),'w')
            n.writelines(counts)
            n.close()
            ##LINK##
            t = open(os.path.join(target_folder, "links_ENGINEID_"+str(ID)+".txt"), 'a', encoding="utf-8")
            t.write(actual_image.get_attribute('src')+'\n')
            t.close()

            print(f"SUCCESS - saved {actual_image.get_attribute('src')} - as {count} from DRIVER ID : {ID}")

        except Exception as e:

            print(f"ERROR - Could not save {actual_image.get_attribute('src')} - {e} DRIVER ID: {ID}")
            
        return 1
    else:
        print("Invalid Image Attribute, cannot save")
        return 0               
                    ##############################################################################################################

## Iterating through all the images, clicking and saving

Note: # "//div[@class='the class I want']/img" or "//a[@class='the class I want']/img" navigates to image inside div or anchor

In [20]:
def FindResults(nav,searchengine):
    scroll_to_end(nav)
    thumbnail_results = nav.find_elements(*searchengine.thumbnail)
    #print(len(thumbnail_results))
    return thumbnail_results

In [21]:
def ClickLoadMore(nav,engine:SearchEngine):
    load_more_button = nav.find_elements(*engine.load)
    
    #if load_more_button:
    #    nav.execute_script("document.querySelector('"+engine.load[1]+"').click();")

## Google Scraping

In [22]:
def GoogleScrape(driver,engine:SearchEngine,count:int,path:str,max_count:int, results_start:int):
    with tqdm(total=max_count) as pbar:
        while count < max_count:

            thumbnails=FindResults(driver,engine)
            number_results=len(thumbnails)
            print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

            for thumb in thumbnails[results_start:number_results]:

                try:
                    thumb.click()
                    image=driver.find_element(*engine.image)
                    time.sleep(1)
                    count=count+Save_Img(actual_image=image,target_folder=path, ID = engine.ID)

                except Exception as e:
                    print(e)

            if count >= max_count:
                    print(f"Found: {count} image links, done!")
                    break

            else:
                    print("Found:",count, "image links, looking for more ...")
                    time.sleep(2)

            scroll_to_end(driver)
            results_start = len(thumbnails)
            pbar.update(count)
    return count

## Yandex Scraping

In [23]:
def YandexScrape(driver,engine:SearchEngine,count:int,path:str,max_count:int, results_start:int):
    with tqdm(total=max_count) as pbar:
        while count < max_count:
            driver.switch_to.window(driver.window_handles[0])
            thumbnails=FindResults(driver,engine)
            number_results=len(thumbnails)
            print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

            for thumb in thumbnails[results_start:number_results]:

                try:
                    driver.switch_to.window(driver.window_handles[0])
                    time.sleep(1)
                    thumb.click()
                    image=driver.find_element(*engine.image)#(By.XPATH,"//div[@class='MMImageContainer']/img")
                    time.sleep(1)
                    count=count+Save_Img(actual_image=image,target_folder=path,ID=engine.ID)
                    driver.back()
                    driver.switch_to.window(driver.window_handles[0])

                except Exception as e:
                    print(e)

            if count >= max_count:
                    print(f"Found: {count} image links, done!")
                    break

            else:
                    print("Found:",count, "image links, looking for more ...")
                    time.sleep(2)

            scroll_to_end(driver)
            results_start = len(thumbnails)
            pbar.update(count)
    return count

## WORKS

## Sogou Scraping

In [24]:
def SogouScrape(driver,engine:SearchEngine,count:int,path:str,max_count:int, results_start:int):
    with tqdm(total=max_count) as pbar:
        while count < max_count:

            thumbnails=FindResults(driver,engine)
            number_results=len(thumbnails)
            print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

            for thumb in thumbnails[results_start:number_results]:

                try:
                    thumb.click()
                    driver.switch_to.window(driver.window_handles[1])
                    image=driver.find_element(*engine.image)
                    time.sleep(1)
                    count=count+Save_Img(actual_image=image,target_folder=path,ID=engine.ID)
                    driver.close()

                except Exception as e:
                    print(e)

                driver.switch_to.window(driver.window_handles[0])

            if count >= max_count:
                    print(f"Found: {count} image links, done!")
                    break

            else:
                    print("Found:",count, "image links, looking for more ...")
                    time.sleep(2)

            scroll_to_end(driver)
            results_start = len(thumbnails)
            pbar.update(count)
    return count


In [25]:
def Scrape(engine:SearchEngine,img_path:str,result_path:str,max_count:int=100):
    driver = UploadPhoto(engine, img_path=img_path)
    count=0
    results_start=0
    print("ID:",engine.ID,"count:",count,"max:", max_count)
    match engine.ID:
        case 0:
            GoogleScrape(driver=driver,
                        engine=engine,
                        count=count,
                        path=result_path,
                        max_count=max_count,
                        results_start=results_start)
        case 1:
            YandexScrape(driver=driver,
                        engine=engine,
                        count=count,
                        path=result_path,
                        max_count=max_count,
                        results_start=results_start)
        case 2:
            SogouScrape(driver=driver,
                        engine=engine,
                        count=count,
                        path=result_path,
                        max_count=max_count,
                        results_start=results_start)
        case other:
            print("ERROR, ENGINE ID MISSING")

## Putting everything together

# Multi-Threading

In [26]:
class myThread (threading.Thread):
   def __init__(self, engine:SearchEngine, img_path:str,result_path:str,max_count:int=100):
      threading.Thread.__init__(self)
      self.engine = engine
      self.img_path = img_path
      self.result_path = result_path
      self.max_count = max_count
   def run(self):
      Scrape(self.engine,self.img_path,self.result_path,max_count=self.max_count)

# Hardcoded File Paths & Run

I keep track of the used prompts under '/home/cescollino/Documents/ETS/MAITRISE/Dataset/results/used_prompts'
results are all placed in '/home/cescollino/Documents/ETS/MAITRISE/Dataset/results/final_scrapes'

In [27]:
file_path = '/home/cescollino/Documents/ETS/MAITRISE/Dataset/cat.jpg'
destination_path = '/home/cescollino/Documents/ETS/MAITRISE/Dataset/results/final_scrapes'
prompts_path = '/home/cescollino/Documents/ETS/MAITRISE/Dataset/results/used_prompts/' 
files = len(os.listdir(prompts_path)) + 1
copy = 'cp ' + file_path + ' ' + prompts_path + 'prompt_' + str(files) +'.jpg'
os.system(copy)

#result_path= '/home/cescollino/Documents/ETS/MAITRISE/Dataset/Scraped_Dataset/cat/'
#result_path = '/home

0

In [28]:
Scrape(Yandex,file_path,destination_path,max_count=1000)

1
Chrome Driver # 1  Initialized
ENGINE # 1  Uploaded File, Now Scraping Similar Images...
ID: 1 count: 0 max: 1000


  0%|          | 0/1000 [00:00<?, ?it/s]

Found: 55 search results. Extracting links from 0:55
Message: stale element reference: element is not attached to the page document
  (Session info: chrome=105.0.5195.125)
Stacktrace:
#0 0x558c3edaa693 <unknown>
#1 0x558c3eba3b0a <unknown>
#2 0x558c3eba6b57 <unknown>
#3 0x558c3eba69ff <unknown>
#4 0x558c3eba6cbc <unknown>
#5 0x558c3ebde26a <unknown>
#6 0x558c3ebd13c0 <unknown>
#7 0x558c3ebf9922 <unknown>
#8 0x558c3ebd0d53 <unknown>
#9 0x558c3ebf9a8e <unknown>
#10 0x558c3ec0d4b0 <unknown>
#11 0x558c3ebf9743 <unknown>
#12 0x558c3ebcf533 <unknown>
#13 0x558c3ebd0715 <unknown>
#14 0x558c3edfa7bd <unknown>
#15 0x558c3edfdbf9 <unknown>
#16 0x558c3eddff2e <unknown>
#17 0x558c3edfe9b3 <unknown>
#18 0x558c3edd3e4f <unknown>
#19 0x558c3ee1dea8 <unknown>
#20 0x558c3ee1e052 <unknown>
#21 0x558c3ee3871f <unknown>
#22 0x7fe245b34b43 <unknown>

Message: stale element reference: element is not attached to the page document
  (Session info: chrome=105.0.5195.125)
Stacktrace:
#0 0x558c3edaa693 <unknown>

  0%|          | 0/1000 [00:58<?, ?it/s]

Found: 110 search results. Extracting links from 55:110
Message: element click intercepted: Element <img class="serp-item__thumb justifier__thumb" src="//avatars.mds.yandex.net/i?id=6449b1b416cb31fe502b1d0da8f21925-5481674-images-thumbs&amp;n=13" data-error-handler="serpItemError" alt="Klik untuk donasi - Bantu Nouren Hadapi Respiratory Failure Yuk " style="height: 167.5px; width: 297.8px;"> is not clickable at point (797, 84). Other element would receive the click: <div class="tabs-navigation__content">...</div>
  (Session info: chrome=105.0.5195.125)
Stacktrace:
#0 0x558c3edaa693 <unknown>
#1 0x558c3eba3b0a <unknown>
#2 0x558c3ebe3f52 <unknown>
#3 0x558c3ebe19a0 <unknown>
#4 0x558c3ebdef74 <unknown>
#5 0x558c3ebddc97 <unknown>
#6 0x558c3ebd1551 <unknown>
#7 0x558c3ebf9922 <unknown>
#8 0x558c3ebd0d53 <unknown>
#9 0x558c3ebf9a8e <unknown>
#10 0x558c3ec0d4b0 <unknown>
#11 0x558c3ebf9743 <unknown>
#12 0x558c3ebcf533 <unknown>
#13 0x558c3ebd0715 <unknown>
#14 0x558c3edfa7bd <unknown>
#15 

  0%|          | 0/1000 [01:45<?, ?it/s]


KeyboardInterrupt: 