<a href="https://colab.research.google.com/github/BrodySpearman/Python-image-scraping-methods/blob/main/python_image_scraper_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Image scraper with Python**

# ***Method one: BeautifulSoup with requests to collect html metadata.***

In [None]:
!pip install requests
!pip install bs4
!pip install ipyplot

In [None]:
from bs4 import BeautifulSoup as soup
import requests

# parameters
url = 'https://www.google.com/search?q=abstract+art&client=opera-gx&hs=rdy&sxsrf=ALeKk02DmSL4rU1lcwug_EMwN5Sodd4uHQ:1625774758469&source=lnms&tbm=isch&sa=X&ved=2ahUKEwjo6dn3otTxAhV8Ap0JHVgLDMIQ_AUoAXoECAEQAw'

def get_image_data(url):
  r = requests.get(url)
  return r.text

htmldata = get_image_data(url)
raw_images = soup(htmldata,'html.parser')

for item in raw_images.find_all('img'):
  print(item['src'])

# Pros: 


*   Very readable
*   Very simple
*   Modular

---

# Cons:


*   Can get messy quick.
*   Can only draw off url, rather than key lookup searches. With webdriver however this functionality could be possible.
*   Limited: can't scroll, problems when encountering infinite webpages.







# ***Method Two: Selenium combined with Chrome webdriver.***

In [None]:
!pip install selenium
!apt-get update 
!apt install chromium-chromedriver

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
from random import randint
from google.colab.patches import cv2_imshow
from IPython.display import display, Image
import ipyplot

from google.colab import drive
drive.mount('/content/drive')

In [7]:
# Parameters for scraping
query = 'abstract'
num_of_images = 50

def get_images(query, num_of_images):

  # Needed to work on a jupyter notebook, otherwise would just need to specify a local drive location.
  chrome_options = webdriver.ChromeOptions()
  chrome_options.add_argument('--headless')
  chrome_options.add_argument('--no-sandbox')
  chrome_options.add_argument('--disable-dev-shm-usage')
  wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)
  driver = webdriver.Chrome('chromedriver',chrome_options=chrome_options)
  driver.get('https://www.google.com/imghp?hl=en&authuser=0&ogbl') # driver directed towards google images

  # xpath to the html element corrosponding to search bar
  box = driver.find_element_by_xpath('//*[@id="sbtc"]/div/div[2]/input')
  box.send_keys(query) # inputs query into search bar
  box.send_keys(Keys.ENTER)

  def auto_scroll():
    scroll_height = 'return document.body.scrollHeight'
    last_height = driver.execute_script(scroll_height)

    while True:
      driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
      time.sleep(2)
      new_height = driver.execute_script(scroll_height) # Scrolls to bottom of page and resets scroll height
      try:
        driver.find_element_by_xpath('//*[@id="islmp"]/div/div/div/div/div[5]/input').click()
        time.sleep(2)
      except:
        pass
      if new_height == last_height:
        break
      last_height = new_height

  auto_scroll()

  test_dir_name = f'test_{str(randint(10000,100000))}' 
  test_dir_path = f'/content/drive/My Drive/scrape_test/{test_dir_name}/'
  test_folder = os.mkdir(test_dir_path) # creates a test directory to store data

  print('finding images...')
  for i in range(1, num_of_images):
    image_name = f'testImage ({str(i)}).png'
    image_path = f'/content/drive/My Drive/scrape_test/{test_dir_name}/{image_name}'
    try:
      image_location = driver.find_element_by_xpath(f'//*[@id="islrg"]/div[1]/div[{str(i)}]/a[1]/div[1]/img')
      image = image_location.screenshot(image_path)

      
    except:
      pass

  print('content downloaded!')

get_images(query, num_of_images)

  if sys.path[0] == '':
  del sys.path[0]


finding images...
content downloaded!


# Pros:

*   Easy integration and downloads to google drive.
*   Can download large amounts of images at a time, great for dataset building.
*   Scalable.
*   Specifying html data by xpath makes it easy to modify.


---



# Cons:

*   *Very* messy right now. Images are low resolution and many images contain other residual google elements and white spaces. 
*   A tad more complicated than method one.


# ***Cropping white space out of uncleaned dataset***


In [None]:
# Our dataset is a little chaotic, due to the screenshot nature. We need to iterate through each image
# in a dataset directory and clean it out by finding where the edges of images are and doing some cropping work.

