In [1]:
######  https://github.com/joedockrill/image-scraper

#@title RUN THIS CELL for code setup.
#@markdown If you're new to Colab and you want to see the code, you can select this cell, 
#@markdown click the ... menu in the top right of the cell then click Form->Hide Form

from pathlib import Path
import shutil
import requests
import re
import json
import time
from bs4 import BeautifulSoup
from PIL import Image as PImage
from PIL import ImageDraw as PImageDraw
import ipywidgets as widgets
from IPython.display import display
from google.colab import files
from google.colab import drive
from typing import Callable
from enum import Enum
import pandas as pd


### Steps to use this notebook

# 1. Import everything
# 2. Run the SET-UP code
# 3. Build your search and run it to download the images
# 4. Zip your download
# 5. Download the zip file

In [2]:
#  SET-UP Code
# Root folder for all downloads: download_urls() saves each search into BASE_FOLDER/<label>
BASE_FOLDER = "images"

##########################################################################################
# scraping
##########################################################################################
def google_scrape_urls(keywords: str, max_results: int) -> list:
  '''Scrape image urls from a google image search.

  keywords:    the search query; it is url-encoded before being sent
  max_results: upper bound on the number of urls returned

  Returns a list holding the "data-src" attribute of every matching <img> tag
  in the single page that is requested (may be shorter than max_results, or empty).
  '''
  from urllib.parse import quote_plus  # function-scope import keeps this block self-contained

  BASE_URL = "https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&q="

  HEADERS = {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
      'Accept-Encoding': 'none',
      'Accept-Language': 'en-US,en;q=0.8',
      'Connection': 'keep-alive',
  }

  # encode the query so spaces, "&", "#" etc. cannot truncate or corrupt the "q" parameter
  searchurl = BASE_URL + quote_plus(keywords)
  resp = requests.get(searchurl, headers=HEADERS)

  soup = BeautifulSoup(resp.text, "html.parser")
  # only <img> tags that actually carry a data-src attribute
  results = soup.find_all("img", {"data-src":True}, limit=max_results)

  # the loop variable is deliberately not called `re`: that name would shadow
  # the regex module inside this function
  return [img["data-src"] for img in results]

class ImgSize(Enum):
  '''Image size options for duckduckgo_scrape_urls().

  Thumbs is special: duckduckgo_scrape_urls() adds no size filter for it and
  collects the thumbnail urls. For every other member it filters on the member
  name and collects the urls of the full size images.
  '''
  Thumbs=""
  Small="Small"
  Medium="Medium"
  Large="Large"
  Wallpaper="Wallpaper"

class ImgType(Enum):
  '''Image type filter for duckduckgo_scrape_urls().

  All means "no type filter"; any other member is sent as "type:<member name>".
  '''
  All=""
  Photo="photo"
  Clipart="clipart"
  Gif="gif"
  Transparent="transparent"

class ImgLayout(Enum):
  '''Image layout filter for duckduckgo_scrape_urls().

  All means "no layout filter"; any other member is sent as "layout:<member name>".
  '''
  All=""
  Square="Square"
  Tall="Tall"
  Wide="Wide"
  
class ImgColor(Enum):
  '''Image color filter for duckduckgo_scrape_urls().

  All means "no color filter"; any other member is sent as "color:<member name>".
  '''
  All=""
  Color="color"
  Monochrome="Monochrome"
  Red="Red"
  Orange="Orange"
  Yellow="Yellow"
  Green="Green"
  Blue="Blue"
  Purple="Purple"
  Pink="Pink" 
  Brown="Brown"
  Black="Black" 
  Gray="Gray" 
  Teal="Teal"
  White="White"

def duckduckgo_scrape_urls(keywords: str, max_results: int, 
                           img_size: ImgSize=ImgSize.Thumbs, 
                           img_type: ImgType=ImgType.Photo,
                           img_layout: ImgLayout=ImgLayout.Square,
                           img_color: ImgColor=ImgColor.All) -> list:
  '''Scrape image urls from duckduckgo image search.

  keywords:    the search query
  max_results: stop as soon as this many urls were collected; None collects
               every page the endpoint offers
  img_size:    ImgSize.Thumbs collects thumbnail urls without a size filter,
               any other member filters on size and collects full size image urls
  img_type, img_layout, img_color: optional filters, the `All` member disables them

  Returns a list of url strings, possibly shorter than max_results. If the
  endpoint keeps answering with something that is not JSON, the urls collected
  so far are returned after a bounded number of retries.
  Raises AssertionError when no search token is found in the landing page.
  '''
  BASE_URL = 'https://duckduckgo.com/'
  MAX_RETRIES = 5   # bound the retries so a blocked/throttled session cannot hang the notebook forever
  RETRY_DELAY = 5   # seconds to wait between attempts

  links = []
  # nothing requested: skip the network round trips entirely
  if(max_results is not None and max_results <= 0): return links

  # the landing page embeds a "vqd" token which the image endpoint requires
  resp = requests.post(BASE_URL, data={'q': keywords})
  match = re.search(r'vqd=([\d-]+)\&', resp.text, re.M|re.I)
  assert match is not None, "Failed to obtain search token"

  HEADERS = {
      'authority': 'duckduckgo.com',
      'accept': 'application/json, text/javascript, */*; q=0.01',
      'sec-fetch-dest': 'empty',
      'x-requested-with': 'XMLHttpRequest',
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
      'sec-fetch-site': 'same-origin',
      'sec-fetch-mode': 'cors',
      'referer': 'https://duckduckgo.com/',
      'accept-language': 'en-US,en;q=0.9',
  }

  # filter string is always "size,type,layout,color" with empty slots for unused filters
  # NOTE(review): the member *name* is sent (e.g. "type:Photo"), not its value ("photo") - confirm this is intended
  constraints = ""
  if(img_size != ImgSize.Thumbs): constraints +=  "size:" + img_size.name
  constraints += ","
  if(img_type != ImgType.All): constraints +=  "type:" + img_type.name
  constraints += ","
  if(img_layout != ImgLayout.All): constraints +=  "layout:" + img_layout.name
  constraints += ","
  if(img_color != ImgColor.All): constraints +=  "color:" + img_color.name
  
  PARAMS = (
      ('l', 'us-en'),
      ('o', 'json'),
      ('q', keywords),
      ('vqd', match.group(1)),
      ('f', constraints),
      ('p', '1'),
      ('v7exp', 'a'),
  )

  requestUrl = BASE_URL + "i.js"

  while True:
      for _ in range(MAX_RETRIES):
          try:
              resp = requests.get(requestUrl, headers=HEADERS, params=PARAMS)
              data = json.loads(resp.text)
              break
          except ValueError:
              # a body that is not JSON is treated as "throttled"
              print("Hit request throttle, sleeping and retrying")
              time.sleep(RETRY_DELAY)
      else:
          # every attempt failed: hand back what we have instead of retrying forever
          print("Giving up after", MAX_RETRIES, "attempts, returning", len(links), "urls")
          return links

      #result["thumbnail"] is normally big enough for most purposes
      #result["width"], result["height"] are for the full size img in result["image"]
      #result["image"] url to full size img on orig site (so may be less reliable) 
      #result["url"], result["title"].encode('utf-8') from the page the img came from
      
      for result in data.get("results", []):
        if(img_size == ImgSize.Thumbs): links.append(result["thumbnail"])
        else:                       links.append(result["image"])

        if(max_results is not None):
          if(len(links) >= max_results) : return links

      if "next" not in data:
          #no next page, all done
          return links

      requestUrl = BASE_URL + data["next"]

##########################################################################################
# searching & downloading
##########################################################################################
def google_search(label: str, keywords: str, max_results: int=100) -> None:
  '''Scrape google image results for `keywords` and download them into the folder for `label`.'''
  print("Google search: ", keywords)
  # scrape first, then hand the urls straight to the downloader
  download_urls(label, google_scrape_urls(keywords, max_results))

def duckduckgo_search(label: str, keywords: str, max_results: int=100,
                           img_size: ImgSize=ImgSize.Thumbs, 
                           img_type: ImgType=ImgType.Photo,
                           img_layout: ImgLayout=ImgLayout.Square,
                           img_color: ImgColor=ImgColor.All) -> None:
  '''Scrape duckduckgo image results for `keywords` and download them into the folder for `label`.

  The size/type/layout/color arguments are passed through unchanged to
  duckduckgo_scrape_urls().
  '''
  print("Duckduckgo search:", keywords)
  found = duckduckgo_scrape_urls(
      keywords,
      max_results,
      img_size=img_size,
      img_type=img_type,
      img_layout=img_layout,
      img_color=img_color,
  )
  download_urls(label, found)

def download_urls(label: str, links: list) -> None:
  '''Download every url in `links` into the folder BASE_FOLDER/<label>.

  Each download is written as a zero padded, sequentially numbered "NNN.jpg"
  file and then checked with PIL; files that do not verify as an image are
  deleted again. Failures are printed and skipped, they never abort the batch.
  Numbers that are already taken in the folder are never reused, so images
  from earlier searches are not overwritten.

  label: name of the sub-folder (created when missing)
  links: list of image urls; an empty list only prints a message
  '''
  if(len(links) == 0):
    print("Nothing to download!"); return

  folder = Path(BASE_FOLDER)/label
  folder.mkdir(parents=True, exist_ok=True)

  print("Downloading", len(links), "results into", folder)

  # max == number of links, so the bar is full exactly when the last link is done
  bar = widgets.IntProgress(value=0, min=0, max=len(links))
  display(bar)

  def mk_fp(n): return folder/(str(n).zfill(3) + ".jpg")

  i = 1
  for link in links:
      # advance to the next free number before EVERY file: earlier runs can
      # leave gaps (deleted invalid images), so checking once up front is not enough
      while mk_fp(i).exists(): i += 1
      fp = mk_fp(i)

      try:
        # timeout so one stalled server cannot hang the whole batch
        resp = requests.get(link, timeout=30)
        fp.write_bytes(resp.content)

        try:
          # context manager closes the file even when verify() raises
          with PImage.open(fp) as img:
            img.verify()
        except Exception:
          print(fp, "is invalid")
          fp.unlink()
      except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel still works
        print("Exception occured while retrieving", link)

      bar.value += 1

  bar.bar_style = "success"

def save_urls(filename: str, scrape_func: Callable, label: str, keywords: str, max_results: int) -> None:
  '''Run a search and append the resulting urls to a csv file.

  filename:    csv with the columns "URL" and "Label"; created when missing
  scrape_func: called as scrape_func(keywords, max_results), must return the urls
  label:       value written to the "Label" column of every new row
  '''
  # make sure there is a csv with the expected header to append to
  if not Path(filename).exists():
    pd.DataFrame(columns=["URL", "Label"]).to_csv(filename, index=False)

  new_rows = pd.DataFrame(
      [{"URL": found, "Label": label} for found in scrape_func(keywords, max_results)]
  )

  # existing rows first, new rows after; the whole file is rewritten
  combined = pd.concat([pd.read_csv(filename), new_rows])
  combined.to_csv(filename, index=False)

##########################################################################################
# moving files around
##########################################################################################
def download_file(filename: str) -> None:
  '''Trigger a file download from colab to the local system.

  Delegates to files.download (`files` is imported from google.colab).

  filename: path of the file to download
  '''
  files.download(filename)

def transfer_to_drive(filename: str, dest_folder: str="Datasets") -> None:
  '''Copy a file from the colab runtime to google drive.

  Mounts google drive at /content/drive, creates "My Drive/<dest_folder>" when
  it is missing and copies the file into it under its base name.

  filename:    path of the file to copy; may include directories
  dest_folder: folder below "My Drive" that receives the copy
  '''
  drive.mount("/content/drive") 
  folder = Path("/content/drive/My Drive")/dest_folder
  folder.mkdir(parents=True, exist_ok=True)

  # Join only the base name. Joining an absolute `filename` would discard
  # `folder` entirely (the file would be copied onto itself), and a relative
  # path with directories would point into a sub-folder that does not exist.
  shutil.copyfile(filename, str(folder/Path(filename).name))

In [3]:
# Delete all the files
!rm -r -f BASE_FOLDER/*

In [None]:
## Build the search
## Build your download

# change your zip name and run some searches
# help and options are in the hidden cell at the top

# options shared by every search below
params = {
    "max_results": 150,             # can go higher, 477 at the time of writing
    "img_size":    ImgSize.Thumbs, 
    "img_type":    ImgType.Photo,
    "img_layout":  ImgLayout.Square,
    "img_color":   ImgColor.All
}

ZIP_NAME = "images.zip" 

# label (name of the sub-folder) -> search keywords; edit this table to change the dataset
SEARCHES = {
    "grizzly": "grizzly",
    "teddy":   "teddy bear",
    "black":   "black bear",
    "polar":   "polar bear",
}

for label, keywords in SEARCHES.items():
    duckduckgo_search(label, keywords, **params)
# you can also use google_search() if you prefer or if the ddg code breaks.


Duckduckgo search: grizzly
Downloading 150 results into images/grizzly


IntProgress(value=0, max=149)

In [None]:
!rm -f {ZIP_NAME}
!zip -q -r {ZIP_NAME} images
download_file(ZIP_NAME)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# copy to google drive 
#transfer_to_drive(ZIP_NAME, dest_folder="Datasets")

In [None]:
!rm -r -f images/*