We need to be able to tell if we've already saved an image, and only save images we haven't seen before.

In [1]:
import hashlib
import os

import requests
def request_and_write_image(url, timeout=10):
    """Download `url` and save the response body as a file under ``temp/``.

    The file is named after the MD5 hex digest of the URL (not of the image
    bytes): the last path segment of the URL is not usable as a filename
    because Google does not hand out the real image URL.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait on the server before giving up; passed
            straight to ``requests.get``. Without it a stalled server would
            block the notebook indefinitely.

    Returns:
        The bare filename (no directory prefix, no extension) of the saved
        file, or ``None`` if the request raised or the status was not OK.
    """
    try:
        r = requests.get(url, stream=True, timeout=timeout)
    except Exception as e:
        # Best effort: report the failing URL and let the caller carry on.
        print(url, e)
        return None
    # A streamed response keeps its connection checked out until it is
    # closed, so make sure that happens on every exit path.
    with r:
        if not r.ok:
            return None
        fname = hashlib.md5(url.encode("utf-8")).hexdigest()
        # Create the output directory on first use so a fresh kernel works.
        os.makedirs('temp', exist_ok=True)
        with open('temp/' + fname, 'wb') as f:
            for block in r.iter_content(1024):
                if not block:
                    break
                f.write(block)
    return fname

In [2]:
import imagehash
from PIL import Image

## testing different hashing algorithms

Method: 
- search for a term whose results are likely to include several copies of the same picture that differ in their byte values
- visualize them with ipyplot
- group the pictures by hash

In [3]:
from bs4 import BeautifulSoup
import ipyplot
def get_urlset(search_term, timeout=10):
    """Return the absolute image URLs found on a Google image-search page.

    Args:
        search_term: Free-text query. It is sent through ``params`` so that
            spaces and reserved characters (``&``, ``#``, ...) are
            URL-encoded instead of corrupting the query string.
        timeout: Seconds to wait on the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list with the ``src`` attribute of every ``<img>`` tag whose
        ``src`` starts with ``http``, in document order.
    """
    r = requests.get(
        'https://www.google.com/search',
        params={'q': search_term, 'tbm': 'isch'},
        timeout=timeout,
    )
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')
    sources = (tag.get('src') for tag in soup.find_all('img'))
    # <img> tags without a src yield None; skip them instead of slicing None.
    return [src for src in sources if src and src.startswith('http')]

In [32]:
def hash_image(fname):
    """Return the ``imagehash.phash`` hash of the image stored at `fname`.

    The image is opened in a ``with`` block so its file handle is released
    as soon as the hash has been computed, rather than whenever the garbage
    collector gets to it.
    """
    with Image.open(fname) as img:
        return imagehash.phash(img)

In [33]:
urls = get_urlset('Milk Drop Coronet')
# request_and_write_image returns None when a download fails; drop those
# so we never build a bogus 'temp/None' path for the hashing step.
saved_names = [request_and_write_image(url) for url in urls]
fnames = [f'temp/{name}' for name in saved_names if name]

Let's sort the images by hash so we see which ones are similar.

In [34]:
# Pair every saved file with the string form of its hash, then order the
# pairs by that string.
hashed_files = [(fname, str(hash_image(fname))) for fname in fnames]
sorted_by_hash = sorted(hashed_files, key=lambda pair: pair[1])

In [35]:
# Peek at one entry to check the (filename, hash string) tuple layout.
sorted_by_hash[1]

('temp/d26dd9e1ec298f2ede4f6cc0c2d95c48', '8f7ac53a659863a4')

In [36]:
# Take both the images and their labels from sorted_by_hash so each picture
# is shown next to its own hash and similar hashes end up side by side.
ipyplot.plot_images(
    [fname for fname, _ in sorted_by_hash],
    [img_hash for _, img_hash in sorted_by_hash],
    img_width=150,
)

Of course, search engines employ image hashing as well, and Google probably filters out results of the exact same image.

One remaining issue is that we don't know the type of each image we've saved. We need it so that we can give the file the proper extension, letting anything that receives the filename from an API query open it.

In [37]:
import imghdr

In [40]:
import os
# Report the image type that imghdr detects for every file under temp/.
for saved_name in os.listdir('temp'):
    detected_type = imghdr.what(f'temp/{saved_name}')
    print(saved_name, detected_type)

007bc4d1ffb86ba15347818bec2d6d8b jpeg
02ce873a9fd79585333a2b27b107aeae jpeg
03d887a0b0d9ac95ab8b0af2fd51efe9 jpeg
047e27959cf9b4e13f112d3e1ee07d90 jpeg
057b36f97f6649772fc542b4f0b308bf jpeg
072944dca363e5b5c5ed521c3d7d986e jpeg
073bf4012a1e351a4d32eb050f43939f jpeg
075e98c23994320ab0777cb8bf79a734 jpeg
07d458ae262daca51e05c712a60111ec jpeg
0c6e9d4c1585649260e4814fa4a735be jpeg
1043f47d9621bb22a949442b33eb59ce jpeg
11388aa950da672c7c0244e3f5aa7d54 jpeg
154c930e51f1930ac575086b00608e58 jpeg
179046e913b5295740142b09d0dd3ea5 jpeg
1857d69d4cbd11051393f86845ebf3d6 jpeg
1861bd1e2c80c9d64702d6760fbae0f1 jpeg
1ae1f8cdc1d4100269996023c11efd44 jpeg
1d074865014710db22b30d9ae5bc3478 jpeg
259d4cf47ff49d349efa0fb6bcda9367 jpeg
25e5d8d60f8312596e2a81ee8f8ca457 jpeg
25eba361f8711c0da1af124df1710293 jpeg
2793c4a9eb084a6f9488e3160063d7a2 jpeg
28625864bbf5231d96eee804a048a2b6 jpeg
288fa8f48cf14678a67dc54f9e6d12a0 jpeg
309f9b08acd1ae3660fdbc1e592f4bb7 jpeg
3173c73e172e5c152b642dc724cf9dea jpeg
38d45bfe999d