We need to be able to tell if we've already saved an image, and only save images we haven't seen before.

In [1]:
import hashlib
import os

import requests
def request_and_write_image(url, timeout=10):
    """Download ``url`` into the ``thumbnails/`` folder.

    The file is named after the MD5 hex digest of the URL. The last path
    segment of the URL is not used because Google won't give you the real
    URL of the image.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled server cannot hang the
        notebook forever.

    Returns
    -------
    str or None
        The bare file name (without the ``thumbnails/`` prefix) on success.
        ``None`` if the request failed, the response status was not OK, or
        the connection broke while the body was being streamed.
    """
    try:
        r = requests.get(url, stream=True, timeout=timeout)
    except Exception as e:
        print(url, e)
        return None

    # With stream=True the connection stays checked out until the response is
    # closed; the context manager releases it on every exit path.
    with r:
        if not r.ok:
            return None

        fname = hashlib.md5(url.encode("utf-8")).hexdigest()
        path = 'thumbnails/' + fname
        try:
            with open(path, 'wb') as f:
                for block in r.iter_content(1024):
                    if not block:
                        break
                    f.write(block)
        except requests.RequestException as e:
            # The download died part-way through: don't leave a truncated
            # image behind for later cells to trip over.
            print(url, e)
            if os.path.exists(path):
                os.remove(path)
            return None
    return fname

In [2]:
import imagehash
from PIL import Image

In [3]:
import os
# Create the output folder if it is missing. exist_ok=True makes the cell safe
# to re-run, while real failures (e.g. permission errors) are no longer
# silently swallowed by a bare `except`.
os.makedirs('thumbnails', exist_ok=True)

## testing different hashing algorithms

Method: 
- search for something whose results are likely to contain several copies of the same picture with different byte values
- visualize them with ipyplot
- group the pictures by hash

In [4]:
from bs4 import BeautifulSoup
import ipyplot
def get_urlset(search_term, timeout=10):
    """Return the image URLs found on a Google image-search results page.

    Parameters
    ----------
    search_term : str
        The query text. It is sent via ``params`` so that spaces and
        characters such as ``&`` or ``#`` are URL-encoded correctly.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so the call cannot block indefinitely.

    Returns
    -------
    list of str
        The ``src`` attribute of every ``<img>`` tag whose ``src`` starts
        with ``http``. Tags with no ``src`` at all are skipped.
    """
    r = requests.get(
        'https://www.google.com/search',
        params={'q': search_term, 'tbm': 'isch'},
        timeout=timeout,
    )
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')
    sources = (tag.get('src') for tag in soup.find_all('img'))
    # `src` is None for <img> tags without the attribute; guard before testing it.
    return [src for src in sources if src and src.startswith('http')]

In [5]:
def hash_image(fname):
    """Return the ``imagehash.phash`` hash of the image stored at ``fname``.

    The image is opened in a context manager so the underlying file handle
    is closed as soon as the hash has been computed.
    """
    with Image.open(fname) as im:
        return imagehash.phash(im)

In [6]:
# Thumbnail URLs from the results page for our test query.
urls = get_urlset(search_term='Milk Drop Coronet')

Save them all into the `thumbnails` folder under a temporary name (the MD5 hash of each URL).

In [7]:
# request_and_write_image returns None when a download fails; drop those so we
# never build a bogus 'thumbnails/None' path that would break hashing later.
saved_names = [request_and_write_image(url) for url in urls]
fnames = [f'thumbnails/{name}' for name in saved_names if name is not None]

Let's sort the images by hash so we see which ones are similar.

In [8]:
# Pair every file with the string form of its hash, then order the pairs by
# that string so files with similar hash strings sit next to each other.
hashed_pairs = [(path, str(hash_image(path))) for path in fnames]
sorted_by_hash = sorted(hashed_pairs, key=lambda pair: pair[1])

In [9]:
# Peek at two of the (filename, hash) pairs.
sorted_by_hash[1:3]

[('thumbnails/d26dd9e1ec298f2ede4f6cc0c2d95c48', '8f7ac53a659863a4'),
 ('thumbnails/82069861ee5d37648f4bde7ee7ea2f9c', '916d3a33659a26f1')]

Of course, search engines employ image hashing as well, and Google probably filters out results of the exact same image.

One issue is that we don't know the type of each image we've saved. We need to know it so we can give the file the proper extension, which lets anything that receives the filename in an API query open it.

In [10]:
import imghdr

In [11]:
import os
# Give every downloaded file an extension matching its detected image type.
# NOTE: imghdr is deprecated and was removed from the standard library in
# Python 3.13; this cell needs an older interpreter or a replacement detector.
for fname in os.listdir('thumbnails'):
    path = 'thumbnails/' + fname

    # Files that already have an extension were handled by a previous run of
    # this cell; skipping them avoids names like 'abc.jpg.jpg'.
    if os.path.splitext(fname)[1]:
        continue

    ext = imghdr.what(path)
    if ext is None:
        # Unrecognised content: leave the file alone rather than renaming it
        # to '<name>.None'.
        print('unrecognised image type, left unchanged:', fname)
        continue
    if ext == 'jpeg':
        ext = 'jpg'
    try:
        os.rename(path, f'{path}.{ext}')
    except FileExistsError:
        pass

We're also going to want to resize them to cap the size of the photos we're storing.

In [12]:
# Upper bound on width * height for any stored image (1 megapixel).
MAX_PIXELS = 1_000_000

for fname in os.listdir('thumbnails'):
    path = 'thumbnails/' + fname

    # The context manager closes the source file before we overwrite it below.
    with Image.open(path) as im:
        width, height = im.size
        fmt = im.format  # remembered so saving works even without an extension

        # Halve both dimensions until the pixel count is within the cap.
        # max(1, ...) keeps an extremely skewed image from reaching a 0 side.
        changed = False
        while width * height > MAX_PIXELS:
            width = max(1, width // 2)
            height = max(1, height // 2)
            changed = True

        # resize() returns a new image, so it is still usable after `im`
        # is closed. (The original passed an undefined name, `newsize`.)
        resized = im.resize((width, height)) if changed else None

    if resized is not None:
        resized.save(path, format=fmt)
        print('resized', fname)

In [14]:
# Re-list the folder (the names on disk may have changed since the earlier
# `fnames` was built) and show everything in it.
fnames = [f'thumbnails/{entry}' for entry in os.listdir('thumbnails')]
ipyplot.plot_images(fnames, img_width=200)