In [2]:
import time
import sys
import json
import re
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import multiprocessing as mp

In [3]:
# ---------------------------------------------------------------------------
# Notebook configuration.
#
# QUERY_SEARCH / QUERY_GROUPID select what get_photo_metadata asks Flickr for:
# a free-text search when QUERY_SEARCH is set, otherwise the pool of the group
# identified by QUERY_GROUPID.
# ---------------------------------------------------------------------------
ORIGINAL = False
QUERY_SEARCH = None
QUERY_GROUPURL = 'https://www.flickr.com/groups/scenery/pool/'
QUERY_GROUPID = '78249294@N00'
# Name of the Flickr "extras" field that carries the image URL to download.
IMG_RESOLUTION_URL = "url_o" if ORIGINAL else "url_l"  #url_c,url_l,url_m,url_n,url_q,url_s,url_sq,url_t,url_z

# Output locations: raw API pages, extracted (id, url) lists, and image files.
METADATA_JSON_FOLDER = './download/info'
URL_JSON_FOLDER = './download/url'
IMG_FOLDER = './download/newimages'

# Photo ids (file name up to the first '.') already present in IMG_FOLDER.
DOWNLOADED = set()


# API credentials are read from a local file rather than hard-coded here.
with open('credentials.json') as infile:
    creds = json.load(infile)
    KEY = creds['KEY']
    SECRET = creds['SECRET']

# exist_ok=True avoids the check-then-create race of a separate exists() test.
for folder in [METADATA_JSON_FOLDER, URL_JSON_FOLDER, IMG_FOLDER]:
    os.makedirs(folder, exist_ok=True)

# Use a descriptive loop-local name instead of `id`, which would shadow the
# builtin id() for every later cell of the notebook.
for filename in os.listdir(IMG_FOLDER):
    downloaded_photo_id = filename.split('.')[0]
    DOWNLOADED.add(downloaded_photo_id)

In [4]:
def get_photo_metadata(page=1):
    """Fetch one page of photo metadata from the Flickr REST API.

    The API method depends on the module-level configuration:
    ``flickr.photos.search`` (with ``text=QUERY_SEARCH``) when QUERY_SEARCH is
    set, otherwise ``flickr.groups.pools.getPhotos`` (with
    ``group_id=QUERY_GROUPID``) when QUERY_GROUPID is set. If neither is set,
    the request is sent without a ``method`` parameter.

    Args:
        page: Page number to request; up to 500 photos are requested per page.

    Returns:
        The decoded JSON body of the response.
    """
    params = {
        'content_type': '7',
        'per_page': '500',
        'media': 'photos',
        'format': 'json',
        'advanced': 1,
        'nojsoncallback': 1,
        # Ask for the image URL at the configured size plus original dimensions.
        'extras': f'{IMG_RESOLUTION_URL},o_dims',
        'page': page,
        'api_key': KEY
    }

    # NOTE: 'method' must be a plain string. A trailing comma after the
    # literal would silently turn it into a one-element tuple.
    if QUERY_SEARCH is not None:
        params['method'] = 'flickr.photos.search'
        params['text'] = QUERY_SEARCH
    elif QUERY_GROUPID is not None:
        params['method'] = 'flickr.groups.pools.getPhotos'
        params['group_id'] = QUERY_GROUPID

    results = requests.get('https://api.flickr.com/services/rest', params=params).json()
    return results


def get_group_id_from_url(url):
    """Look up the id of a Flickr group from its URL.

    Calls the ``flickr.urls.lookupGroup`` REST method and returns
    ``['group']['id']`` from the decoded JSON response.

    Args:
        url: Group URL to resolve, or None.

    Returns:
        The group id from the response, or None when ``url`` is None (no
        request is made in that case).
    """
    if url is not None:
        query = dict(
            method='flickr.urls.lookupGroup',
            url=url,
            format='json',
            api_key=KEY,
            nojsoncallback=1,
        )
        response = requests.get('https://api.flickr.com/services/rest', params=query)
        payload = response.json()
        return payload['group']['id']
    return None

In [7]:
def download_metadata(page):
    """Fetch the metadata of one result page and cache it as JSON on disk.

    The response is written to ``METADATA_JSON_FOLDER/page{page}.json``. If
    that file already exists the page is considered cached and nothing is
    fetched.

    A response without a ``'photos'`` entry is NOT written: the cached file is
    never refreshed once it exists, and save_photo_id_and_urls requires
    ``['photos']['photo']``, so persisting such a response would leave the
    page permanently unusable.

    Args:
        page: Page number to fetch.
    """
    file_path = os.path.join(METADATA_JSON_FOLDER, f'page{page}.json')
    if os.path.exists(file_path):
        return

    results = get_photo_metadata(page)
    if 'photos' not in results:
        # Leave no file behind so a later run retries this page.
        print(f'Metadata of page {page} not dumped, unexpected response: {results}')
        return

    with open(file_path, 'w') as json_file:
        json.dump(results, json_file)
    print(f'Metadata of page {page} dumped')


def save_photo_id_and_urls(page):
    """Extract (photo id, image URL) pairs of one page and cache them as JSON.

    Reads ``METADATA_JSON_FOLDER/page{page}.json``, keeps every photo that has
    the ``IMG_RESOLUTION_URL`` field, and writes the resulting pairs to
    ``URL_JSON_FOLDER/page{page}.json``. If the output file already exists it
    is loaded and returned instead of being rebuilt, so the function returns
    the pairs in both cases (previously the cached case returned None).

    Args:
        page: Page number whose metadata should be processed.

    Returns:
        A list of ``(photo_id, url)`` tuples.

    Raises:
        FileNotFoundError: If the metadata file of ``page`` does not exist and
            no cached URL file is available.
    """
    file_path = os.path.join(URL_JSON_FOLDER, f'page{page}.json')
    if os.path.exists(file_path):
        with open(file_path, 'r') as json_file:
            # JSON stores the pairs as lists; convert back to tuples so both
            # code paths return the same shape.
            return [tuple(pair) for pair in json.load(json_file)]

    with open(os.path.join(METADATA_JSON_FOLDER, f'page{page}.json'), 'r') as json_file:
        photos = json.load(json_file)['photos']['photo']

    # Photos lacking the requested URL field are skipped.
    photo_urls = [
        (photo['id'], photo[IMG_RESOLUTION_URL])
        for photo in photos
        if IMG_RESOLUTION_URL in photo
    ]

    with open(file_path, 'w') as json_file:
        json.dump(photo_urls, json_file)
    print(f'Photo urls of page {page} dumped')
    return photo_urls


def get_photo_id_and_urls(page):
    """Load the cached id/URL pairs of one page.

    Args:
        page: Page number; selects ``URL_JSON_FOLDER/page{page}.json``.

    Returns:
        The decoded JSON content of that file.
    """
    url_file = os.path.join(URL_JSON_FOLDER, f'page{page}.json')
    with open(url_file, 'r') as handle:
        return json.load(handle)


def download_file(photo_id_url_pair, timeout=30):
    """Download one image into IMG_FOLDER, skipping files that already exist.

    The file is named ``{photo_id}.{extension}``, where the extension is the
    text after the last '.' of the URL. Failures are reported with a printed
    message rather than raised, so a single bad URL does not abort a batch.

    The body is streamed into a temporary ``.part`` file that is renamed to
    its final name only after the download completed. The existence check at
    the top treats any file with the final name as complete, so a truncated
    download must never be left under that name.

    Args:
        photo_id_url_pair: Two-element sequence ``(photo_id, url)``.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection fails instead of blocking forever.
    """
    p_id, p_url = photo_id_url_pair
    extension = p_url.split('.')[-1]
    filename = '{}.{}'.format(p_id, extension)
    filepath = os.path.join(IMG_FOLDER, filename)
    if os.path.exists(filepath):
        return

    partial_path = filepath + '.part'
    try:
        # The context manager releases the streamed connection when done.
        with requests.get(p_url, stream=True, timeout=timeout) as r:
            # Do not store an HTTP error page as if it were the image.
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, filepath)
        print(f'{filename} downloaded')
    except Exception as e:
        # Discard the incomplete file so the next run retries this photo.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f'{filename} download failed, "{e}"')


def multi_thread_download_image(page, processes=4):
    """Download every image listed for one page using a pool of workers.

    Despite the name, the work is distributed over a ``multiprocessing.Pool``
    of worker processes, each running download_file on one id/URL pair.

    The pool is used as a context manager so its workers are torn down even
    when the download is interrupted or raises. Results are consumed so that
    an exception escaping download_file in a worker is re-raised here instead
    of being silently discarded.

    Args:
        page: Page number whose cached id/URL pairs should be downloaded.
        processes: Number of worker processes in the pool.
    """
    photo_id_and_urls = get_photo_id_and_urls(page)

    with mp.Pool(processes=processes) as pool:
        # Exhausting the iterator blocks until every download has finished,
        # before the pool is terminated on leaving the with-block.
        for _ in pool.imap_unordered(download_file, photo_id_and_urls):
            pass
    print(f'Multithread download of page {page} end.')

In [8]:
# get_photo_metadata only knows two query modes: a text search (QUERY_SEARCH)
# or a group pool (QUERY_GROUPID). Validate exactly those two settings;
# QUERY_GROUPURL is not read when the request is built.
if QUERY_SEARCH is None and QUERY_GROUPID is None:
    sys.exit('Must specify a search term or group id')

# Full pipeline for a range of pages (fetch metadata -> extract urls -> download):
# START_PAGE = 46
# FINAL_PAGE = 60
# for page in range(START_PAGE, FINAL_PAGE + 1):
#     download_metadata(page)
#     save_photo_id_and_urls(page)
#     multi_thread_download_image(page)

# Downloads the images of page 50 only. This reads URL_JSON_FOLDER/page50.json,
# so that file must already have been produced by save_photo_id_and_urls(50).
multi_thread_download_image(50)

Multithread download of page 50 end.
44896919255.jpg downloaded
31977773958.jpg downloaded
44031626560.jpg downloaded
44032156100.jpg downloaded
44363650285.jpg downloaded
30908913237.jpg downloaded
44032838170.jpg downloaded
31954094538.jpg downloaded
31976973248.jpg downloaded


Process ForkPoolWorker-12:
Process ForkPoolWorker-11:
Process ForkPoolWorker-10:
Process ForkPoolWorker-9:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/sephidator/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/sephidator/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/sephidator/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/sephidator/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/sephidator/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/Users/sephidator/opt/anaconda3/lib/python3.7/multiprocessing/process.

In [None]:
# photo_id_and_urls = []
# for page in range(1, 18 + 1):
#     photo_id_and_urls += get_photo_id_and_urls(page)
# 
# # multi_thread_download_image images
# pool = mp.Pool(processes=4)
# for photo_id, photo_url in photo_id_and_urls:
#     pool.apply_async(download_file, args=((photo_id, photo_url),))
# pool.close()
# pool.join()
# print(f'Multithread download of page end.')