In [2]:
import multiprocessing
import os
from io import BytesIO
from urllib import request
import pandas as pd
import re
import tqdm
from PIL import Image

# Proxy endpoints used for outgoing HTTP and HTTPS requests.
# NOTE(review): assumes a proxy is listening on 127.0.0.1:8123 — adjust or
# remove this setup when running without one.
proxies = {
    'https': 'https://127.0.0.1:8123',
    'http': 'http://127.0.0.1:8123'
}
# Construct a new opener that uses the proxy settings above.
opener = request.build_opener(request.ProxyHandler(proxies))
# Install the opener at module level so later request.urlopen() calls use it.
request.install_opener(opener)


# Input CSV and output directory. Exactly one DATA_FRAME/OUT_DIR pair is active;
# the commented-out lines are alternative datasets that can be swapped in.
#DATA_FRAME, OUT_DIR = pd.read_csv('data/train.csv'), 'data/train_img'  # recognition challenge
DATA_FRAME, OUT_DIR = pd.read_csv('data/test.csv'), 'data/test_img'  # recognition challenge

# DATA_FRAME, OUT_DIR = pd.read_csv('../input/index.csv'), '../input/index'  # retrieval challenge
# DATA_FRAME, OUT_DIR = pd.read_csv('../input/test.csv'), '../input/test'  # test data

# Preferences
TARGET_SIZE = 128  # images are stored as TARGET_SIZE x TARGET_SIZE pixels; also the size requested in the URL
IMG_QUALITY = 90  # JPEG quality passed to PIL when saving
NUM_WORKERS = 8  # number of worker processes in the download pool

In [3]:
# Tally the second-to-last '/'-separated segment of every URL and show the most
# frequent values; this is the segment that overwrite_urls() rewrites.
DATA_FRAME.url.apply(lambda x: x.split('/')[-2]).value_counts().head()

s1600          110434
s1600-w1600      4142
rj                196
s1600-w1280       182
s1600-w960        179
Name: url, dtype: int64

In [4]:
def overwrite_urls(df, target_size=None):
    """Rewrite the size segment of every image URL so a smaller image is requested.

    The second-to-last '/'-separated segment of each URL is inspected. When it
    starts with ``s<digits>`` (for example ``s1600`` or ``s1600-w1280``) the
    whole segment is replaced by ``s<target_size>``. URLs whose segment does
    not match, URLs with fewer than two segments and non-string values (such
    as NaN) are left unchanged.

    Args:
        df: DataFrame with a ``url`` column. The column is overwritten in
            place, so the caller's frame is modified.
        target_size: size to request in the URL. Defaults to the module-level
            ``TARGET_SIZE`` when omitted.

    Returns:
        The same ``df`` object, with its ``url`` column rewritten.
    """
    size = TARGET_SIZE if target_size is None else target_size
    size_segment = re.compile('s[0-9]+')
    replacement = 's{}'.format(size)

    def rewrite(url):
        # Missing URLs are not strings; pass them through so the downloader
        # reports them as failures instead of crashing here.
        if not isinstance(url, str):
            return url
        parts = url.split('/')
        # match() anchors at the start of the segment, so 's1600-w1280' also
        # qualifies and is replaced as a whole.
        if len(parts) >= 2 and size_segment.match(parts[-2]):
            parts[-2] = replacement
        return '/'.join(parts)

    # Series.map works element-wise and returns an empty Series for an empty
    # frame, unlike a row-wise DataFrame.apply.
    df['url'] = df['url'].map(rewrite)
    return df


def parse_data(df):
    """Return the first two columns of every row of ``df`` as a list.

    Each list item is the leading two-element slice of one row of
    ``df.values``; the rest of the file unpacks it as ``(key, url)``.
    """
    first_two_columns = df.values[:, :2]
    return list(first_two_columns)


def download_image(key_url):
    """Download one image, convert it to RGB, resize it and store it as a JPEG.

    Args:
        key_url: two-item sequence ``(key, url)``. ``key`` names the output
            file ``<OUT_DIR>/<key>.jpg``; ``url`` is fetched with urllib.

    Returns:
        0 when the file already exists or was stored successfully, 1 when any
        stage (download, parse, RGB conversion, resize, save) failed. Failures
        are reported with a printed warning rather than raised, so the return
        values can be summed into a failure count.
    """
    (key, url) = key_url
    filename = os.path.join(OUT_DIR, '{}.jpg'.format(key))

    if os.path.exists(filename):
        print('Image {} already exists. Skipping download.'.format(filename))
        return 0

    # `except Exception` (not a bare except) keeps the best-effort behaviour
    # but lets KeyboardInterrupt/SystemExit stop the worker.
    try:
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(url) as response:
            image_data = response.read()
    except Exception:
        print('Warning: Could not download image {} from {}'.format(key, url))
        return 1

    try:
        pil_image = Image.open(BytesIO(image_data))
    except Exception:
        print('Warning: Failed to parse image {}'.format(key))
        return 1

    try:
        pil_image_rgb = pil_image.convert('RGB')
    except Exception:
        print('Warning: Failed to convert image {} to RGB'.format(key))
        return 1

    try:
        # Resizes to a square; the aspect ratio is not preserved.
        pil_image_resize = pil_image_rgb.resize((TARGET_SIZE, TARGET_SIZE))
    except Exception:
        print('Warning: Failed to resize image {}'.format(key))
        return 1

    # Write to a temporary name and rename afterwards, so an interrupted or
    # failed save never leaves a truncated .jpg that a later run would skip
    # as "already exists".
    partial = filename + '.part'
    try:
        pil_image_resize.save(partial, format='JPEG', quality=IMG_QUALITY)
        os.replace(partial, filename)
    except Exception:
        try:
            os.remove(partial)
        except OSError:
            pass  # nothing was written, or it cannot be removed
        print('Warning: Failed to save image {}'.format(filename))
        return 1

    return 0


def loader(df):
    """Download every image listed in ``df`` using a pool of worker processes.

    Creates ``OUT_DIR`` if needed, turns ``df`` into work items with
    ``parse_data``, runs ``download_image`` on them in ``NUM_WORKERS``
    processes with a tqdm progress bar, and prints the number of failures.

    Args:
        df: DataFrame passed to ``parse_data`` to build the work items.

    Returns:
        None. The failure count is printed.
    """
    # makedirs creates missing parent directories and, with exist_ok=True,
    # does not fail when the directory already exists.
    os.makedirs(OUT_DIR, exist_ok=True)

    key_url_list = parse_data(df)
    pool = multiprocessing.Pool(processes=NUM_WORKERS)
    try:
        failures = sum(tqdm.tqdm(pool.imap_unordered(download_image, key_url_list),
                                 total=len(key_url_list)))
    finally:
        # Reached either after all results were consumed or on an
        # error/interrupt; in both cases stop the workers and wait for them
        # so no processes are leaked.
        pool.terminate()
        pool.join()
    print('Total number of download failures:', failures)

In [None]:
# Rewrite the URLs to request the TARGET_SIZE variant, then download everything
# into OUT_DIR. Note that overwrite_urls() modifies DATA_FRAME's url column in place.
loader(overwrite_urls(DATA_FRAME))

  0%|          | 30/117703 [00:03<4:04:53,  8.01it/s]



  0%|          | 40/117703 [00:04<3:48:03,  8.60it/s]



  0%|          | 42/117703 [00:05<4:05:11,  8.00it/s]



  0%|          | 48/117703 [00:05<4:02:31,  8.09it/s]



  0%|          | 103/117703 [00:12<3:50:55,  8.49it/s]



  0%|          | 184/117703 [00:21<3:51:56,  8.44it/s]

In [None]:
##Total number of download failures: 50570

##total of test: 108333/117703