# Resize image for training

In [1]:
from PIL import Image, UnidentifiedImageError
import os
import re
import sys
import concurrent
sys.path.insert(0, '../../')
from data_loader import resolve_env_variable, load_image_file_raw, get_image_files

Use pillow to resize images to 128x128 for training.
(images larger than 128x128 will be resized to 128x128, images smaller than will be ignored)

In [2]:
# risize an image using pillow
def resize_image(image, size, image_path):
    try:
      return image.resize(size)
    except OSError as e:
      print(e)
      if 'image file is truncated' in e.__repr__() or (e.strerror is not None and 'image file is truncated' in e.strerror):
        print("Image file is truncated. Skipping file: " + os.path.basename(image_path))
        return None
      raise e
    except SyntaxError as e:
      print(e)
      if 'broken PNG file' in e.__repr__() or (e.strerror is not None and 'broken PNG file' in e.strerror):
        print("Broken PNG file. Skipping file: " + os.path.basename(image_path))
        return None
      raise e

# save an image using pillow
def save_image(image, image_path):
    image.save(image_path)

# convert an image to grayscale using pillow
def convert_to_grayscale(image):
    return image.convert('L')

In [3]:
allow_env = True
path_to_images = resolve_env_variable('../../1_data_collection/.data', 'FILE_LOCATION', allow_env, 'IMAGE_FILE_LOCATION')
path_to_processed_images = resolve_env_variable('../../1_data_collection/.data', 'IMAGE_FILE_LOCATION', allow_env, 'FILE_LOCATION')
num_workers = 16

In [4]:
image_paths = get_image_files(path_to_images)

print(f"{len(image_paths)} files")

# do not scale images smaller than this threshold
SIZE_THRESHOLD = 500

already_small = 0

def do_resize(image_path):
  global already_small
  try:
    with load_image_file_raw(image_path) as image:
      if image.size[0] < SIZE_THRESHOLD or image.size[1] < SIZE_THRESHOLD:
        already_small += 1
        return
      resized_image = resize_image(image, (image.size[0]//4, image.size[1]//4), image_path)
      if resized_image is None:
        return
      # Save the resized image
      save_image(resized_image, os.path.join(path_to_processed_images, os.path.basename(image_path)))
  except UnidentifiedImageError as e:
    print("Cannot identify image file. Skipping file: " + os.path.basename(image_path))
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Download and log every 100 files using a generator
    # First initialize the generator
    current_file = 0
    for _ in executor.map(do_resize, image_paths):
      current_file += 1
      if (current_file and current_file % 1000 == 0) or current_file == len(image_paths):
        print('Resized ' + str(current_file - already_small) + ' files')

print('Skipped ' + str(already_small) + ' already resized files')

66125 files
image file is truncated
Image file is truncated. Skipping file: geoguessr_location_singleplayer_nq0achulcNHF9VRB_1.png
Resized 1000 files
Resized 2000 files
Resized 3000 files
image file is truncated
Image file is truncated. Skipping file: geoguessr_location_singleplayer_q80iZf04I90rStSd_3.png
Resized 4000 files
Resized 5000 files
Resized 6000 files
Resized 7000 files
Resized 8000 files
Resized 9000 files
Resized 10000 files
Resized 11000 files
Resized 12000 files
Resized 13000 files
Resized 14000 files
Skipped 0 already resized files
