In [None]:
# Connect to google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/'Georgia Tech'/'CS 7643'/'DL Project'/code/data/

/content/drive/.shortcut-targets-by-id/1Vhu7c9TtXf-INnjjgmUD94b4o35wSPWM/DL Project/code/data


In [None]:
import os
import shutil
import random
import math
from PIL import Image
import numpy as np
import pprint
from tqdm.notebook import tqdm

# Generate the Augmentations Dataset

### Helpers

In [None]:
def load_images(directory):
    """ Load images from a directory where images are stored in subfolders named after the artist.
        Returns a dictionary with artist names as keys and lists of image paths as values. """
    images = {}
    # Iterate over all directories within the main directory
    for artist in os.listdir(directory):
        artist_path = os.path.join(directory, artist)
        if os.path.isdir(artist_path):  # Ensure it's a directory
            # Iterate over all images in the artist's directory
            for filename in os.listdir(artist_path):
                if filename.endswith('.jpg'):
                    # Initialize the list if this is the first image found for the artist
                    if artist not in images:
                        images[artist] = []
                    # Append the full path of the image
                    images[artist].append(os.path.join(artist_path, filename))
    return images
def clear_directory(directory):
    """ Remove all files in the specified directory. """
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        if os.path.isfile(item_path):
            os.remove(item_path)
        elif os.path.isdir(item_path):
            shutil.rmtree(item_path)

def copy_input_to_output(input_directory, output_directory):
    """ Copies all folders and images from the input directory to the output directory,
        maintaining the directory structure. """

    # Ensure the output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Walk through the input directory
    for root, dirs, files in os.walk(input_directory):
        # For each directory in the input directory
        for name in dirs:
            src_path = os.path.join(root, name)
            dst_path = src_path.replace(input_directory, output_directory, 1)
            if not os.path.exists(dst_path):
                os.makedirs(dst_path)

        # For each file in the current directory
        total_files = [file for file in files if file.endswith('.jpg')]
        progress_bar = tqdm(total=len(total_files), desc=f'Copying images from {root}')
        for filename in total_files:
            source_path = os.path.join(root, filename)
            destination_path = source_path.replace(input_directory, output_directory, 1)
            if not os.path.exists(os.path.dirname(destination_path)):
                os.makedirs(os.path.dirname(destination_path))
            shutil.copy(source_path, destination_path)
            progress_bar.update(1)

        progress_bar.close()

    print("All images and directories have been copied to the output directory.")


In [None]:
def rotate_image(image, degrees):
    """ Rotate the image by a given degree and return the augmented image and the description of the operation. """
    return image.rotate(degrees, expand=True)

def scale_image(image, scale_factor):
    """ Scale the image by a given scale factor and return the augmented image and the description of the operation. """
    width, height = image.size
    return image.resize((int(width * scale_factor), int(height * scale_factor)))

def flip_image(image):
    """ Flip the image horizontally and return the augmented image and the description of the operation. """
    return image.transpose(Image.FLIP_LEFT_RIGHT)

def mixup_image(image1, image2):
    """ Mixup two images by resizing the second to the dimensions of the first, then averaging their arrays.
        This ensures compatibility between images of different dimensions and color modes.
        Converts grayscale images to RGB if necessary.
        Returns the mixed image and a description of the operation. """
    # Resize image2 to match the dimensions of image1
    image2 = image2.resize(image1.size, Image.LANCZOS)

    # Ensure both images are in the same color mode
    if image1.mode != image2.mode:
        if image1.mode == 'RGB' and image2.mode == 'L':
            # Convert image2 from grayscale to RGB
            image2 = image2.convert('RGB')
        elif image1.mode == 'L' and image2.mode == 'RGB':
            # Convert image1 from grayscale to RGB
            image1 = image1.convert('RGB')

    # Convert images to arrays
    array1 = np.array(image1)
    array2 = np.array(image2)

    # Mixup: average the arrays
    mixed_array = (array1 * 0.5 + array2 * 0.5).astype(np.uint8)

    # Convert the mixed array back to an image
    return Image.fromarray(mixed_array)

In [None]:
def augment_image(image_path, image_paths, augmentations, output_directory, artist_name):
    """ Apply a series of augmentations to the image and save it with a new filename that includes the augmentations applied in a specified directory. """
    image = Image.open(image_path)
    original_path = os.path.splitext(os.path.basename(image_path))[0]
    augmentation_descriptions = []

    for augmentation in augmentations:
        if augmentation == 'rotate':
            degrees = random.choice([90, 180, 270])
            image = rotate_image(image, degrees)
            description = f"rotate{degrees}"
        elif augmentation == 'scale':
            scale_factor = random.choice([0.75, 1.25])
            image = scale_image(image, scale_factor)
            description = f"scale{int(scale_factor*100)}"
        elif augmentation == 'flip':
            image = flip_image(image)
            description = "flip"
        elif augmentation == 'mixup' and len(image_paths) > 1:
            other_image_path = random.choice([path for path in image_paths if path != image_path])
            other_image = Image.open(other_image_path)
            image = mixup_image(image, other_image)
            description = "mixup"

        augmentation_descriptions.append(description)

    artist_directory = os.path.join(output_directory, artist_name)
    if not os.path.exists(artist_directory):
        os.makedirs(artist_directory)

    new_filename = f"{original_path}_{'_'.join(augmentation_descriptions)}.jpg"
    output_path = os.path.join(artist_directory, new_filename)
    image.save(output_path)
    return output_path

### Generation

The goal here is to get a more even number of images for the various artists along with more training data overall.

The ending number of images per artist will be the total number of images / the number of artists up to a maximum of 4 times that artist's original number of images. If the artist already has more than the ending number of images / the number of artists, they will have no augmented images added.

For example, take Caravaggio. He starts with 55 images. While the total number of images (30,000) / the number of artists (51) is 588, his maximum will be 55 * 4 = 220 images as not to dilute the training dataset with too many augmented images of his work.

Since Alfred Sisley starts with 259 images, he can go up to the 588 image limit since 259 * 4 = 1036 images.

Since Vincent Van Gogh has 877 images, he will have no augmented images added to the dataset.

In [None]:
input_directory = 'original_data'
# input_directory = 'sample/Claude_Monet'
output_directory = 'augmented_data' # DO NOT UNCOMMENT - will delete existing dataset here

# Ensure output directory exists
if os.path.exists(output_directory):
    # Clear existing files in the output directory
    clear_directory(output_directory)
else:
    os.makedirs(output_directory)


images_by_artist = load_images(input_directory)
original_images_by_artist = images_by_artist.copy()

total_images = sum(len(imgs) for imgs in images_by_artist.values())
target_total_images = total_images * 3

# Calculate the target number of images per artist based on their initial count
artist_targets = {artist: max(len(imgs), min(len(imgs) * 4, int(target_total_images / len(images_by_artist)))) for artist, imgs in images_by_artist.items()}
all_augmented_images = []

debug = False
if debug:
  artist_counts = {artist: len(imgs) for artist, imgs in images_by_artist.items()}
  pprint.pprint(artist_counts)

Generate the images

In [None]:
# Prioritize artists who are furthest from their target
progress_bar = tqdm(total=target_total_images - total_images, desc='Generating Images')
while any(len(images_by_artist[artist]) < target for artist, target in artist_targets.items()):
    for artist, target in sorted(artist_targets.items(), key=lambda x: len(images_by_artist[x[0]])/x[1]):
        if len(all_augmented_images) >= target_total_images:
            break
        # Always use the original images for generating new augmented images
        image_paths = original_images_by_artist[artist]
        augmentations = random.sample(['rotate', 'scale', 'flip', 'mixup'], k=random.choice([1, 2]))
        chosen_image = random.choice(image_paths)
        new_image = augment_image(chosen_image, image_paths, augmentations, output_directory, artist)
        all_augmented_images.append(new_image)
        images_by_artist[artist].append(new_image)

        progress_bar.update(1)
progress_bar.close()

Generating Images:   0%|          | 0/8066 [00:00<?, ?it/s]

In [None]:
copy_input_to_output(input_directory, output_directory)

Copying images from original_data: 0it [00:00, ?it/s]

Copying images from original_data/Vincent_van_Gogh:   0%|          | 0/877 [00:00<?, ?it/s]

Copying images from original_data/Rembrandt:   0%|          | 0/262 [00:00<?, ?it/s]

Copying images from original_data/Salvador_Dali:   0%|          | 0/139 [00:00<?, ?it/s]

Copying images from original_data/Sandro_Botticelli:   0%|          | 0/164 [00:00<?, ?it/s]

Copying images from original_data/Peter_Paul_Rubens:   0%|          | 0/141 [00:00<?, ?it/s]

Copying images from original_data/Pierre-Auguste_Renoir:   0%|          | 0/336 [00:00<?, ?it/s]

Copying images from original_data/Piet_Mondrian:   0%|          | 0/84 [00:00<?, ?it/s]

Copying images from original_data/Raphael:   0%|          | 0/109 [00:00<?, ?it/s]

Copying images from original_data/Michelangelo:   0%|          | 0/49 [00:00<?, ?it/s]

Copying images from original_data/Pablo_Picasso:   0%|          | 0/439 [00:00<?, ?it/s]

Copying images from original_data/Paul_Gauguin:   0%|          | 0/311 [00:00<?, ?it/s]

Copying images from original_data/Jan_van_Eyck:   0%|          | 0/81 [00:00<?, ?it/s]

Copying images from original_data/Jackson_Pollock:   0%|          | 0/24 [00:00<?, ?it/s]

Copying images from original_data/Leonardo_da_Vinci:   0%|          | 0/143 [00:00<?, ?it/s]

Copying images from original_data/Frida_Kahlo:   0%|          | 0/120 [00:00<?, ?it/s]

Copying images from original_data/Gustav_Klimt:   0%|          | 0/117 [00:00<?, ?it/s]

Copying images from original_data/Claude_Monet:   0%|          | 0/73 [00:00<?, ?it/s]

Copying images from original_data/Andy_Warhol:   0%|          | 0/181 [00:00<?, ?it/s]

Copying images from original_data/Caravaggio:   0%|          | 0/55 [00:00<?, ?it/s]

Copying images from original_data/Albrecht_Dürer:   0%|          | 0/328 [00:00<?, ?it/s]

All images and directories have been copied to the output directory.


In [None]:
!ls augmented_data

# [len(images_by_artist[artist]) < target for artist, target in artist_targets.items()]
# artist_targets.items()
# print(len(all_augmented_images), target_total_images)
# len(images_by_artist['Claude_Monet'])

Albrecht_Dürer	Frida_Kahlo	 Leonardo_da_Vinci  Peter_Paul_Rubens	   Rembrandt
Andy_Warhol	Gustav_Klimt	 Michelangelo	    Pierre-Auguste_Renoir  Salvador_Dali
Caravaggio	Jackson_Pollock  Pablo_Picasso	    Piet_Mondrian	   Sandro_Botticelli
Claude_Monet	Jan_van_Eyck	 Paul_Gauguin	    Raphael		   Vincent_van_Gogh


In [None]:
!ls augmented_data/Albrecht_Dürer | wc -l

759


# Generate the Over/Under Sampled Dataset

In [None]:
def calculate_target(images_by_artist, strategy='mean'):
    """Calculate target number of images per artist based on a strategy."""
    image_counts = [len(images) for images in images_by_artist.values()]
    if strategy == 'mean':
        return int(sum(image_counts) / len(image_counts))
    elif strategy == 'median':
        return int(sorted(image_counts)[len(image_counts) // 2])
    elif strategy == 'max':
        return max(image_counts)
    elif strategy == 'min':
        return min(image_counts)
    else:
        raise ValueError("Unsupported strategy. Choose from 'mean', 'median', 'max', or 'min'.")

In [None]:
def oversample_images(images_by_artist, target, output_directory):
    """Oversample images by artist to reach the target count."""
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    progress_bar = tqdm(total=sum(max(0, target - len(paths)) for paths in images_by_artist.values()), desc='Oversampling Images')

    for artist, images in images_by_artist.items():
        current_count = len(images)
        if current_count < target:
            needed = target - current_count
            artist_directory = os.path.join(output_directory, artist)  # Path to the artist's specific folder
            if not os.path.exists(artist_directory):
                os.makedirs(artist_directory)  # Create the folder if it doesn't exist

            for _ in range(needed):
                chosen_image = random.choice(images)
                # Generate a unique file name for the copied image
                new_filename = f"{artist}_{random.randint(1000, 9999)}.jpg"
                destination_path = os.path.join(artist_directory, new_filename)
                shutil.copy(chosen_image, destination_path)
                progress_bar.update(1)

    progress_bar.close()
    print("Oversampling complete.")

In [None]:
# !rm -rf oversampled_data # Uncomment as needed

In [None]:
input_directory = 'original_data'
output_directory = 'oversampled_data'

images_by_artist = load_images(input_directory)
target_count = calculate_target(images_by_artist, 'mean')
oversample_images(images_by_artist, target_count, output_directory)

Oversampling Images:   0%|          | 0/1334 [00:00<?, ?it/s]

Oversampling complete.


In [51]:
copy_input_to_output(input_directory, output_directory)

Copying images from original_data: 0it [00:00, ?it/s]

Copying images from original_data/Vincent_van_Gogh:   0%|          | 0/877 [00:00<?, ?it/s]

Copying images from original_data/Rembrandt:   0%|          | 0/262 [00:00<?, ?it/s]

Copying images from original_data/Salvador_Dali:   0%|          | 0/139 [00:00<?, ?it/s]

Copying images from original_data/Sandro_Botticelli:   0%|          | 0/164 [00:00<?, ?it/s]

Copying images from original_data/Peter_Paul_Rubens:   0%|          | 0/141 [00:00<?, ?it/s]

Copying images from original_data/Pierre-Auguste_Renoir:   0%|          | 0/336 [00:00<?, ?it/s]

Copying images from original_data/Piet_Mondrian:   0%|          | 0/84 [00:00<?, ?it/s]

Copying images from original_data/Raphael:   0%|          | 0/109 [00:00<?, ?it/s]

Copying images from original_data/Michelangelo:   0%|          | 0/49 [00:00<?, ?it/s]

Copying images from original_data/Pablo_Picasso:   0%|          | 0/439 [00:00<?, ?it/s]

Copying images from original_data/Paul_Gauguin:   0%|          | 0/311 [00:00<?, ?it/s]

Copying images from original_data/Jan_van_Eyck:   0%|          | 0/81 [00:00<?, ?it/s]

Copying images from original_data/Jackson_Pollock:   0%|          | 0/24 [00:00<?, ?it/s]

Copying images from original_data/Leonardo_da_Vinci:   0%|          | 0/143 [00:00<?, ?it/s]

Copying images from original_data/Frida_Kahlo:   0%|          | 0/120 [00:00<?, ?it/s]

Copying images from original_data/Gustav_Klimt:   0%|          | 0/117 [00:00<?, ?it/s]

Copying images from original_data/Claude_Monet:   0%|          | 0/73 [00:00<?, ?it/s]

Copying images from original_data/Andy_Warhol:   0%|          | 0/181 [00:00<?, ?it/s]

Copying images from original_data/Caravaggio:   0%|          | 0/55 [00:00<?, ?it/s]

Copying images from original_data/Albrecht_Dürer:   0%|          | 0/328 [00:00<?, ?it/s]

All images and directories have been copied to the output directory.


In [52]:
!ls oversampled_data
!ls oversampled_data | wc -l

!ls original_data/Claude_Monet | wc -l
!ls oversampled_data/Claude_Monet | wc -l

!ls original_data/Caravaggio | wc -l
!ls oversampled_data/Caravaggio | wc -l

Albrecht_Dürer	Frida_Kahlo	 Leonardo_da_Vinci  Peter_Paul_Rubens	   Rembrandt
Andy_Warhol	Gustav_Klimt	 Michelangelo	    Pierre-Auguste_Renoir  Salvador_Dali
Caravaggio	Jackson_Pollock  Pablo_Picasso	    Piet_Mondrian	   Sandro_Botticelli
Claude_Monet	Jan_van_Eyck	 Paul_Gauguin	    Raphael		   Vincent_van_Gogh
20
73
201
55
200
