In [1]:
import os
import tarfile
import urllib.request
import random
import shutil

# Function to download, extract, create a subset, and compress it
def download_and_extract_dataset(dataset_url, tar_file_path, dataset_path, num_images_per_class=60):
    if not os.path.exists(tar_file_path):
        print(f"Downloading the dataset from {dataset_url}...")
        urllib.request.urlretrieve(dataset_url, tar_file_path)
        print("Download complete!")

    print("Extracting the dataset...")
    with tarfile.open(tar_file_path, 'r') as tar_ref:
        tar_ref.extractall(path=dataset_path)
    print("Extraction complete!")

    print("Creating a subset...")
    subset_path = os.path.join(dataset_path, 'subset')
    os.makedirs(subset_path, exist_ok=True)

    # Adjusting for the correct path within the extracted directory structure
    dataset_path = os.path.join(dataset_path, 'Images')

    breed_dirs = [d for d in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, d))]

    for breed_dir in breed_dirs:
        breed_path = os.path.join(dataset_path, breed_dir)
        images = [img for img in os.listdir(breed_path) if os.path.isfile(os.path.join(breed_path, img))]
        selected_images = random.sample(images, min(len(images), num_images_per_class))

        target_breed_dir = os.path.join(subset_path, breed_dir)
        os.makedirs(target_breed_dir, exist_ok=True)

        for image in selected_images:
            src = os.path.join(breed_path, image)
            dst = os.path.join(target_breed_dir, image)
            shutil.copy(src, dst)
    print("Subset created.")

    # Compress the subset directory
    compressed_subset_name = os.path.join(subset_path, 'subset.zip')
    shutil.make_archive(subset_path, 'zip', subset_path)
    print(f"Compressed subset directory into {compressed_subset_name}")

    # Users should manually download the subset or you can automate this if needed
    print(f"You can find the compressed subset at: {compressed_subset_name}")

# Specify dataset URL, path to save the .tar file, and the path for dataset extraction
dataset_url = "http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar"
tar_file_path = "images.tar"
dataset_path = "images"

# Download, extract, create a subset, and compress it
download_and_extract_dataset(dataset_url, tar_file_path, dataset_path)

Downloading the dataset from http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar...
Download complete!
Extracting the dataset...
Extraction complete!
Creating a subset...
Subset created.
Compressed subset directory into images/subset/subset.zip
You can find the compressed subset at: images/subset/subset.zip
