In [1]:
import torch
import torchvision.transforms as transforms
import os
from torchvision.datasets import ImageFolder
from torchvision.transforms import Normalize
import random
from PIL import Image
import wandb
import shutil

In [2]:
import os
import random
from PIL import Image
from torchvision import transforms

def apply_random_transformations(image, random_transforms):
    """Applies random transformations to the input image.

    Args:
        image (PIL.Image): The input image.
        random_transforms (list): List of random transformations to apply.

    Returns:
        PIL.Image: The transformed image.
    """
    transform = transforms.Compose(random_transforms)
    return transform(image)

def augment_images(dataset_dir: str, output_dir: str, num_transformations=100):
    """Function to augment images in the dataset and save them in class-specific folders.

    Args:
        dataset_dir (str): The directory containing the images to augment.
        output_dir (str): The directory to save the augmented images to.
        num_transformations (int, optional): The number of images to transform for each image. Defaults to 100.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Additional transformations for augmentation
    additional_transforms = [
        transforms.RandomApply([
            transforms.Grayscale(num_output_channels=3),  # Convert to grayscale (3 channels)
        ], p=0.5),
    ]
    
    # Define the transformations for augmentation
    augment_transform = [
        transforms.Resize((256, 256)),
        transforms.RandomChoice([
            transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0)),  # Zoom in
            transforms.RandomCrop(size=(224, 224), padding=20),  # Zoom out
        ]),
        transforms.RandomApply([
            transforms.ColorJitter(
                brightness=random.uniform(0.1, 0.5),
                contrast=random.uniform(0.1, 0.5),
                saturation=random.uniform(0.1, 0.5),
                hue=random.uniform(0.1, 0.5),
            ),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(degrees=random.uniform(10, 20)),
        ], p=0.5),
        *additional_transforms,
    ]

    # Walk through the directory tree (including subdirectories)
    for root, _, filenames in os.walk(dataset_dir):
        for filename in filenames:
            # Extract the coin name from the filename
            coin_name = os.path.splitext(filename)[0]

            # Create a folder for the coin name in the output directory if it doesn't exist
            coin_folder = os.path.join(output_dir, os.path.relpath(root, dataset_dir), coin_name)
            os.makedirs(coin_folder, exist_ok=True)

            # Load the image
            image_path = os.path.join(root, filename)
            with Image.open(image_path).convert("RGB") as image:
                # Apply random transformations to the original image and save augmented images
                for j in range(num_transformations):
                    # Generate a new filename for the augmented image with the label included
                    augmented_filename = f"{coin_name}_augmented_{j}.jpg"

                    # Determine the output path for the augmented image
                    augmented_filepath = os.path.join(coin_folder, augmented_filename)

                    # Apply random transformations
                    random_transformations = random.sample(augment_transform, random.randint(1, len(augment_transform)))
                    augmented_image = apply_random_transformations(image, random_transformations)

                    # Save the augmented image
                    augmented_image.save(augmented_filepath)

    print("Augmentation complete.")

dataset_dir = "/workspaces/AICoinXpert/algo/webscraping/data/selected_coins_above20"
augment_images(dataset_dir=dataset_dir, output_dir="/workspaces/AICoinXpert/algo/webscraping/data/pool_20_above_filtered", num_transformations=500)


ValueError: Required crop size (224, 224) is larger than input image size (180, 180)

In [2]:
import os
import random
import shutil
from typing import List

def split_images(main_directory: str, output_directory: str, train_ratio: float = 0.7, test_ratio: float = 0.2):
    """
    Split the images from main_directory into train, test, and eval sets and organize them in output_directory.

    Args:
        main_directory (str): The directory containing the class folders.
        output_directory (str): The directory to save the train, test, and eval folders.
        train_ratio (float, optional): The ratio of images to include in the train set. Defaults to 0.7.
        test_ratio (float, optional): The ratio of images to include in the test set. Defaults to 0.2.
    """
    # Create train, test, and eval folders in the output directory
    train_dir = os.path.join(output_directory, "train")
    test_dir = os.path.join(output_directory, "test")
    eval_dir = os.path.join(output_directory, "eval")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)

    # Get a list of class folders in the main directory
    class_folders = [folder for folder in os.listdir(main_directory) if os.path.isdir(os.path.join(main_directory, folder))]

    # Iterate through each class folder
    for class_folder in class_folders:
        class_directory = os.path.join(main_directory, class_folder)

        # Get the list of image filenames in the class folder
        image_filenames = os.listdir(class_directory)

        # Shuffle the image filenames
        random.shuffle(image_filenames)

        # Calculate the number of images for each split
        total_images = len(image_filenames)
        num_train = int(total_images * train_ratio)
        num_test = int(total_images * test_ratio)
        num_eval = total_images - num_train - num_test

        # Create the output directories for the current class in train, test, and eval folders
        train_class_dir = os.path.join(train_dir, class_folder)
        test_class_dir = os.path.join(test_dir, class_folder)
        eval_class_dir = os.path.join(eval_dir, class_folder)
        os.makedirs(train_class_dir, exist_ok=True)
        os.makedirs(test_class_dir, exist_ok=True)
        os.makedirs(eval_class_dir, exist_ok=True)

        # Move images to the respective folders based on the split ratio
        for i, filename in enumerate(image_filenames):
            source_filepath = os.path.join(class_directory, filename)

            if i < num_train:
                destination_filepath = os.path.join(train_class_dir, filename)
            elif i < num_train + num_test:
                destination_filepath = os.path.join(test_class_dir, filename)
            else:
                destination_filepath = os.path.join(eval_class_dir, filename)

            shutil.move(source_filepath, destination_filepath)

    print("Splitting and organizing images complete.")

# Usage example:
split_images(main_directory="/workspaces/AICoinXpert/algo/webscraping/data/pool_20_above_filtered", output_directory="/workspaces/AICoinXpert/algo/webscraping/data/organized_images_20_above_filtered")


Splitting and organizing images complete.


In [3]:
import os

def count_folders_and_images(directory):
    folder_count = 0
    image_count = 0

    for root, dirs, files in os.walk(directory):
        folder_count += len(dirs)
        image_count += len(files)

    return folder_count, image_count

# Example usage:
output_directory = "/workspaces/AICoinXpert/algo/webscraping/data/organized_images_20_above_filtered"
train_folder_count, train_image_count = count_folders_and_images(os.path.join(output_directory, "train"))
test_folder_count, test_image_count = count_folders_and_images(os.path.join(output_directory, "test"))
eval_folder_count, eval_image_count = count_folders_and_images(os.path.join(output_directory, "eval"))

print(f"Train folders: {train_folder_count}, Train images: {train_image_count}")
print(f"Test folders: {test_folder_count}, Test images: {test_image_count}")
print(f"Eval folders: {eval_folder_count}, Eval images: {eval_image_count}")


Train folders: 200, Train images: 69800
Test folders: 199, Test images: 19800
Eval folders: 200, Eval images: 10400


In [4]:
import os
import random
import shutil

def reduce_dataset(input_dir, output_dir, percentage):
    # Create the output directories if they don't exist
    os.makedirs(os.path.join(output_dir, 'train'), exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'test'), exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'eval'), exist_ok=True)

    # Iterate through the input directories
    for split_dir in os.listdir(input_dir):
        # Create the output directory for the current split
        os.makedirs(os.path.join(output_dir, split_dir), exist_ok=True)

        # Iterate through the subdirectories in the current split
        for class_dir in os.listdir(os.path.join(input_dir, split_dir)):
            # Create the output directory for the current class
            os.makedirs(os.path.join(output_dir, split_dir, class_dir), exist_ok=True)

            # Get the list of image filenames in the current class directory
            image_filenames = os.listdir(os.path.join(input_dir, split_dir, class_dir))

            # Calculate the number of images to select based on the percentage
            num_images = int(len(image_filenames) * percentage)

            # Randomly select the specified number of images
            selected_images = random.sample(image_filenames, num_images)

            # Copy the selected images to the output directory for the current class
            for filename in selected_images:
                src_path = os.path.join(input_dir, split_dir, class_dir, filename)
                dst_path = os.path.join(output_dir, split_dir, class_dir, filename)
                shutil.copy(src_path, dst_path)

In [None]:
reduce_dataset("/workspaces/AICoinXpert/algo/webscraping/data/organized_images_20_above_filtered", "/workspaces/AICoinXpert/algo/webscraping/data/", 0.5)