# Prepare Data

***Load dataset***

In [1]:
# kaggle dataset used = CatBreedsRefined-7k
# https://www.kaggle.com/datasets/doctrinek/catbreedsrefined-7k

#load dataset
!pip install kaggle
!mkdir ./data
!kaggle datasets download -d doctrinek/catbreedsrefined-7k -p ./data

# unzip dataset
!unzip ./data/catbreedsrefined-7k.zip -d ./data

# clean up by removing file no longer required
!rm ./data/catbreedsrefined-7k.zip

***Split dataset into train, validation and test datasets***

In [4]:
import os
import shutil
import random

# Set the paths
original_dataset_path = "./data/CatBreedsRefined-v2"
train_dataset_path = "./data/train"
valid_dataset_path = "./data/validation"
test_dataset_path = "./data/test"

# Cumulative split boundaries applied to each class folder: the first 60% of
# the shuffled files go to train, the next 20% to validation and the
# remaining 20% to test.
TRAIN_END_FRACTION = 0.6
VALID_END_FRACTION = 0.8

# Dedicated, seeded generator so that re-running this cell (or Restart &
# Run All) reproduces exactly the same split. With an unseeded shuffle, a
# re-run while earlier copies are still on disk could place the same image in
# more than one split.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# Destination roots, in the same order as the (train, validation, test)
# file lists built below.
split_paths = (train_dataset_path, valid_dataset_path, test_dataset_path)

# Create the directories if they don't exist
for split_path in split_paths:
    os.makedirs(split_path, exist_ok=True)

# Get the list of class folders in the original dataset. Only directories are
# kept (stray files in the dataset root would otherwise crash os.listdir
# below) and the list is sorted because os.listdir order is arbitrary.
folders = sorted(
    entry
    for entry in os.listdir(original_dataset_path)
    if os.path.isdir(os.path.join(original_dataset_path, entry))
)

# Iterate over each class folder
for folder in folders:
    folder_path = os.path.join(original_dataset_path, folder)

    # Regular files only (copyfile cannot copy directories), sorted so the
    # seeded shuffle always starts from the same ordering.
    files = sorted(
        entry
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )

    # Shuffle the files randomly (deterministically, via the seeded generator)
    split_rng.shuffle(files)

    # Calculate the split indices
    train_split_index = int(TRAIN_END_FRACTION * len(files))
    valid_split_index = int(VALID_END_FRACTION * len(files))

    # Split the files into train, validation, and test datasets
    train_files = files[:train_split_index]
    valid_files = files[train_split_index:valid_split_index]
    test_files = files[valid_split_index:]

    # Copy (not move) the files to the respective per-class directories; the
    # class directory is created for every split, even when it stays empty.
    for split_path, split_files in zip(split_paths, (train_files, valid_files, test_files)):
        class_dir = os.path.join(split_path, folder)
        os.makedirs(class_dir, exist_ok=True)
        for file in split_files:
            shutil.copyfile(os.path.join(folder_path, file), os.path.join(class_dir, file))


In [5]:
# Clean up: the split step copies (rather than moves) the images into
# ./data/train, ./data/validation and ./data/test, so the extracted original
# folder is a duplicate that is no longer required. After this runs, the split
# cell cannot be re-executed without downloading and unzipping the dataset again.
!rm -r ./data/CatBreedsRefined-v2