In [198]:
import os
import pandas as pd
import numpy as np
import random
import shutil

# Set working directory:
os.chdir(r"D:\GitHub\Datasets\plant-pathology-2021-fgvc8")

# Folder holding the original, unsplit images.
train_images_path = os.path.join(os.getcwd(), "train_images")

train_file = pd.read_csv("train.csv")
labels = ['scab', 'rust', 'frog_eye_leaf_spot', 'powdery_mildew', 'healthy', 'complex']

# Add one 0/1 indicator column per label, set when the label name occurs in
# the "labels" text of the row.
# - regex=False: the label is matched literally, not as a regular expression.
# - na=False: a row with a missing "labels" value yields 0 for every class;
#   without it the NaN result is truthy in np.where and would yield 1.
for label in labels:
    has_label = train_file["labels"].str.contains(label, regex=False, na=False)
    train_file[label] = np.where(has_label, 1, 0)
train_file = train_file.drop(["labels"], axis = 1)
# Keep the row position as a regular column so the split can select on it.
train_file["index"] = train_file.index

# Destination roots for the train / validation split.
train_folder = os.path.join(os.getcwd(), "train")
validation_folder = os.path.join(os.getcwd(), "validation")

# Train and validation folders:
# Make sure each split root and every per-label sub-directory exists.
# The label directories are checked individually, so a root left over from an
# earlier or interrupted run still gets any label folder it is missing
# (otherwise the later copy step fails on the missing destination).
for split_name, split_folder in (("train", train_folder), ("validation", validation_folder)):
    if not os.path.exists(split_folder):
        print("-> Create", split_name, "directory")
    else:
        print(split_name.capitalize(), "directory exists")

    for label in labels:
        label_folder = os.path.join(split_folder, label)
        if not os.path.isdir(label_folder):
            print("--> Create", label, "directory")
            # makedirs also creates the split root when it is absent.
            os.makedirs(label_folder, exist_ok=True)

# Reproducible random split: a fixed seed makes the same rows land in the
# same subset on every run.
np.random.seed(seed=42)
split_ratio = 0.8

n_rows = train_file.shape[0]
n_train = int(n_rows * split_ratio)

# Draw the train row ids without replacement; the ids not drawn become the
# validation set.
train_id = np.random.choice(np.arange(n_rows), size=n_train, replace=False)
validation_id = np.setdiff1d(np.arange(n_rows), train_id)

# Select rows through the "index" column of train_file.
in_train = train_file["index"].isin(train_id)
in_validation = train_file["index"].isin(validation_id)
train = train_file.loc[in_train]
validation = train_file.loc[in_validation]

# Prepare train and validation data:
# Each job is (banner, per-label progress prefix, subset frame, target root).
# The train job runs completely before the validation job.
copy_jobs = (
    ("-> Prepare train data folder", "--> Train label processed:", train, train_folder),
    ("-> Prepare validation data folder", "--> Validation label processed:", validation, validation_folder),
)

for banner, progress_prefix, subset, target_root in copy_jobs:
    print(banner)
    for label in labels:
        print(progress_prefix, label)

        # Image file names of the rows flagged with the current label.
        image_names = subset.loc[subset[label] == 1, "image"].tolist()

        # An image flagged with several labels is copied once per label folder.
        for image_name in image_names:
            source_path = os.path.join(train_images_path, image_name)
            target_path = os.path.join(target_root, label, image_name)
            shutil.copy(source_path, target_path)
        
# Check files per class distribution:
folders = ["train", "validation"]
for i in np.arange(len(folders)):
    
    print("->", folders[i])
    files = []
    subfolders = os.listdir(os.path.join(os.getcwd(), folders[i]))
    
    for j in np.arange(len(subfolders)):
        files.append(len(os.listdir(os.path.join(os.getcwd(), folders[i], subfolders[j]))))
    
    print("--> All", folders[i], "files:", np.sum(files))
    
    for j in np.arange(len(subfolders)):
        files_ = len(os.listdir(os.path.join(os.getcwd(), folders[i], subfolders[j])))
        print("---> Files in", folders[i], "folder for label", subfolders[j], "-", str(files_), "-", str(np.round(files_/np.sum(files), 2)), "%")
        
    del files

-> Create train directory
--> Create scab directory
--> Create rust directory
--> Create frog_eye_leaf_spot directory
--> Create powdery_mildew directory
--> Create healthy directory
--> Create complex directory
-> Create validation directory
--> Create scab directory
--> Create rust directory
--> Create frog_eye_leaf_spot directory
--> Create powdery_mildew directory
--> Create healthy directory
--> Create complex directory
-> Prepare train data folder
--> Train label processed: scab
--> Train label processed: rust
--> Train label processed: frog_eye_leaf_spot
--> Train label processed: powdery_mildew
--> Train label processed: healthy
--> Train label processed: complex
-> Prepare validation data folder
--> Validation label processed: scab
--> Validation label processed: rust
--> Validation label processed: frog_eye_leaf_spot
--> Validation label processed: powdery_mildew
--> Validation label processed: healthy
--> Validation label processed: complex
-> train
--> All train files: 1617